# Healthcare Data Entities Identification

In [152]:
# Installing and importing relevant libraries
import glob
import spacy
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import pandas as pd
import pathlib
import os
from time import time
from spacy.tokenizer import Tokenizer

SENT_RANGE = 10 # Range of word to consider as features
NAME = 'N'
INSTITUTION = 'I'
OTHER = 'O'
SCHOOL_LIST = ['high', 'primary', 'intermediate', 'secondary']
INST_LIST = ["university", "college", "institute", "academy", "institution"]

# load the model
model = spacy.load("en_core_web_sm")
model.tokenizer = Tokenizer(model.vocab)

In [76]:
def get_token_loc(txt, ner_char_start, ner_end_char):
    loc = 0
    locations_list = []
    for ii, word in enumerate(txt.split()):
        word_len = len(word)
        if ner_char_start <= loc <= ner_end_char:
            locations_list.append(ii)
        loc += word_len + 1
        
    return locations_list

# Dataset prepration and overview

In [36]:
df_RE = pd.DataFrame()
# Load query data and extract relations
for file_name in glob.glob("./spike_queries/*/*.csv"):
    df_raw1 = pd.read_csv(file_name)
    if len(df_RE):
        df_RE = pd.concat([df_RE, df_raw1], ignore_index=True)
    else:    
        df_RE = df_raw1
df_RE_neg = df_RE[(df_RE['label'] == 0) &(df_RE['Yale_last_index'] != '?')]
df_RE_pos = df_RE[(df_RE['label'] == 1) &(df_RE['Yale_last_index'] != '?')]

print('Number of positive samples: ', len(df_RE_pos), ' Number of negative samples: ',len(df_RE_neg))

Number of positive samples:  200  Number of negative samples:  24


In [37]:
train_pos_len  = int(len(df_RE_pos) * 0.7)
train_neg_len  = int(len(df_RE_neg) * 0.7)

train1, test1 = df_RE_pos[:train_pos_len], df_RE_pos[train_pos_len:] 
train2, test2 = df_RE_neg[:train_neg_len], df_RE_neg[train_neg_len:]
train = pd.concat([train1, train2], ignore_index=True)
test = pd.concat([test1, test2], ignore_index=True)

In [47]:
def format_sentences(org_sentences):
    sentences_formatted = []
    annotations = []
    single_annot = 'O'
    for idx, row in org_sentences.iterrows():
        sentence_formatted = ''
        annotation = ''

        if str(row.label) == '1':
            for token_i, token in enumerate(row.sentence_text.split()):
                if int(row.John_first_index) <= token_i <= int(row.John_last_index):
                    single_anno = NAME
                elif int(row.Yale_first_index) <= token_i <= int(row.Yale_last_index):
                    single_anno = INSTITUTION
                else:
                    single_anno = OTHER

                sentence_formatted += token + ' '
                annotation += single_anno + ' '

        if str(row.label) == '0':
            for token_i, token in enumerate(row.sentence_text.split()):               
                sentence_formatted += token + ' '
                annotation += OTHER + ' '

        sentences_formatted.append(sentence_formatted.strip())
        annotations.append(annotation.strip())
        
    return sentences_formatted, annotations

In [48]:
train_sentences, train_labels = format_sentences(train)
test_sentences, test_labels = format_sentences(test)

In [50]:
train_count_total = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
train_count_o = [sent_lbl for sentence_labels in train_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(train_count_o)/len(train_count_total),3)
ner_percent = 1 - O_percent
df = pd.DataFrame()
df['Train annotations counts'] = train_count_total
print('-------------------- Datasets statistics: --------------------\n')
print("'N' - Name")
print("'I' - Institution")
print("'O' - Other")
print('------------- Train dataset -------------')
print(df.value_counts())
print("NER tokens('N'/'I'): {}%, 'O' tokens: {}%".format(round(ner_percent,4), O_percent))
print("Number of sentences: {}".format(len(train_sentences)))


test_count_total = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ']
test_count_o = [sent_lbl for sentence_labels in test_labels for sent_lbl in sentence_labels if sent_lbl != ' ' and sent_lbl == 'O']
O_percent = round(len(test_count_o)/len(test_count_total),3)
ner_percent = 1 - O_percent
df = pd.DataFrame()
df['Test annotations counts'] = test_count_total

print('\n------------- Test dataset -------------')
print(df.value_counts())
print("NER tokens('N'/'I'): {}%, 'O' tokens: {}%".format(round(ner_percent,4), O_percent))
print("Number of sentences: {}".format(len(test_sentences)))

-------------------- Datasets statistics: --------------------

'N' - Name
'I' - Institution
'O' - Other
------------- Train dataset -------------
Train annotations counts
O                           2546
I                            439
N                            186
dtype: int64
NER tokens('N'/'I'): 0.197%, 'O' tokens: 0.803%
Number of sentences: 156

------------- Test dataset -------------
Test annotations counts
O                          1195
I                           183
N                            89
dtype: int64
NER tokens('N'/'I'): 0.185%, 'O' tokens: 0.815%
Number of sentences: 68


# Defining features for CRF

## Select one out of two options:
1) 10 range word feature from left and right

2) Previous and next word features

3) Previous word features

In [143]:
def get_token_loc(txt, ner_char_start, ner_end_char):
    loc = 0
    locations_list = []
    for ii, word in enumerate(txt.split()):
        word_len = len(word)
        if ner_char_start <= loc <= ner_end_char:
            locations_list.append(ii)
        loc += word_len + 1
        
    return locations_list

def get_entities(sentence):
    sentence_org = ' '.join([token.orth_ for token in sentence])
    person_locs = []
    org_locs = []

    if sentence.ents:
        for ent in sentence.ents:
            if ent.label_ == 'PERSON':
                person_locs.extend(get_token_loc(sentence_org, ent.start_char, ent.end_char))
            if ent.label_ == 'ORG':
                org_locs.extend(get_token_loc(sentence_org, ent.start_char, ent.end_char))
                
    return person_locs, org_locs

In [174]:
# 1) Range of "SENT_RANGE" range of features
def getFeaturesForOneWord(cur_loc, sentence):
    person_locs, org_locs = get_entities(sentence)
    end_loc = len(sentence) - 1

    # Obtaining features for words
    features = []
    left_range = max(0, cur_loc - SENT_RANGE)
    right_range = min(end_loc, cur_loc + 10)

    for i_loc in range(left_range, right_range):
        word = sentence[i_loc]
        i = i_loc - cur_loc
        features.extend([
        f'word{i}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{i}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{i}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{i}.dep=' + word.dep_,                                             # dependency dependent
        f'word{i}.head=' + word.head.orth_,                                      # dependency head
        f'word{i}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{i}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{i}.person_ent={i_loc in person_locs}',                            # is this word part of person NER
        f'word{i}.inst_ent={i_loc in org_locs}',                                 # is this word part of institution NER
        f'word{i}.school={any([sub_w in word.orth_.lower() for sub_w in SCHOOL_LIST])}', # school ind
        f'word{i}.ins={any([sub_w in word.orth_.lower() for sub_w in INST_LIST])}',      # institution ind            
        f'word{i}.startsWithCapital={word.orth_[0].isupper()}'])                 # is the word starting with a capital letter
        
    if(cur_loc == 0):
        features.append('BEG')                                                   # feature to track begin of sentence 
 
    elif(cur_loc == end_loc - 1):
        features.append('END')                                                   # feature to track end of sentence
 
    return features

In [160]:
# 2) Previous, current and next word features
def getFeaturesForOneWord(cur_loc, sentence):
    person_locs, org_locs = get_entities(sentence)    
    end_loc = len(sentence) - 1
    # Obtaining features for current word
    word = sentence[cur_loc]
 
    features = [
    f'word{0}.lower=' + word.orth_.lower(),                                  # serves as word id
    f'word{0}.postag=' + word.pos_,                                          # PoS tag of current word
    f'word{0}[-3:]=' + word.orth_[-3:],                                      # last three characters
    f'word{0}.dep=' + word.dep_,                                             # dependency dependent
    f'word{0}.head=' + word.head.orth_,                                      # dependency head
    f'word{0}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
    f'word{0}.isdigit={word.orth_.isdigit()}',                               # is the word a number
    f'word{0}.startsWithCapital={word.orth_[0].isupper()}']                  # is the word starting with a capital letter
    f'word{0}.person_ent={cur_loc in person_locs}',                            # is this word part of person NER
    f'word{0}.inst_ent={cur_loc in org_locs}',                                 # is this word part of institution NER
    f'word{0}.school={any([sub_w in word.orth_.lower() for sub_w in SCHOOL_LIST])}', # school ind
    f'word{0}.ins={any([sub_w in word.orth_.lower() for sub_w in INST_LIST])}',      # institution ind      
        
    if(cur_loc > 0):
        prev_loc = cur_loc - 1
        word = sentence[prev_loc]
        features.extend([
        f'word{-1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{-1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{-1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{-1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{-1}.head=' + word.head.orth_,                                      # dependency head
        f'word{-1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{-1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{-1}.startsWithCapital={word.orth_[0].isupper()}'])                # is the word starting with a capital letter
        f'word{-1}.person_ent={prev_loc in person_locs}',                            # is this word part of person NER
        f'word{-1}.inst_ent={prev_loc in org_locs}',                                 # is this word part of institution NER
        f'word{-1}.school={any([sub_w in word.orth_.lower() for sub_w in SCHOOL_LIST])}', # school ind
        f'word{-1}.ins={any([sub_w in word.orth_.lower() for sub_w in INST_LIST])}',      # institution ind  
        
    else:
        features.append('BEG')                                          # feature to track begin of sentence 
 
    if(cur_loc + 1 < end_loc):
        next_loc = cur_loc + 1
        word = sentence[next_loc]
        features.extend([
        f'word{1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{1}.head=' + word.head.orth_,                                      # dependency head
        f'word{1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{1}.startsWithCapital={word.orth_[0].isupper()}'])                  # is the word starting with a capital letter
        f'word{1}.person_ent={next_loc in person_locs}',                            # is this word part of person NER
        f'word{1}.inst_ent={next_loc in org_locs}',                                 # is this word part of institution NER
        f'word{1}.school={any([sub_w in word.orth_.lower() for sub_w in SCHOOL_LIST])}', # school ind
        f'word{1}.ins={any([sub_w in word.orth_.lower() for sub_w in INST_LIST])}',      # institution ind     
    else:
        features.append('END')                                                # feature to track end of sentence

    return features

In [167]:
# 3) Previous and current word features
def getFeaturesForOneWord(cur_loc, sentence):
    end_loc = len(sentence) - 1
    # Obtaining features for current word
    word = sentence[cur_loc]

    features = [
    f'word{0}.lower=' + word.orth_.lower(),                                  # serves as word id
    f'word{0}.postag=' + word.pos_,                                          # PoS tag of current word
    f'word{0}[-3:]=' + word.orth_[-3:],                                      # last three characters
    f'word{0}.dep=' + word.dep_,                                             # dependency dependent
    f'word{0}.head=' + word.head.orth_,                                      # dependency head
    f'word{0}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
    f'word{0}.isdigit={word.orth_.isdigit()}',                               # is the word a number
    f'word{0}.startsWithCapital={word.orth_[0].isupper()}']                  # is the word starting with a capital letter
    f'word{0}.person_ent={cur_loc in person_locs}',                            # is this word part of person NER
    f'word{0}.inst_ent={cur_loc in org_locs}',                                 # is this word part of institution NER
    f'word{0}.school={any([sub_w in word.orth_.lower() for sub_w in SCHOOL_LIST])}', # school ind
    f'word{0}.ins={any([sub_w in word.orth_.lower() for sub_w in INST_LIST])}',      # institution ind   
        
    if(cur_loc > 0):
        prev_loc = cur_loc - 1
        word = sentence[prev_loc]
        features.extend([
        f'word{-1}.lower=' + word.orth_.lower(),                                  # serves as word id
        f'word{-1}.postag=' + word.pos_,                                          # PoS tag of current word
        f'word{-1}[-3:]=' + word.orth_[-3:],                                      # last three characters
        f'word{-1}.dep=' + word.dep_,                                             # dependency dependent
        f'word{-1}.head=' + word.head.orth_,                                      # dependency head
        f'word{-1}.isupper={word.orth_.isupper()}',                               # is the word in all uppercase
        f'word{-1}.isdigit={word.orth_.isdigit()}',                               # is the word a number
        f'word{-1}.startsWithCapital={word.orth_[0].isupper()}'])                # is the word starting with a capital letter
        f'word{-1}.person_ent={prev_loc in person_locs}',                            # is this word part of person NER
        f'word{-1}.inst_ent={prev_loc in org_locs}',                                 # is this word part of institution NER
        f'word{-1}.school={any([sub_w in word.orth_.lower() for sub_w in SCHOOL_LIST])}', # school ind
        f'word{-1}.ins={any([sub_w in word.orth_.lower() for sub_w in INST_LIST])}',      # institution ind          
    
    else:
        features.append('BEG')                                                # feature to track begin of sentence 

    if(cur_loc == end_loc):
        features.append('END')                                                # feature to track end of sentence

    return features

# Prepare data 

In [175]:
# Get features for a sentence.
def getFeaturesForOneSentence(sentence):
    sentence_parsing = model(sentence)
    return [getFeaturesForOneWord(ii, sentence_parsing) for ii,token in enumerate(sentence_parsing)]

# code to get the labels for a sentence.
def getLabelsInListForOneSentence(labels):
    return labels.split()

In [176]:
# Checking feature extraction
example_sentence = train_sentences[1]
doc = model(example_sentence)
print([(i, i.label_) for i in doc.ents])
print(f'Example sentence: "{example_sentence}"\n')

features = getFeaturesForOneSentence(example_sentence)
print('Total features in the sentence:', len(features))
print('Example of features for the word "rates":')
features[12]

[(New Haven, 'GPE'), (Connecticut, 'GPE'), (Mitchell, 'PERSON'), (Cheshire Academy, 'ORG'), (1863, 'DATE')]
Example sentence: "Born in New Haven , Connecticut , Mitchell was graduated from Cheshire Academy in 1863 ."

Total features in the sentence: 16
Example of features for the word "rates":


['word-10.lower=new',
 'word-10.postag=PROPN',
 'word-10[-3:]=New',
 'word-10.dep=compound',
 'word-10.head=Haven',
 'word-10.isupper=False',
 'word-10.isdigit=False',
 'word-10.person_ent=False',
 'word-10.inst_ent=False',
 'word-10.school=False',
 'word-10.ins=False',
 'word-10.startsWithCapital=True',
 'word-9.lower=haven',
 'word-9.postag=PROPN',
 'word-9[-3:]=ven',
 'word-9.dep=pobj',
 'word-9.head=in',
 'word-9.isupper=False',
 'word-9.isdigit=False',
 'word-9.person_ent=False',
 'word-9.inst_ent=False',
 'word-9.school=False',
 'word-9.ins=False',
 'word-9.startsWithCapital=True',
 'word-8.lower=,',
 'word-8.postag=PUNCT',
 'word-8[-3:]=,',
 'word-8.dep=punct',
 'word-8.head=Haven',
 'word-8.isupper=False',
 'word-8.isdigit=False',
 'word-8.person_ent=False',
 'word-8.inst_ent=False',
 'word-8.school=False',
 'word-8.ins=False',
 'word-8.startsWithCapital=False',
 'word-7.lower=connecticut',
 'word-7.postag=PROPN',
 'word-7[-3:]=cut',
 'word-7.dep=conj',
 'word-7.head=Haven',
 '

In [189]:
X_train = [getFeaturesForOneSentence(sentence) for sentence in train_sentences]
X_test = [getFeaturesForOneSentence(sentence) for sentence in test_sentences]

Y_train = [getLabelsInListForOneSentence(labels) for labels in train_labels]
Y_test = [getLabelsInListForOneSentence(labels) for labels in test_labels]

# build the CRF Classifier

In [191]:
def get_crf(X_train, Y_train):
    crf = sklearn_crfsuite.CRF(max_iterations=300)

    try:
        crf.fit(X_train, Y_train)
    except AttributeError:
        pass
    return crf

crf = get_crf(X_train, Y_train)

# Evaluation

In [179]:
Y_pred = crf.predict(X_test)

In [159]:
print('Using oprtion (1) - "Range of SENT_RANGE word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))
print("Recall F1: {}".format(metrics.flat_recall_score(Y_test, Y_pred, average='macro')))
print("Precision F1: {}".format(metrics.flat_precision_score(Y_test, Y_pred, average='macro')))
print("Accuracy F1: {}".format(metrics.flat_accuracy_score(Y_test, Y_pred)))

Using oprtion (1) - "Range of SENT_RANGE word features":
Weighted F1: 0.9653213629660603
Macro F1: 0.9248861689432877
Recall F1: 0.9229805085367687
Precision F1: 0.9303775276880901
Accuracy F1: 0.9652351738241309


In [166]:
print('Using oprtion (2) - "Previous, current and next word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))
print("Recall F1: {}".format(metrics.flat_recall_score(Y_test, Y_pred, average='macro')))
print("Precision F1: {}".format(metrics.flat_precision_score(Y_test, Y_pred, average='macro')))
print("Accuracy F1: {}".format(metrics.flat_accuracy_score(Y_test, Y_pred)))

Using oprtion (2) - "Previous, current and next word features":
Weighted F1: 0.9555766763664667
Macro F1: 0.9144131815126414
Recall F1: 0.9180163967822992
Precision F1: 0.9143847169808743
Accuracy F1: 0.9550102249488752


In [173]:
print('Using oprtion (3) - "Previous and current word features":')
print("Weighted F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='weighted')))
print("Macro F1: {}".format(metrics.flat_f1_score(Y_test, Y_pred, average='macro')))
print("Recall F1: {}".format(metrics.flat_recall_score(Y_test, Y_pred, average='macro')))
print("Precision F1: {}".format(metrics.flat_precision_score(Y_test, Y_pred, average='macro')))
print("Accuracy F1: {}".format(metrics.flat_accuracy_score(Y_test, Y_pred)))

Using oprtion (3) - "Previous and current word features":
Weighted F1: 0.9440871294983808
Macro F1: 0.8927344357876477
Recall F1: 0.9053001773710566
Precision F1: 0.8896958738860005
Accuracy F1: 0.9427402862985685


# Extract all relations

In [258]:
all_sentences = train_sentences + test_sentences
all_sentences_string = train_sentences + test_sentences
all_sentences = [i.split() for i in all_sentences]
all_labels = train_labels + test_labels
all_labels = [i.split() for i in all_labels]
condition_treatment_evidence = {'person':[], 'institution':[], 'evidence':[]}


for i in range(len(all_labels)):


        #print(test_sentences[i])
        cnt_disease = 0           # Count of number of diseases mentioned in the sentence
        cnt_treatment = 0         # Count of the number of treatments mentioned in the sentence
        diseases = [""]           # Initializing a blank list of diseases for current sentence.
        treatment = [""]          # Initializing a blank list of treatments for current sentence.
        sentence_number = [""]
        evidence = [""]
        
        length = len(all_labels[i])   # Length of current sentence.
        for j in range(length):
            if (all_labels[i][j] == 'N'):                                                     # Checking for label indicating disease for current word ('D')
                diseases[cnt_disease] += (all_sentences[i][j] + " ")            # Adding word to diseases list.
                if j < length - 1:
                    if (all_labels[i][j+1] != 'N'):                                           # Check for name of disease extending over multiple words. 
                        # If next word does not have label 'D', then truncate the space added at the end of the last word.
                        diseases[cnt_disease] = diseases[cnt_disease][:-1]
                        cnt_disease += 1
                        diseases.append("")                                               # Adding a placeholder for the next disease in the current sentence.
                else:
                    diseases[cnt_disease] = diseases[cnt_disease][:-1]
                    cnt_disease += 1
                    diseases.append("")
                                
            if (all_labels[i][j] == 'I'):                                                     # Checking for label indicating treatment for current word ('T')
                treatment[cnt_treatment] += (all_sentences[i][j] + " ") # Adding word to corresponding treatment list.
                if j < length - 1:
                    if (all_labels[i][j+1] != 'I'):                                           # Check for name of treatment extending over multiple words. 
                        # If next word does not have label 'T', then truncate the space added at the end of the last word.
                        treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                        cnt_treatment += 1
                        treatment.append("")                                              # Adding a placeholder for the next treatment in the current sentence.
                else:
                    treatment[cnt_treatment] = treatment[cnt_treatment][:-1]
                    cnt_treatment += 1
                    treatment.append("")

        diseases.pop(-1)    # Getting rid of the last empty placeholder in diseases list
        treatment.pop(-1)   # Getting rid of the last empty placeholder in treatments list
        if cnt_disease and cnt_treatment:
            for i_deases in range(cnt_disease):
                for j in range(cnt_treatment):             
                    condition_treatment_evidence['person'].append(diseases[i_deases])            
                    condition_treatment_evidence['institution'].append(treatment[j])
                    condition_treatment_evidence['evidence'].append(all_sentences_string[i])
  
# Create the pandas DataFrame
df_gold = pd.DataFrame(condition_treatment_evidence)

# Train on all the data and predicting on a new queries

In [185]:
def extract_new_relations(all_data_to_pred, all_predictions):
    cte = {'person':[], 'institution':[], 'evidence':[]}

    for sentence, preds in zip(data_to_pred1, Y1_pred):
        single_treatment = []
        single_condition = []
        sentence_treatments = []
        sentence_conditions = []
        cnt_N = 0;cnt_I = 0;cnt_O = 0;
        for word, pred in zip(sentence.split(), preds):
            if pred == NAME:
                cnt_N += 1; cnt_I = 0;
                single_condition.append(word)
                if cnt_I != 0:
                    sentence_treatments.append(' '.join(single_treatment))
                    single_treatment = []

            elif pred == INSTITUTION:
                cnt_I += 1; cnt_N = 0;
                single_treatment.append(word)
                if cnt_N != 0:
                    sentence_conditions.append(' '.join(single_condition))
                    single_condition = []                

            elif pred == 'O':            
                if cnt_I != 0:
                    sentence_treatments.append(' '.join(single_treatment))
                    single_treatment = []
                if cnt_N != 0:
                    sentence_conditions.append(' '.join(single_condition))
                    single_condition = []  

                cnt_N = 0; cnt_I = 0;

        if cnt_I != 0:
            sentence_treatments.append(' '.join(single_treatment))
            single_treatment = []
        if cnt_N != 0:
            sentence_conditions.append(' '.join(single_condition))
            single_condition = []  

        if len(sentence_conditions) and len(sentence_treatments):
            for t in sentence_treatments:
                for c in sentence_conditions:
                    cte['person'].append(c)
                    cte['institution'].append(t)
                    cte['evidence'].append(sentence)

    return pd.DataFrame(cte)

In [193]:
# Train model on all data
all_sentences = train_sentences + test_sentences
all_labels = train_labels + test_labels
all_sentences_f = [getFeaturesForOneSentence(sentence) for sentence in all_sentences]
all_labels_joined = [getLabelsInListForOneSentence(labels) for labels in all_labels]

crf = get_crf(all_sentences_f, all_labels_joined)

In [266]:
df_all_raw = pd.DataFrame()
# Load query data and extract relations
for file_name in glob.glob("spike_to_pred/*.csv"):
    df_raw1 = pd.read_csv(file_name)
    print(file_name, len(df_raw1))

spike_to_pred/bs_studied_in.csv 45065
spike_to_pred/bs_graduated_from.csv 45924
spike_to_pred/bs_graduated_at.csv 14140


In [267]:
#get unique values
df_all_raw = pd.DataFrame()
# Load query data and extract relations
for file_name in glob.glob("spike_to_pred/*.csv"):
    df_raw1 = pd.read_csv(file_name)
    df_all_raw = pd.concat([df_all_raw, df_raw1], ignore_index=True)
    
df_all_raw = pd.concat([df_all_raw, df_RE_neg[['sentence_id','sentence_text']]], ignore_index=True)
df_all_raw = pd.concat([df_all_raw, df_RE_pos[['sentence_id','sentence_text']]], ignore_index=True)
print("THe total length",len(df_all_raw))

df_uniq = pd.DataFrame()
df_uniq['sentence_text'] = df_all_raw.sentence_text.unique()

print("THe total unique length",len(df_uniq))

THe total length 105353
THe total unique length 94022


In [268]:
data_to_pred1 = df_uniq['sentence_text'].to_list()

X1_pred = [getFeaturesForOneSentence(sentence) for sentence in data_to_pred1]
Y1_pred = crf.predict(X1_pred)

df_relations = extract_new_relations(data_to_pred1, Y1_pred)
print('Number of extracted realtions: ',len(df_relations))
df_all_re = pd.concat([df_relations, df_gold], ignore_index=True)
print('Number of all realtions: ',len(df_all_re))
df_all_re.to_json(r'relation1.jsonl',orient = 'records', lines = 'True')

Number of extracted realtions:  29114
Number of all realtions:  29314
