In [1]:
from sklearn import svm
import pandas as pd
import itertools
import utils
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from random import Random
from sklearn.metrics import classification_report
import load_data
import pickle

# Example from sklearn

In [2]:
# Minimal sanity check of the SVC API on a toy problem with four classes:
# one 1-D sample per class, then a prediction for an unseen point.
X = [[value] for value in range(4)]
y = list(range(4))
clf = svm.SVC(decision_function_shape='ovr')
clf.fit(X, y)
clf.predict([[5]])

array([3])

# Parse Dia-Bete data
Get the list of relations from the Dia-Bete file. The 'Relation' column has the format `causes|prevents`: the names of the relations in which this entity is the target, separated by '|'. The 'Relation ID' column has the format `1-1[1_2]|1-19[3_2]`, where the first two numbers give the ID of the first token of the source entity, and the two numbers in square brackets give the entity number of the source entity and of the target entity (the current entity).

For each pair of words, assign a label indicating whether or not they are connected by a relation. Use `word.isupper` as the only feature for now.

In [6]:
# Load the annotated Dia-Bete dataset; the first CSV column becomes the index.
df = pd.read_csv('../../data/all_annotated_dataset_2022_01_27.csv', index_col=0)

# Split by file name rather than by row: the first 90% of the shuffled
# distinct file names go to training, the remainder to test. The shuffle
# uses a fixed seed (42).
file_name_list = list(df['File'].unique())
Random(42).shuffle(file_name_list)
train_split = int(len(file_name_list) * 0.9)
print(train_split)
train_files, test_files = file_name_list[:train_split], file_name_list[train_split:]

# NOTE(review): load_relation_data_diabete lives in the project's load_data
# module; it is called with is_bert=True for both splits and returns an
# (inputs, labels) pair for each one.
X_train_text, y_train_text = load_data.load_relation_data_diabete(
    df, train_files, is_bert=True
)
X_test_text, y_test_text = load_data.load_relation_data_diabete(
    df, test_files, is_bert=True
)

505
main_resposta_0180.txt
nutrition_resposta_0219.txt
nutrition_resposta_0104.txt
nutrition_resposta_0103.txt
main_resposta_0266.txt
main_resposta_0148.txt
nutrition_resposta_0075.txt
main_resposta_0076.txt
main_resposta_0161.txt
main_resposta_0133.txt
main_resposta_0097.txt
main_resposta_0119.txt
nutrition_resposta_0157.txt
main_resposta_0267.txt
nutrition_resposta_0095.txt
nutrition_resposta_0212.txt
nutrition_resposta_0141.txt
main_resposta_0268.txt
main_resposta_0191.txt
main_resposta_0075.txt
nutrition_resposta_0112.txt
nutrition_resposta_0184.txt
main_resposta_0214.txt
nutrition_resposta_0246.txt
nutrition_resposta_0237.txt
nutrition_resposta_0010.txt
nutrition_resposta_0187.txt
nutrition_resposta_0211.txt
nutrition_resposta_0051.txt
main_resposta_0287.txt
nutrition_resposta_0233.txt
nutrition_resposta_0130.txt
nutrition_resposta_0063.txt
main_resposta_0252.txt
nutrition_resposta_0037.txt
nutrition_resposta_0139.txt
nutrition_resposta_0071.txt
main_resposta_0022.txt
main_respost

# Run SVM

In [14]:
# Word-level TF-IDF features built from trigrams only (ngram_range=(3, 3)).
# To compare against raw counts, swap in CountVectorizer with the same
# keyword arguments.
vectorizer = TfidfVectorizer(
    analyzer="word",
    binary=False,
    ngram_range=(3, 3),
)

In [15]:
print(f'Len X train: {len(X_train_text)}')

# Translate each relation label into its integer id via utils.relation2id.
y_train = [utils.relation2id[relation] for relation in y_train_text]
# Fit the vectorizer vocabulary on the training text and vectorize it.
X_train = vectorizer.fit_transform(X_train_text)

# Train the classifier; the fit call stays last so the fitted estimator is
# shown as the cell output.
clf = svm.SVC(decision_function_shape='ovr')
clf.fit(X_train, y_train)

Len X train: 854


SVC()

In [16]:
print(f'Len X test: {len(X_test_text)}')
# Vectorize the test text with the existing vectorizer (transform only, so
# its vocabulary is not refitted on test data).
X_test = vectorizer.transform(X_test_text)
y_test = [utils.relation2id[elem] for elem in y_test_text]
y_pred = clf.predict(X_test)

# Report on ids 1..len(utils.RELATIONS)-1 only; index 0 of utils.RELATIONS is
# skipped (presumably the 'Other' class - confirm in utils).
labels = list(range(1, len(utils.RELATIONS)))
target_names = utils.RELATIONS[1:]

# Some classes receive no predictions (or have no test samples), which makes
# precision/recall undefined for them. zero_division=0 reports those scores
# as 0 - the same value the default produces - without emitting one
# UndefinedMetricWarning per metric.
print(classification_report(y_test, y_pred, labels=labels,
                            target_names=target_names, zero_division=0))

Len X test: 117
              precision    recall  f1-score   support

      causes       0.00      0.00      0.00         6
    prevents       0.00      0.00      0.00         4
      treats       0.71      0.71      0.71        14
         has       0.85      1.00      0.92        86
   diagnoses       1.00      0.29      0.44         7
 complicates       0.00      0.00      0.00         0

   micro avg       0.84      0.84      0.84       117
   macro avg       0.43      0.33      0.35       117
weighted avg       0.77      0.84      0.79       117



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Create BERT RE data

In [7]:
# Build the train and test frames with load_data.create_bert_re_df, one
# (inputs, labels) pair at a time - train first, then test.
df_train, df_test = (
    load_data.create_bert_re_df(texts, labels)
    for texts, labels in (
        (X_train_text, y_train_text),
        (X_test_text, y_test_text),
    )
)
# Optional truncation to a smaller subset (left disabled):
# df_train = df_train[:150]
# df_test = df_test[:100]
df_train

Unnamed: 0,sents,relations,relations_id
0,"em relação a [E2]gelatina[/E2] , caso ela seja...","has(e1,e2)",4
1,"a [E1]glicada[/E1] é um exame chamado de "" hem...","diagnoses(e1,e2)",5
2,"a glicada é um exame chamado de "" [E1]hemoglob...","diagnoses(e1,e2)",5
3,"a glicada é um exame chamado de "" hemoglobina ...",Other,0
4,"a [E2]glicada[/E2] é um exame chamado de "" hem...",Other,0
...,...,...,...
1466,quem tem diabetes tem uma chance maior de ter ...,Other,0
1467,quem tem diabetes tem uma chance maior de ter ...,Other,0
1468,quem tem diabetes tem uma chance maior de ter ...,Other,0
1469,quem tem diabetes tem uma chance maior de ter ...,Other,0


In [8]:
# Display the test frame returned by load_data.create_bert_re_df.
df_test

Unnamed: 0,sents,relations,relations_id
0,a pele dos pés de pessoas com [E2]diabetes[/E2...,"prevents(e1,e2)",2
1,a pele dos pés de pessoas com diabetes costuma...,"prevents(e1,e2)",2
2,a pele dos pés de pessoas com diabetes costuma...,"prevents(e1,e2)",2
3,a pele dos pés de pessoas com [E1]diabetes[/E1...,Other,0
4,é recomendado que se calibre o [E1]glicosímetr...,"diagnoses(e1,e2)",5
...,...,...,...
191,níveis - que é o seu caso - podem levar a uma ...,"treats(e1,e2)",3
192,níveis - que é o seu caso - podem levar a uma ...,"treats(e1,e2)",3
193,níveis - que é o seu caso - podem levar a uma ...,Other,0
194,níveis - que é o seu caso - podem levar a uma ...,Other,0


In [9]:
# Pickle the Dia-Bete train/test frames into the BERT-Relation-Extraction
# data directory. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as the dump finishes, instead of being left
# open for the lifetime of the kernel.
with open('../../../BERT-Relation-Extraction/data/df_train_bete.pkl', 'wb') as fh:
    pickle.dump(df_train, fh)
with open('../../../BERT-Relation-Extraction/data/df_test_bete.pkl', 'wb') as fh:
    pickle.dump(df_test, fh)

# Parse eHealth data

In [10]:
# Load the eHealth-KD relation data. The 'training' file feeds the train
# frame and the 'develop' file is the one loaded into ehealth_df_test.
ehealth_df_train = load_data.load_relation_data_ehealth(
    '../../data/eHealthKD/original/ref/training/input_multilingual.json'
)
ehealth_df_test = load_data.load_relation_data_ehealth(
    '../../data/eHealthKD/original/ref/develop/input_multilingual.json'
)

# Report the size of each split, then display the training frame.
print(f'Len eHealth train: {len(ehealth_df_train)}')
print(f'Len eHealth test: {len(ehealth_df_test)}')
ehealth_df_train

Len eHealth train: 20571
Len eHealth test: 1658


Unnamed: 0,sents,relations,relations_id
0,La presencia del [E1]gen[/E1] de células falci...,"in-context(e1,e2)",6
1,La [E1]presencia[/E1] del [E2]gen de células f...,"subject(e1,e2)",9
2,La [E1]presencia[/E1] del gen de células falci...,"same-as(e1,e2)",13
3,La [E1]presencia[/E1] del [E2]gen[/E2] de célu...,"subject(e1,e2)",9
4,La [E1]presencia[/E1] del gen de células falci...,Other,0
...,...,...,...
20566,Durante la [E1]noche[/E1] el edificio de Venez...,Other,0
20567,Durante la [E1]noche[/E1] el edificio de Venez...,Other,0
20568,Durante la [E1]noche[/E1] el edificio de Venez...,Other,0
20569,Durante la [E1]noche[/E1] el edificio de Venez...,Other,0


In [11]:
# Pickle the eHealth train/test frames into the BERT-Relation-Extraction
# data directory. Each file is opened in a `with` block so the handle is
# flushed and closed as soon as the dump finishes, instead of being left
# open for the lifetime of the kernel.
with open('../../../BERT-Relation-Extraction/data/df_train_ehealth.pkl', 'wb') as fh:
    pickle.dump(ehealth_df_train, fh)
with open('../../../BERT-Relation-Extraction/data/df_test_ehealth.pkl', 'wb') as fh:
    pickle.dump(ehealth_df_test, fh)