In [28]:
import json
import csv
import numpy as np
import math
import fasttext
from sklearn.svm import SVC
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report, confusion_matrix

def _read_json(path):
    """Open `path`, parse it as JSON and return the decoded object."""
    with open(path) as handle:
        return json.load(handle)


# Bins keyed by fold index (as a string); each bin maps a hormone name to an
# iterable of gene names.
src_hormone_gene_bins = _read_json('./dataset/hgv1_source_hormone_gene_bins.json')
tgt_hormone_gene_bins = _read_json('./dataset/hgv1_target_hormone_gene_bins.json')

# Mapping whose keys are the hormone names known to the dataset.
hormone_src_tgt_genes = _read_json('./dataset/hgv1_hormone_src_tgt_genes.json')

# Pretrained fastText model used to look up word vectors for hormones and genes.
model = fasttext.load_model("../BioWordVec_PubMed_MIMICIII_d200.bin")



In [29]:
def transform_X_values(data_dict,train_marked):
    """Build the feature matrix for the selected hormone-gene pairs.

    Every hormone in `data_dict` whose entry in `train_marked` equals 1
    contributes one row per associated gene. A row is the hormone vector
    concatenated with the gene vector.

    Hormone names containing "/" are looked up in the module-level
    `alias_embeddings` table; all other names, and all genes, are embedded
    with the module-level fastText `model`.

    Parameters
    ----------
    data_dict : dict
        Maps a hormone name to an iterable of gene names.
    train_marked : dict
        Maps a hormone name to a flag; only hormones flagged 1 are used.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_pairs, 2 * dim). When no pair is selected the
        result has shape (0, 2 * dim) rather than (0,), so that it can still
        be concatenated with a non-empty matrix along axis 0.

    Raises
    ------
    KeyError
        If a hormone is missing from `train_marked`, or an aliased hormone is
        missing from `alias_embeddings`.
    """
    rows = []
    for hormone, genes in data_dict.items():
        if train_marked[hormone] != 1:
            continue
        if "/" in hormone:
            hormone_vec = alias_embeddings[hormone]
        else:
            hormone_vec = model.get_word_vector(hormone)
        for gene in genes:
            rows.append(np.concatenate([hormone_vec, model.get_word_vector(gene)]))

    if not rows:
        # np.array([]) would be 1-D and break a later np.concatenate with a
        # 2-D matrix, so return an explicitly 2-D empty array instead.
        # float32 presumably matches the dtype of fastText vectors - confirm.
        return np.empty((0, 2 * model.get_dimension()), dtype=np.float32)
    return np.array(rows)

# Precompute one vector for every hormone written as "name/alias": the
# element-wise sum of the word vectors of the first two "/"-separated parts.
alias_embeddings = {}
for hormone in hormone_src_tgt_genes:
    if "/" not in hormone:
        continue
    first_name, second_name = hormone.split("/")[:2]
    alias_embeddings[hormone] = (
        model.get_word_vector(first_name) + model.get_word_vector(second_name)
    )

In [30]:
for i in range(5):
    fold = str(i)

    # Fit a degree-3 polynomial SVM on this fold's precomputed features.
    X_train = np.load(f'./dataset/X_train_val_fold_{fold}.npy')
    y_train = np.load(f'./dataset/y_train_val_fold_{fold}.npy')
    with open(f'./dataset/train_val_marking_fold_{fold}.json') as json_file:
        train_marked = json.load(json_file)
    classifier = SVC(kernel='poly', C=1.0, degree=3)
    classifier.fit(X_train, y_train)

    # Assemble the fold's test set: source pairs get label 1, target pairs 0.
    src_test_data = src_hormone_gene_bins[fold]
    tgt_test_data = tgt_hormone_gene_bins[fold]
    X_test_src = transform_X_values(src_test_data, train_marked)
    X_test_tgt = transform_X_values(tgt_test_data, train_marked)
    X_test = np.concatenate((X_test_src, X_test_tgt))
    y_test_src = np.ones(X_test_src.shape[0], dtype=int)
    y_test_tgt = np.zeros(X_test_tgt.shape[0], dtype=int)
    y_test = np.concatenate((y_test_src, y_test_tgt))

    # Hard labels feed the report; decision scores feed the ranking metrics.
    y_pred_test = classifier.predict(X_test)
    y_dec_score_test = classifier.decision_function(X_test)

    print(f"Testing results: fold-{fold}")
    print(cohen_kappa_score(y_test, y_pred_test))
    print(confusion_matrix(y_test, y_pred_test))
    print(classification_report(y_test, y_pred_test))
    roc_auc = roc_auc_score(y_test, y_dec_score_test)
    print(f"ROC-AUC score: {roc_auc!s}")
    precision, recall, _ = precision_recall_curve(y_test, y_dec_score_test)
    pr_auc = auc(recall, precision)
    print(f"PR-AUC score: {pr_auc!s}")

Testing results: fold-0
0.5378398236590742
[[154  21]
 [ 30  54]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86       175
           1       0.72      0.64      0.68        84

    accuracy                           0.80       259
   macro avg       0.78      0.76      0.77       259
weighted avg       0.80      0.80      0.80       259

ROC-AUC score: 0.8385034013605444
PR-AUC score: 0.7334788351750007
Testing results: fold-1
0.47919198055893075
[[155  19]
 [ 35  45]]
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       174
           1       0.70      0.56      0.62        80

    accuracy                           0.79       254
   macro avg       0.76      0.73      0.74       254
weighted avg       0.78      0.79      0.78       254

ROC-AUC score: 0.8297413793103449
PR-AUC score: 0.6621598244684517
Testing results: fold-2
0.4584511887470376
[[153  21]
 [ 35  44]]
             

In [16]:
# Print the per-fold markings stored in the .npy files.
for i in range(5):
    # Pass allow_pickle=True on the call itself instead of temporarily
    # replacing np.load: a failed load would otherwise leave the global
    # np.load patched, and re-running the cell would then wrap the wrapper.
    # The files appear to hold a pickled Python object (the printed value is a
    # dict). Unpickling can execute arbitrary code, so only load files that
    # were produced locally.
    train_marked = np.load('./dataset/train_val_markings_fold_'+str(i)+'.npy', allow_pickle=True)
    print(train_marked)

{'aldosterone': 0, 'cortisol': 0, 'estrogen': 1, 'glucagon': 0, 'insulin': 1, 'luteinizing hormone': 0, 'progesterone': 1, 'prolactin': 0, 'prostaglandins': 1, 'somatostatin': 0, 'testosterone': 1, 'triiodothyronin t3': 0, 'thyroxine t4': 0, 'adrenocorticotropin': 0, 'vascular endothelial growth factor': 1, 'norepinephrine': 1, 'adiponectin': 1, 'adrenaline/epinephrine': 0, 'somatotrophin/growth hormone': 1, 'serotonin/5-hydroxytryptamine': 1, 'vitamin d/calciferol': 1, 'follicle-stimulating hormone/follitropin': 0}
{'aldosterone': 0, 'cortisol': 0, 'estrogen': 1, 'glucagon': 0, 'insulin': 1, 'luteinizing hormone': 0, 'progesterone': 1, 'prolactin': 0, 'prostaglandins': 1, 'somatostatin': 0, 'testosterone': 1, 'triiodothyronin t3': 0, 'thyroxine t4': 0, 'adrenocorticotropin': 0, 'vascular endothelial growth factor': 1, 'norepinephrine': 1, 'adiponectin': 0, 'adrenaline/epinephrine': 0, 'somatotrophin/growth hormone': 1, 'serotonin/5-hydroxytryptamine': 1, 'vitamin d/calciferol': 1, 'fo

In [24]:
# Hormone -> flag markings for fold 4, written out as JSON so they can be read
# back with json.load. Values are hard-coded here.
train_marked = {
    'aldosterone': 0,
    'cortisol': 1,
    'estrogen': 1,
    'glucagon': 0,
    'insulin': 1,
    'luteinizing hormone': 0,
    'progesterone': 1,
    'prolactin': 0,
    'prostaglandins': 1,
    'somatostatin': 0,
    'testosterone': 1,
    'triiodothyronin t3': 0,
    'thyroxine t4': 0,
    'adrenocorticotropin': 0,
    'vascular endothelial growth factor': 1,
    'norepinephrine': 1,
    'adiponectin': 0,
    'adrenaline/epinephrine': 0,
    'somatotrophin/growth hormone': 1,
    'serotonin/5-hydroxytryptamine': 1,
    'vitamin d/calciferol': 1,
    'follicle-stimulating hormone/follitropin': 0,
}
# The with-block closes the file on exit, so no explicit close() is needed.
with open('./dataset/train_val_marking_fold_4.json', 'w') as outfile:
    json.dump(train_marked,outfile)