In [None]:
import json
import csv
import numpy as np
import pandas as pd
import random
import fasttext
import math
import operator
import itertools
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from collections import Counter
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import MinMaxScaler
from joblib import Parallel, delayed
from statistics import mean
# Pre-trained fastText model (BioWordVec) used for every word embedding below.
model = fasttext.load_model("BioWordVec_PubMed_MIMICIII_d200.bin")

# A dict containing the hormone and the lists of 'source' and 'target' genes
# associated with it in the HGv1 database. It is loaded before the alias
# embeddings are built because that loop iterates over its keys.
with open('./BioEmbedS_TS/dataset/hgv1_hormone_src_tgt_genes.json') as json_file:
    hormone_src_tgt_genes = json.load(json_file)

# A dict containing the hormone and the list of genes associated with it in the HGv1 database.
with open('./BioEmbedS_dummy/dataset/hgv1_hormone_genes.json') as json_file:
    hormone_genes = json.load(json_file)

# Get the embedding for hormones having aliases by adding the original and alias word embeddings.
# Hormones having aliases have a "/" in between the two alias names.
alias_embeddings = dict()
for hormone in hormone_src_tgt_genes:
    if "/" in hormone:
        parts = hormone.split("/")
        w1 = model.get_word_vector(parts[0])
        w2 = model.get_word_vector(parts[1])
        alias_embeddings[hormone] = np.add(w1, w2)

# This is a list of genes that are associated with multiple hormones (one gene per line).
# Only the line terminator is stripped, so the last gene stays intact even when
# the file does not end with a newline.
with open('./BioEmbedS_TS/dataset/genes_associated_with_multiple_hormones.txt', 'r') as f:
    dup_genes = [line.rstrip('\n') for line in f]

    
def transform_X_values(pairs):
    """Turn (hormone, gene) pairs into a matrix of concatenated embeddings.

    Each entry of ``pairs`` is indexed as ``entry[0]`` (hormone name) and
    ``entry[1]`` (gene name). A hormone whose name contains "/" is looked up
    in the module-level ``alias_embeddings`` dict; any other hormone, and the
    lower-cased gene name, are embedded with the module-level fastText
    ``model``.

    Returns a NumPy array with one row per pair: the hormone embedding
    followed by the gene embedding.
    """
    def hormone_vector(name):
        # Aliased hormones ("a/b") have a precomputed embedding.
        return alias_embeddings[name] if "/" in name else model.get_word_vector(name)

    rows = [
        np.concatenate([hormone_vector(entry[0]), model.get_word_vector(entry[1].lower())])
        for entry in pairs
    ]
    return np.array(rows)

def get_oversampled_train_data(src_train_data, tgt_train_data, dup_genes, random_state=None):
    """Build an oversampled source-vs-target training set of hormone-gene pairs.

    For every hormone, genes listed in ``dup_genes`` (genes associated with
    multiple hormones) are set aside. A hormone is used only if it keeps at
    least 3 source genes and 3 target genes (the minimum SMOTE needs with
    ``k_neighbors=2``). The source genes and the target genes of each such
    hormone form two separate classes, which are balanced with SMOTE-Tomek on
    the gene embeddings. Finally the set-aside genes of the used hormones are
    added back, once each, with their real embeddings.

    Relies on the module-level fastText ``model`` and ``alias_embeddings``.

    Parameters
    ----------
    src_train_data : dict
        Hormone name -> iterable of source gene names.
    tgt_train_data : dict
        Hormone name -> iterable of target gene names. Must contain every
        hormone that is a key of ``src_train_data``.
    dup_genes : iterable of str
        Genes associated with multiple hormones; excluded from oversampling.
    random_state : int, RandomState instance or None, optional
        Seed forwarded to SMOTE / SMOTETomek. The default ``None`` keeps the
        resampling unseeded.

    Returns
    -------
    X_train : numpy.ndarray
        One row per sample: hormone embedding followed by gene embedding.
    y_train : numpy.ndarray of int32
        1 for a source gene, 0 for a target gene.
    train_marked : dict
        Hormone name -> 1 if the hormone was used for training, else 0.

    Raises
    ------
    ValueError
        If no hormone has enough eligible source and target genes.
    """
    k_neighbors = 2
    # SMOTE needs k_neighbors + 1 samples in every class.
    min_genes_per_class = k_neighbors + 1
    # Set membership is O(1); the list was being scanned once per gene.
    dup_gene_set = set(dup_genes)

    train_marked = dict()
    # SMOTE class id -> (hormone, label), where label is 1 (source) or 0 (target).
    class_info = dict()
    src_duplicate_genes = dict()
    tgt_duplicate_genes = dict()
    X_train_smote = []
    y_train_smote = []
    next_class_id = 1

    for hormone in src_train_data:
        src_eligible, src_duplicate_genes[hormone] = _split_duplicate_genes(
            src_train_data[hormone], dup_gene_set)
        tgt_eligible, tgt_duplicate_genes[hormone] = _split_duplicate_genes(
            tgt_train_data[hormone], dup_gene_set)

        if len(src_eligible) < min_genes_per_class or len(tgt_eligible) < min_genes_per_class:
            train_marked[hormone] = 0
            continue

        # The source and target genes of a hormone are two distinct SMOTE classes.
        train_marked[hormone] = 1
        for label, genes in ((1, src_eligible), (0, tgt_eligible)):
            class_info[next_class_id] = (hormone, label)
            for gene in genes:
                X_train_smote.append(model.get_word_vector(gene))
                y_train_smote.append(next_class_id)
            next_class_id += 1

    if not X_train_smote:
        raise ValueError(
            "No hormone has at least %d eligible source and %d eligible target genes; "
            "cannot apply SMOTE." % (min_genes_per_class, min_genes_per_class))

    # Transform the dataset using SMOTE-Tomek.
    smote_strategy = SMOTETomek(
        smote=SMOTE(k_neighbors=k_neighbors, random_state=random_state),
        random_state=random_state)
    X_dataset_oversampled, y_dataset_oversampled = smote_strategy.fit_resample(
        np.array(X_train_smote), np.array(y_train_smote))
    print(Counter(y_dataset_oversampled))

    # Pair every (possibly synthetic) gene embedding with its hormone embedding.
    # The label rides along as the last column and is split off at the end.
    # The (hormone, label) pair is looked up by class id rather than parsed from
    # a combined string, so hormone names containing "source", "target" or "#"
    # cannot be misread.
    X_train = []
    hormone_vectors = dict()  # also records, in first-seen order, the hormones present after resampling
    for class_id, gene_embedding in zip(y_dataset_oversampled, X_dataset_oversampled):
        hormone, label = class_info[class_id]
        if hormone not in hormone_vectors:
            hormone_vectors[hormone] = _hormone_embedding(hormone)
        X_train.append(np.concatenate([hormone_vectors[hormone], np.append(gene_embedding, label)]))

    # Add back the genes associated with multiple hormones which were removed earlier.
    # Each hormone is visited once, so each of its set-aside genes is added
    # exactly once (visiting the source class and the target class separately
    # would add every such gene twice).
    for hormone, hormone_vector in hormone_vectors.items():
        for label, genes in ((1, src_duplicate_genes[hormone]), (0, tgt_duplicate_genes[hormone])):
            for gene in genes:
                gene_embedding = np.append(model.get_word_vector(gene), label)
                X_train.append(np.concatenate([hormone_vector, gene_embedding]))

    X_train = np.array(X_train)
    y_train = X_train[:, -1].astype(np.int32)
    X_train = np.delete(X_train, -1, axis=1)

    print("Train shape")
    print(X_train.shape)
    print(y_train.shape)

    return X_train, y_train, train_marked


def _split_duplicate_genes(genes, dup_gene_set):
    """Split ``genes`` into (eligible, duplicate) lists, preserving order."""
    eligible = []
    duplicates = []
    for gene in genes:
        (duplicates if gene in dup_gene_set else eligible).append(gene)
    return eligible, duplicates


def _hormone_embedding(hormone):
    """Return the embedding of a hormone; aliased names ("a/b") use ``alias_embeddings``."""
    if "/" in hormone:
        return alias_embeddings[hormone]
    return model.get_word_vector(hormone)

In [11]:
# Split the HGv1 annotations into one dict per role (hormone -> gene list).
# The gene lists are copied so the loaded JSON structure is not shared.
hormone_src_genes = {
    hormone: list(genes['source']) for hormone, genes in hormone_src_tgt_genes.items()
}
hormone_tgt_genes = {
    hormone: list(genes['target']) for hormone, genes in hormone_src_tgt_genes.items()
}

# Train on all hormone-gene pairs; _train_marked records which hormones were
# usable for training and is needed later to filter the novel predictions.
X_train, y_train, _train_marked = get_oversampled_train_data(hormone_src_genes, hormone_tgt_genes, dup_genes)

np.save('./BioEmbedS_TS/dataset/X_train_all_pairs.npy', X_train)
np.save('./BioEmbedS_TS/dataset/y_train_all_pairs.npy', y_train)

# SVM hyper-parameters; probability=True is required for predict_proba later on.
param = {
    'C': 1.0,
    'degree': 3,
    'kernel': 'poly',
    'probability': True,
}
classifier = SVC()
classifier.set_params(**param)
classifier.fit(X_train, y_train)

# Use a context manager so the model file is flushed and closed deterministically.
with open('./BioEmbedS_TS/models/bioembeds-ts_novel_predictions_model.sav', 'wb') as model_file:
    pickle.dump(classifier, model_file)

In [27]:
# Collect the (hormone, gene) pairs to classify as source/target.
# Requires _train_marked from the training cell (get_oversampled_train_data).
# A row is kept when column 4 equals "1" (presumably the positive BioEmbedS
# prediction flag - confirm against the CSV header) and its hormone (column 0)
# was usable for training.
# newline='' is what the csv module expects for files passed to csv.reader.
with open('./BioEmbedS_dummy/all_novel_predictions.csv', 'rt', newline='') as f:
    data = csv.reader(f)
    next(data, None)  # skip the header row; tolerate an empty file
    all_positive_preds = [
        (row[0], row[1])
        for row in data
        if row[4] == str(1) and _train_marked[row[0]] == 1
    ]

X_test_all_genes = transform_X_values(all_positive_preds)
print(X_test_all_genes.shape)

# Predicted label, signed SVM decision score and class probabilities per pair.
y_pred_all_genes = classifier.predict(X_test_all_genes)
y_dec_func_all_genes = classifier.decision_function(X_test_all_genes)
y_proba_all_genes = classifier.predict_proba(X_test_all_genes)

NameError: name '_train_marked' is not defined

In [None]:
# One result row per classified pair: hormone, gene, whether the pair is already
# in HGv1, the predicted role, the SVM decision score and the probability in
# column 1 of predict_proba (class 1, i.e. 'source', given the 0/1 training labels).
all_genes_results = []
for pair, y_pred, y_dec, y_prob in zip(all_positive_preds, y_pred_all_genes, y_dec_func_all_genes, y_proba_all_genes):
    hormone, gene = pair
    inHgv1 = "Yes" if gene.lower() in hormone_genes[hormone] else "No"
    pred = 'source' if y_pred == 1 else 'target'
    all_genes_results.append((hormone, gene, inHgv1, pred, y_dec, y_prob[1]))

# newline='' stops the csv module's own line endings from being translated
# again, which would otherwise produce blank rows on some platforms.
with open('./BioEmbedS_TS/bioembedsts_novel_predictions.csv', mode='w', newline='') as pred_file:
    writer = csv.writer(pred_file, delimiter=',')
    header = ['Hormone','Gene','Is pair in Hgv1?','Prediction','SVM score','SVM probability']
    writer.writerow(header)
    writer.writerows(all_genes_results)
