In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_curve, roc_auc_score,recall_score,f1_score,precision_score
from sklearn.model_selection import train_test_split

In [2]:
os.chdir("C:/Users/lucie/OneDrive/Documents/Documents/ENSTA/2A/Pre-Travail/Documentation/Tache 1")

df_on = pd.read_csv("Data/adverse_reactions.csv")
df_off = pd.read_csv("Data/OFFSIDES.csv")
df_two = pd.read_csv('Data/TWOSIDES.csv', nrows=1000000)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
df_two = pd.read_csv('Data/TWOSIDES.csv', nrows=500000)
df_two = pd.concat([df_two.iloc[:1000], df_two.iloc[1001:]], ignore_index=True)
df_two['drug_1_rxnorn_id'].astype(int)
df_two['drug_2_rxnorm_id'].astype(int)
df_two['condition_meddra_id'].astype(int)

0         10003239
1         10003239
2         10003239
3         10012735
4         10012735
            ...   
499994    10012735
499995    10010774
499996    10013946
499997    10000060
499998    10000081
Name: condition_meddra_id, Length: 499999, dtype: int32

In [3]:
#Preprocessing
df_off = pd.concat([df_off.iloc[:200000], df_off.iloc[200001:]], ignore_index=True)
df_two = pd.concat([df_two.iloc[:1000], df_two.iloc[1001:]], ignore_index=True)

df_off['drug_rxnorn_id'].astype(int)
df_off['condition_meddra_id'].astype(int)

df_two['drug_1_rxnorn_id'].astype(int)
df_two['drug_2_rxnorm_id'].astype(int)
df_two['condition_meddra_id'].astype(int)

0         10003239
1         10003239
2         10003239
3         10012735
4         10012735
            ...   
999994    10049079
999995    10034902
999996    10046996
999997    10003601
999998    10052471
Name: condition_meddra_id, Length: 999999, dtype: int32

In [4]:
#Build the frequences matrix for OffSIDES
# Create the frequence matrix using OffSIDES data
drugs_off = df_off['drug_rxnorn_id'].unique()
AE_off = df_off['condition_meddra_id'].unique()

F = np.zeros((len(drugs_off), len(AE_off)))
for k in tqdm(range(df_off.shape[0])):
    i = np.where(drugs_off == df_off['drug_rxnorn_id'][k])
    j = np.where(AE_off == df_off['condition_meddra_id'][k])
    F[i, j] = df_off['mean_reporting_frequency'][k]

100%|██████████| 3206557/3206557 [1:01:59<00:00, 862.05it/s]


In [60]:
#Build a list of name in Twosides
drugs_two=[]
drugs_two_names=[]
for i in range(df_two.shape[0]):
    drug1=df_two['drug_1_rxnorn_id'][i]
    drug2=df_two['drug_2_rxnorm_id'][i]
    drug1_name=df_two['drug_1_concept_name'][i]
    drug2_name=df_two['drug_2_concept_name'][i]
    if [drug1,drug2] not in drugs_two :
        drugs_two.append([drug1,drug2])
        drugs_two_names.append([drug1_name,drug2_name])


In [6]:
#Build the frequence matrix for TwoSIDES
drugs_two=[]
for i in range(df_two.shape[0]):
    drug1=df_two['drug_1_rxnorn_id'][i]
    drug2=df_two['drug_2_rxnorm_id'][i]
    if [drug1,drug2] not in drugs_two :
        drugs_two.append([drug1,drug2])

N_drugs_two = len(drugs_two)


F_valid = np.zeros((N_drugs_two, len(AE_off)))

for k in tqdm(range(df_two.shape[0])):
    i =0
    for l, sous_liste in enumerate(drugs_two):
        if sous_liste[0] == df_two['drug_1_rxnorn_id'][k] and sous_liste[1] == df_two['drug_2_rxnorm_id'][k]:
            i=l
    j = np.where(AE_off == df_two['condition_meddra_id'][k])
    if type(df_two['mean_reporting_frequency'][k]) != str:
        F_valid[i, j] = df_two['mean_reporting_frequency'][k]
    else:
        df_two['mean_reporting_frequency'][k] = float(df_two['mean_reporting_frequency'][k])
        F_valid[i, j] = df_two['mean_reporting_frequency'][k]



100%|██████████| 499999/499999 [2:52:15<00:00, 48.38it/s]    


In [7]:
#Building the response variable
#Build a function to generalize the model for a massive amount of AE

def build_response(AE="Hypertension"):
    #Get the meddra_id associate to the AE
    inter1=df_on['pt_meddra_id'][df_on['pt_meddra_term']==AE].unique()
    inter2=df_on['pt_meddra_id'][df_on['pt_meddra_term']==AE.lower()].unique()
    if len(inter1>0):
        med_id=inter1[0]
    elif len(inter2)>0:
        med_id=inter2[0]
    else:
        print('Error : no match for this AE')
    
    AE_drug = df_on['ingredients_rxcuis'][df_on['pt_meddra_id']==med_id]
    AE_drug=AE_drug[df_on['num_ingredients']==1]
    AE_drug=[int(AE) for AE in AE_drug]
    class_drugs = np.zeros(len(drugs_off))
    k = 0
    for doff in drugs_off:
        for drug in AE_drug :
            if doff in AE_drug:
                class_drugs[k] = 1
                break
        k += 1
    return([class_drugs,sum(class_drugs)])  #Return the response variable and the number of positive labels

In [73]:
#Build a fonction which train the model, get the different score and predict on TwoSIDES data

def full_training(AE):
    #First we get the best parameter for the model
    def trainmodel(C,X,Y):
        model=LogisticRegression(class_weight='balanced',warm_start=True,C=C)
        return(np.mean(cross_val_score(model,X,Y,cv=5,scoring='recall')))
    
    pen_C=np.linspace(10**-1,10,50)
    score=[]
    X=F
    Y=build_response(AE)[0]
    for i in range(50):
        score.append(trainmodel(C=pen_C[i],X=X,Y=Y))
    
    C=pen_C[score.index(max(score))]

    scores=[max(score)]
    model=LogisticRegression(class_weight='balanced',C=C)

    model.fit(X,Y)
    Y_pred=model.predict_proba(F_valid)
    scores.append(Y_pred)

    #Delete the positive labels with at least one drug known as responsible for febrile neutropenia in Onsides
    preds=scores[1]

    #Get the meddra_id associate to the AE
    inter1=df_on['pt_meddra_id'][df_on['pt_meddra_term']==AE].unique()
    inter2=df_on['pt_meddra_id'][df_on['pt_meddra_term']==AE.lower()].unique()
    if len(inter1>0):
        med_id=inter1[0]
    elif len(inter2)>0:
        med_id=inter2[0]
    else:
        print('Error : no match for this AE')
    AE_drug = df_on['ingredients_rxcuis'][df_on['pt_meddra_id']==med_id]
    AE_drug=AE_drug[df_on['num_ingredients']==1]
    AE_drug=[int(AE) for AE in AE_drug]

    for i in range(len(preds_neutro)):
        if preds[i][1]>0.5:    
            drug1=drugs_two[k][0]
            drug2=drugs_two[k][1]
            for drug in AE_drug :
                if (drug1 in AE_drug) or (drug2 in AE_drug):
                    preds[i][1]=0
                    preds[i][0]=1
                    break

    return(scores)
        

In [74]:
#Apply the full training for Febrile Neutropenia
neutro=full_training('Febrile neutropenia')

In [78]:
#Get the names of the most likely drug-drug interaction candidates 
sorted_probs_neutro=sorted([x[1] for x in neutro[1]],reverse=True)
indexes_associated_neutro = [i for i, x in sorted(enumerate(neutro[1]), key=lambda x: x[1][1])]
print([drugs_two_names[i] for i in indexes_associated_neutro])

[['Furosemide', 'digoxin antibodies Fab fragments'], ['Papaverine', 'Sodium Bicarbonate'], ['clopidogrel', 'Isoflurane'], ['Simvastatin', 'Mannitol'], ['Metoprolol', 'gadoteridol'], ['Glipizide', 'Aprotinin'], ['Cefazolin', 'Mannitol'], ['Oseltamivir', 'Quinine'], ['Cephalexin', 'Omega-3 Fatty Acids'], ['Papaverine', 'Levofloxacin'], ['Sulfamethoxazole', 'Flurazepam'], ['Metoprolol', 'Isoflurane'], ['cerivastatin', 'rofecoxib'], ['Miconazole', 'glimepiride'], ['pirbuterol', 'cyclobenzaprine'], ['Labetalol', 'Cefuroxime'], ['Prazosin', 'fluvastatin'], ['Folic Acid', 'gadoteridol'], ['Nitroglycerin', 'gadoteridol'], ['zaleplon', 'Insulin Lispro'], ['sitagliptin', 'Cefuroxime'], ['Enalapril', 'Amisulpride'], ['Bisoprolol', 'metaxalone'], ['Triazolam', 'Clonidine'], ['Methylprednisolone', 'esmolol'], ['heparin', 'Cefuroxime'], ['Epinephrine', 'Scopolamine'], ['cefpodoxime', 'Baclofen'], ['venlafaxine', 'linezolid'], ['ramelteon', 'Budesonide'], ['Midazolam', 'Carbidopa'], ['Thiamine', 'Dox