In [1]:
import pandas as pd
import networkx as nx
from collections import Counter
from itertools import combinations, permutations, product

import warnings
warnings.filterwarnings('ignore')

In [27]:
# read in data

final_df = pd.read_csv("final_network/final_df.csv")

final_drug_drug = pd.read_csv("final_network/final_drug_drug.csv")

final_drug_se = pd.read_csv("final_network/final_drug_se.csv")

dates = pd.read_csv("final_network/drug-ade-date.csv")

In [31]:
dates = dates.dropna()

In [32]:
dates['year'] = dates['oldest date'].apply(lambda x: x.split(',')[0])

In [3]:
# create graphs

final_G = nx.from_pandas_edgelist(final_df, 'drug_name', 'target')
dd_final_G = nx.from_pandas_edgelist(final_drug_drug, 'drug_name', 'target')

# Calulate Similarity and Centrality Metrics

### jaccard coefficient, dice index, adar index, simpson index, geometric index

### absolute difference, product, and sum of degree centralities of corresponding drug and ADE vertices involved in each link

### average Jaccard similarity of the corresponding drug with all of the drugs connected to the ADE 

constructed and used a network including only the drugs (and no ADEs) and extracted Jaccard similarities of each drug with all of those connected drugs

### average distance from the corresponding drug to all of the drugs connected to the ADE (ie, the second derived variable)

For each drug - ade pair, calculate similarity metrics and above metrics then the output is a binary target variable which indicates if the association exists or not according to medline. 

In [4]:
# num unique drugs

print(len(final_drug_se.drug_name.unique()))

# num unique se

print(len(final_drug_se.target.unique()))


# total num possibilities

print(414*109)

# total num links

print(len(final_df))

414
109
45126
25159


In [5]:
# filter to only top 10 most common SEs

top_common_se = [c[0] for c in Counter(final_drug_se.target).most_common(10)]

final_drug_se = final_drug_se[final_drug_se["target"].isin(top_common_se)]
final_df = final_df[final_df["target"].isin(top_common_se)]

In [6]:
# # create graphs

# final_G = nx.from_pandas_edgelist(final_df, 'drug_name', 'target')
# dd_final_G = nx.from_pandas_edgelist(final_drug_drug, 'drug_name', 'target')

In [7]:
drug_col = []
se_col = []
link = []

for drug in final_drug_se.drug_name.unique():
    for side in final_drug_se.target.unique():
        drug_col.append(drug)
        se_col.append(side)
        if final_drug_se[(final_drug_se['drug_name'] == drug) & (final_drug_se['target'] == side)].empty:
            link.append(0)
        else:
            link.append(1)
        
        


In [8]:
df = pd.DataFrame({"drug": drug_col, "ade": se_col, "link": link})

In [9]:
# calculate jaccard coefficient, adamic adar coefficient, and pref attachement
# for every drug-ade pair

da_pairs = [(drug, ade) for drug, ade in zip(df.drug, df.ade)]

jpred = nx.jaccard_coefficient(final_G, da_pairs)
aindex = nx.adamic_adar_index(final_G, da_pairs)
pref = nx.preferential_attachment(final_G, da_pairs)

jaccard_coef = []
for u, v, p in jpred:
    jaccard_coef.append(p)
    
aa_index = []
for u, v, p in aindex:
    aa_index.append(p)
    
pref_index = []
for u, v, p in pref:
    pref_index.append(p)
    
    

In [10]:
df['jaccard'] = jaccard_coef
df['adamic_adar'] = aa_index
df['pref_attach'] = pref_index

In [11]:
# centrality calculations - degree, eigenvenctor, 

sum_deg = []
eig_deg = []
close_deg = []

deg_dict = nx.degree_centrality(final_G)
eig_dict = nx.eigenvector_centrality(final_G)
close_dict = nx.closeness_centrality(final_G)

for drug, target in zip(df.drug, df.ade):
    sum_deg.append(deg_dict[drug] + deg_dict[target])
    eig_deg.append(eig_dict[drug] + eig_dict[target])
    close_deg.append(close_dict[drug] + close_dict[target])
    

In [12]:
df['deg'] = sum_deg
df['eig'] = eig_deg
df['close'] = close_deg

In [13]:
df

Unnamed: 0,drug,ade,link,jaccard,adamic_adar,pref_attach,deg,eig,close
0,bupropion,Cardiac disorder,1,0.109649,5.795098,11310,0.483748,0.137383,1.095237
1,bupropion,Myocardial infarction,1,0.140541,5.884129,8874,0.403442,0.116769,1.058802
2,bupropion,Tachycardia,1,0.160772,11.707961,17574,0.690249,0.200405,1.190730
3,bupropion,Supraventricular tachycardia,1,0.098039,2.359099,3132,0.214149,0.076887,0.969717
4,bupropion,Myocardial ischaemia,0,0.095238,2.444771,3306,0.219885,0.077117,0.981269
...,...,...,...,...,...,...,...,...,...
4085,tenoxicam,Bradycardia,0,0.037037,2.233550,5655,0.428298,0.091460,1.015177
4086,tenoxicam,Cardiac failure,1,0.076271,2.525597,2842,0.242830,0.045026,0.942859
4087,tenoxicam,Cardiac arrest,0,0.020833,0.822007,3422,0.281071,0.061069,0.970909
4088,tenoxicam,Ventricular tachycardia,0,0.008333,0.286000,2668,0.231358,0.057804,0.940313


In [36]:
dates.head()

Unnamed: 0,Drug_Name,Target,earlist data,oldest date,year
0,bupropion,Cardiovascular disorder,"2022, 9, 18","2016, 6, 2",2016
2,bupropion,Cardiac disorder,"2022, 3, 12","2012, 1, 17",2012
3,bupropion,Myocardial infarction,"2021, 7, 27","2000, 9, 30",2000
4,bupropion,Tachycardia,"2022, 3, 12","2002, 10, 26",2002
5,bupropion,Supraventricular tachycardia,"2020, 1, 11","2008, 7, 29",2008


In [57]:
clf_df = pd.merge(df, dates, \
                  left_on = ['drug', 'ade'], \
                  right_on = ['Drug_Name', 'Target'], \
                  how='left')

In [58]:
clf_df = clf_df.drop(['Drug_Name', 'Target', 'earlist data', 'oldest date'], axis=1)

In [63]:
clf_df.year = pd.to_numeric(clf_df.year).fillna(0)

In [65]:
from sklearn.linear_model import LogisticRegression

In [66]:
train_X = clf_df[clf_df["year"] < 2015][["jaccard", 'adamic_adar', 'pref_attach', 'deg', 'eig', 'close']]
train_y = clf_df[clf_df["year"] < 2015]['link']

test_X = clf_df[clf_df["year"] >= 2015][["jaccard", 'adamic_adar', 'pref_attach', 'deg', 'eig', 'close']]
test_y = clf_df[clf_df["year"] >= 2015]['link']

In [67]:
train_X

Unnamed: 0,jaccard,adamic_adar,pref_attach,deg,eig,close
0,0.109649,5.795098,11310,0.483748,0.137383,1.095237
1,0.140541,5.884129,8874,0.403442,0.116769,1.058802
2,0.160772,11.707961,17574,0.690249,0.200405,1.190730
3,0.098039,2.359099,3132,0.214149,0.076887,0.969717
4,0.095238,2.444771,3306,0.219885,0.077117,0.981269
...,...,...,...,...,...,...
4085,0.037037,2.233550,5655,0.428298,0.091460,1.015177
4086,0.076271,2.525597,2842,0.242830,0.045026,0.942859
4087,0.020833,0.822007,3422,0.281071,0.061069,0.970909
4088,0.008333,0.286000,2668,0.231358,0.057804,0.940313


In [68]:
clf = LogisticRegression().fit(train_X, train_y)

In [69]:
clf.score(train_X, train_y)

0.7227926078028748

In [70]:
clf.score(test_X, test_y)

0.13917525773195877

In [73]:
Counter(train_y)

Counter({1: 1174, 0: 2722})

In [74]:
Counter(test_y)

Counter({1: 194})

In [None]:
se_groups = final_drug_se.groupby("target", as_index=False).agg({"drug_name": list})

dd_jaccard = []
  

for drug, target in zip(df.drug, df.ade):
    try:
        drug_list = se_groups[se_groups["target"] == target]["drug_name"].iloc[0]
        jacpred = nx.jaccard_coefficient(dd_final_G, list(permutations(drug_list, 2)))

        jaccard_coef = []
        for u, v, p in jacpred:
            if u == drug:
                jaccard_coef.append(p)

        dd_jaccard.append(sum(jaccard_coef))
    except:
        print(target)
    
    

Myocardial infarction


In [None]:
dd_jaccard

In [68]:
# derived values

# get jaccard similarity and distance between each drug and drugs connected via ade
dd_jaccard = []


for drug in 
# drug drug network
nx.jaccard_coeficients(dd_final_G, dd_pairs)

[0.9727729450218772,
 0.8659033489222169,
 1.095237423193,
 1.0588016318204998,
 1.1907295597484278,
 0.9697168092930543,
 0.95097275106846,
 1.0734372292395828,
 1.0370014378670827,
 1.1689293657950106,
 0.9594691667900355,
 1.0586097557298706,
 0.8858087558346558,
 0.8962643572712927,
 1.1541018922852984,
 0.8826643244242778,
 0.8243019391346776,
 0.9033953736402657,
 0.787153903234972,
 1.187492658544205,
 1.0797872846246257,
 0.8900064200120948,
 0.9598247115894174,
 1.0511539115545379,
 1.1466460481099656,
 1.0837403905392011,
 1.215668318467129,
 0.9946555680117557,
 1.006208119462154,
 1.0402318086806717,
 1.053481956708821,
 0.9689503543918451,
 1.1489740932642487,
 1.0582659522452955,
 0.9966702766636373,
 1.0591122055349897,
 1.1155936525735834,
 1.2110857891290112,
 1.0285162353232369,
 0.8685692807617565,
 0.8685692807617565,
 1.1033804152094322,
 0.9135995505969011,
 0.8812858359783904,
 1.0659050832020405,
 0.9813734808850646,
 0.970846832160569,
 1.0294692918295403,
 1.1

In [59]:
final_drug_se

Unnamed: 0,drug_name,target
0,bupropion,Cardiovascular disorder
1,bupropion,Cardiac flutter
2,bupropion,Cardiac disorder
3,bupropion,Myocardial infarction
4,bupropion,Tachycardia
...,...,...
2003,piroxicam,Myocardial infarction
2004,piroxicam,Tachycardia
2005,tenoxicam,Cardiac disorder
2006,tenoxicam,Cardiac failure
