### Imports

In [2]:
import pandas as pd

### Read STITCH DDI

In [3]:
# Location of the STITCH v5.0 "detailed" chemical-chemical links TSV, relative to this notebook.
file_path = '../../raw_data/stitch/chemical_chemical.links.detailed.v5.0.tsv/chemical_chemical.links.detailed.v5.0.tsv'

https://github.com/ChemGuy88/cpi/blob/master/Dataset%20Guide.md

Chemical ID prefix
The chemical IDs are the values that begin with CID(2). According to the STITCH README:

CIDs / CID0... - this is a stereo-specific compound, and the suffix is the PubChem compound id.

CIDm / CID1... - this is a "flat" compound, i.e. with merged stereo-isomers. The suffix (without the leading "1") is the PubChem compound id.

(Note that the download files contain the prefixes CIDs/CIDm, while the API still returns CID0/CID1.)

In [4]:
# Parse the tab-separated STITCH links file into a DataFrame.
df = pd.read_csv(file_path, sep='\t')

In [5]:
# Peek at the last rows to check the parsed columns and ID prefixes.
df.tail()

Unnamed: 0,chemical1,chemical2,similarity,experimental,database,textmining,combined_score
17705813,CIDm00000001,CIDm87081431,753,0,0,173,173
17705814,CIDm00000001,CIDm90857042,0,0,0,218,218
17705815,CIDm00000001,CIDm91056687,0,0,0,294,294
17705816,CIDm00000001,CIDm91213096,0,0,0,357,357
17705817,CIDs00024759,CIDm00024759,0,0,900,0,900


In [6]:
# Keep only links where BOTH compounds are stereo-specific, i.e. their ID starts
# with the download prefix "CIDs" followed by at least 8 digits (str.match anchors
# at the start of the string only). Then rewrite that prefix to "CID0" so the IDs
# use the same form as the SIDER 'STITCH ID STEREO' column.
import re  # NOTE(review): not used in this cell; kept so nothing that relies on it breaks

# Raw string so that "\d" reaches the regex engine instead of being treated as a
# (deprecated) Python string escape.
STEREO_ID_PATTERN = r'CIDs\d{8}'
is_stereo_pair = (
    df['chemical1'].str.match(STEREO_ID_PATTERN)
    & df['chemical2'].str.match(STEREO_ID_PATTERN)
)
# .copy() detaches the filtered rows from the original frame, so the column
# assignments below act on a real DataFrame and do not raise SettingWithCopyWarning.
df = df[is_stereo_pair].copy()
# Literal (non-regex) replacement of the prefix only; regex=False makes the
# behaviour independent of the pandas version's default.
df['chemical1'] = df['chemical1'].str.replace('CIDs', 'CID0', regex=False)
df['chemical2'] = df['chemical2'].str.replace('CIDs', 'CID0', regex=False)

In [7]:
# Size and first rows of the frame after the stereo-ID filter and prefix rewrite.
print(df.shape)
df.head()

(9665844, 7)


Unnamed: 0,chemical1,chemical2,similarity,experimental,database,textmining,combined_score
1,CID091758695,CID000107694,0,0,0,230,230
2,CID091758695,CID011013287,0,0,0,230,230
3,CID091758695,CID011980957,0,0,0,328,328
4,CID091758695,CID000013078,0,0,0,162,162
5,CID091758695,CID000013109,0,0,0,468,468


### Read All SIDER DRUGS CID

In [8]:
# Load the prepared SIDER drug table and preview its first row.
# The 'STITCH ID STEREO' column holds the drug IDs used for filtering further down.
df_all_se = pd.read_csv('../../prep_data/drug_all_se_pubchem.csv')
df_all_se.head(1)

Unnamed: 0,STITCH ID STEREO,MEDRA TERM UMLS CONCEPT ID,se_count,cid,mw,mf,polararea,complexity,xlogp,heavycnt,...,gpfamilycnt,neighbortype,meshheadings,annothits,annothitcnt,aids,cidcdate,sidsrcname,depcatg,annotation
0,CID000000119,"['C0002792', 'C0030193', 'C0151828', 'C0002994...",5,119,103.12,C4H9NO2,63.3,62.7,-3.2,7,...,13769,2D+3D,gamma-Aminobutyric Acid,Biological Test Results|Chemical and Physical ...,15,155|157|161|165|167|175|190|248|328|357|410|41...,20040916,001Chemical|3B Scientific (Wuhan) Corp|3WAY PH...,Chemical Vendors|Curation Efforts|Governmental...,"COVID-19, COVID19, Coronavirus, Corona-virus, ..."


In [9]:
# STITCH stereo IDs of all SIDER drugs, as a plain Python list.
all_drugs_cid = df_all_se['STITCH ID STEREO'].tolist()

In [10]:
# Sanity check: no STITCH ID occurs more than once in the drug list.
assert len(all_drugs_cid) == len(set(all_drugs_cid))

### Filter STITCH DF to have only SIDER drugs

In [11]:
# Keep only links whose two endpoints are both in the SIDER drug list.
df = df[df['chemical1'].isin(all_drugs_cid) & df['chemical2'].isin(all_drugs_cid)]

In [12]:
# Preview the first remaining link.
df.head(1)

Unnamed: 0,chemical1,chemical2,similarity,experimental,database,textmining,combined_score
339566,CID071306834,CID010113978,0,0,0,153,153


In [13]:
# Flatten the links to [chemical1, chemical2, combined_score] rows, the input
# format expected by create_ddi; show the first five rows.
cci_data = df[['chemical1', 'chemical2', 'combined_score']].values.tolist()
cci_data[:5]

[['CID071306834', 'CID010113978', 153],
 ['CID071306834', 'CID010324367', 411],
 ['CID071306834', 'CID000104799', 230],
 ['CID071306834', 'CID011707110', 170],
 ['CID071306834', 'CID000126941', 172]]

### Create DDI Network

In [14]:
import networkx as nx

# Step 1: Create a NetworkX graph from the CCI data
def create_ddi(cci_data):
    """Build an undirected, weighted interaction graph.

    Parameters
    ----------
    cci_data : iterable of (node_a, node_b, score)
        One entry per link. Entries whose score is not strictly positive
        are ignored.

    Returns
    -------
    networkx.Graph
        Graph with one edge per kept link; the score is stored in the
        ``weight`` edge attribute.
    """
    network = nx.Graph()
    positive_links = (
        (source, target, strength)
        for source, target, strength in cci_data
        if strength > 0
    )
    # add_weighted_edges_from stores the third tuple element under 'weight'.
    network.add_weighted_edges_from(positive_links)
    return network

# Build the interaction graph from the filtered STITCH links.
cci_graph = create_ddi(cci_data)
    

### Step 2: Apply the Random Walk with Restart (RWR) algorithm

In [15]:
# Step 2: Perform Random Walk with Restart (RWR)
def random_walk_with_restart(graph, seed_node, personalized, restart_prob=0.8, epsilon=1e-6):
    """Score all nodes of ``graph`` relative to ``seed_node`` with personalized PageRank.

    Parameters
    ----------
    graph : networkx graph
        Graph whose edges carry a ``weight`` attribute.
    seed_node : hashable
        Node that receives personalization weight 1.0 for this run.
    personalized : dict
        Baseline personalization weights keyed by node. It is NOT modified:
        the seed weight is set on a private copy, so reusing the same dict
        for many seeds does not accumulate earlier seeds.
    restart_prob : float, default 0.8
        Forwarded unchanged as ``alpha`` to ``nx.pagerank``.
        NOTE(review): networkx documents ``alpha`` as the damping parameter,
        which would make the effective restart probability ``1 - restart_prob``.
        Confirm this is the intended meaning.
    epsilon : float, default 1e-6
        Forwarded as the convergence tolerance ``tol``.

    Returns
    -------
    dict
        Whatever ``nx.pagerank`` returns (node -> score).
    """
    # Copy first: writing into the caller's dict would leave every previously
    # used seed at weight 1.0 on all later calls.
    seed_personalization = dict(personalized)
    seed_personalization[seed_node] = 1.0
    return nx.pagerank(
        graph,
        alpha=restart_prob,
        personalization=seed_personalization,
        tol=epsilon,
        weight='weight',
    )

In [16]:
# Step 3: Select negative samples based on probability threshold
def select_negative_samples(probabilities, threshold=2.74e-4):
    """Return the nodes whose score lies strictly below ``threshold``.

    Parameters
    ----------
    probabilities : dict
        Mapping node -> score.
    threshold : float, default 2.74e-4
        Exclusive upper bound; a score equal to the threshold is not selected.

    Returns
    -------
    list
        Selected nodes, in the iteration order of ``probabilities``.
    """
    below_cutoff = []
    for candidate, score in probabilities.items():
        if score < threshold:
            below_cutoff.append(candidate)
    return below_cutoff

### Generate Negative Samples for all DrugIDs

In [17]:
def generate_negative_samples_rwr(graph, seed_nodes, restart_prob=0.8, epsilon=1e-6):
    """Select negative samples for every seed node, one RWR run per seed.

    For each seed, all nodes are scored with ``random_walk_with_restart`` and
    the nodes scoring strictly below the 25th percentile of that run's scores
    are returned as the seed's negative samples.

    Parameters
    ----------
    graph : networkx graph
        Interaction graph.
    seed_nodes : iterable
        Seeds to process.
        NOTE(review): seeds are not checked for membership in ``graph``;
        confirm how seeds without any edge should be handled.
    restart_prob, epsilon : float
        Forwarded to ``random_walk_with_restart``.

    Returns
    -------
    dict
        seed node -> list of negative-sample nodes.
    """
    total_nodes = graph.number_of_nodes()
    base_personalization = {node: 1/total_nodes for node in graph.nodes()}
    negative_samples = {}
    for seed_node in seed_nodes:
        # Hand each run its own copy: if the walker writes the seed weight into
        # the dict it receives, a shared dict would carry earlier seeds over.
        probabilities = random_walk_with_restart(
            graph, seed_node, dict(base_personalization), restart_prob, epsilon
        )
        # Explicit 25th percentile (same value as the '25%' row of describe()).
        quantile_25 = pd.Series(list(probabilities.values())).quantile(0.25)
        negative_samples[seed_node] = select_negative_samples(probabilities, quantile_25)
    return negative_samples

## multi-process version (uses multiprocessing.Pool)
from multiprocessing import Pool
from functools import partial

def _rwr_for_seed(seed_node, graph, personalized, restart_prob, epsilon):
    """Run one RWR with a private copy of ``personalized`` so seeds stay independent."""
    return random_walk_with_restart(graph, seed_node, dict(personalized), restart_prob, epsilon)


def generate_negative_samples_rwr_parallel(graph, seed_nodes, restart_prob=0.8, epsilon=1e-6, n_jobs=4):
    """Select negative samples for every seed node, running the RWRs in a process pool.

    For each seed, all nodes are scored with ``random_walk_with_restart`` and
    the nodes scoring strictly below the 25th percentile of that run's scores
    are returned as the seed's negative samples.

    Parameters
    ----------
    graph : networkx graph
        Interaction graph; it is sent to the workers together with the task function.
    seed_nodes : iterable
        Seeds to process. Materialised to a list up front because it is
        consumed twice (by ``pool.map`` and when pairing results with seeds).
    restart_prob, epsilon : float
        Forwarded to ``random_walk_with_restart``.
    n_jobs : int, default 4
        Number of worker processes.

    Returns
    -------
    dict
        seed node -> list of negative-sample nodes.
    """
    seed_nodes = list(seed_nodes)
    total_nodes = graph.number_of_nodes()
    personalized = {node: 1/total_nodes for node in graph.nodes()}
    # The helper copies `personalized` per seed; binding the dict directly to the
    # walker would let seeds handled by the same worker chunk contaminate each other.
    func = partial(
        _rwr_for_seed,
        graph=graph,
        personalized=personalized,
        restart_prob=restart_prob,
        epsilon=epsilon,
    )
    with Pool(n_jobs) as pool:
        probabilities = pool.map(func, seed_nodes)
    negative_samples = {}
    for seed_node, prob in zip(seed_nodes, probabilities):
        # Explicit 25th percentile (same value as the '25%' row of describe()).
        quantile_25 = pd.Series(list(prob.values())).quantile(0.25)
        negative_samples[seed_node] = select_negative_samples(prob, quantile_25)
    return negative_samples

In [18]:
# negative_samples = generate_negative_samples_rwr(cci_graph, all_drugs_cid)

In [19]:
# Run the pooled RWR sampler for every SIDER drug ID with 6 worker processes.
negative_samples = generate_negative_samples_rwr_parallel(cci_graph, all_drugs_cid, n_jobs=6)

In [20]:
### dump negative samples
# Persist the seed -> negative-sample-list mapping to disk as a pickle.
import pickle, os
dump_loc = '../../prep_data/stitch/negative_samples_rwr.pkl'
# Create the target directory if needed; exist_ok keeps re-runs from failing.
os.makedirs(os.path.dirname(dump_loc), exist_ok=True)
with open(dump_loc, 'wb') as f:
    pickle.dump(negative_samples, f)

In [90]:
# Number of negative samples selected for each seed drug.
len_ng = [len(v) for k, v in negative_samples.items()]

In [93]:
# Smallest, largest and mean number of negative samples per seed drug.
import numpy as np
min(len_ng), max(len_ng), np.mean(np.array(len_ng))

(323, 324, 323.999245852187)