In [None]:
import pandas as pd
import numpy as np
# Input locations, all relative to the notebook's working directory.

# Tables the cells below read through these names.
warhead_path = "warhead.csv"
interaction_path = "interactions.tsv"
ligand_path = "ligands.csv"
ncit_to_chebi_path = "NCIt-ChEBI_Mapping.txt"
drug_bank_path = "drugbank.txt"
drug_bank_full_path = "drugbank_full.csv"

# Declared here but not referenced by any cell visible in this notebook.
chembl_path = "chembl.txt"
rxcui_path = "rxcui.csv"

In [None]:
# Warhead table: comma-separated, column names come from the first line.
# Its 'PubChem' column is the join key used in the final merge.
warhead_df = pd.read_csv(
    warhead_path,
    sep=",",
    header=0,
)
warhead_df

In [None]:
# Ligand table: comma-separated; header=1 takes the column names from the
# second line of the file and discards the line above it.
# NOTE(review): `ligand_df` is rebound to a subset of `interaction_df` in the
# per-source split cell further down, so this frame is only displayed here.
ligand_df = pd.read_csv(
    ligand_path,
    sep=",",
    header=1,
)
ligand_df

In [None]:
# NCIt -> ChEBI mapping: tab-separated with no header row, so the columns are
# labelled with the integers 0, 1, ... (later cells refer to columns 0 and 1).
ncit_to_chebi_df = pd.read_csv(
    ncit_to_chebi_path,
    sep="\t",
    header=None,
)
ncit_to_chebi_df

In [None]:
# DrugBank text export: tab-separated, column names come from the first line.
drug_bank_df = pd.read_csv(
    drug_bank_path,
    sep="\t",
    header=0,
)
drug_bank_df

In [None]:
# Interaction table: tab-separated; header=2 takes the column names from the
# third line of the file and discards the two lines above it.
interaction_df = pd.read_csv(
    interaction_path,
    sep="\t",
    header=2,
)
interaction_df

In [None]:
# Break 'drug_concept_id' ("<source>:<id>") into its namespace and local id.
# n=1 splits on the FIRST colon only, so the result always has exactly two
# columns; an unbounded split would produce a third column (and make this
# two-column assignment raise) as soon as one id contains a colon itself.
interaction_df[['source', 'source_id']] = interaction_df['drug_concept_id'].str.split(':', n=1, expand=True)

In [None]:
# Display the interaction table, now carrying the 'source' / 'source_id'
# columns added by the split of 'drug_concept_id'.
interaction_df

In [None]:
# Number of interaction rows per identifier namespace, most frequent first.
info_before_counts = interaction_df.loc[:, 'source'].value_counts()
info_before_counts

In [None]:
def _select_source(frame, source):
    """Return ``(labels, rows)`` for the rows of ``frame`` whose 'source' equals ``source``.

    ``labels`` are index labels, so the rows are taken with label-based
    ``.loc``. Positional ``.iloc`` only agrees with labels on a default
    0..n-1 index and would silently pick the wrong rows on any other index.
    """
    labels = frame.index[(frame['source'] == source).to_numpy()]
    return labels, frame.loc[labels]


# One (index, subset) pair per identifier namespace. The *_ind indexes are
# reused by later cells to write PubChem IDs back into `interaction_df`.
chembl_ind, chembl_df = _select_source(interaction_df, 'chembl')
print('chembl: ',chembl_df.shape)

rxcui_ind, rxcui_df = _select_source(interaction_df, 'rxcui')
print('rxcui: ',rxcui_df.shape)

ncit_ind, ncit_df = _select_source(interaction_df, 'ncit')
print('ncit: ',ncit_df.shape)

drugbank_ind, drugbank_df = _select_source(interaction_df, 'drugbank')
print('drugbank: ',drugbank_df.shape)

# NOTE: this rebinds `ligand_df`; from here on it is the 'iuphar.ligand'
# subset of `interaction_df`, not the table read from `ligand_path`.
ligand_ind, ligand_df = _select_source(interaction_df, 'iuphar.ligand')
print('ligand: ',ligand_df.shape)

wikidata_ind, wikidata_df = _select_source(interaction_df, 'wikidata')
print('wikidata: ',wikidata_df.shape)

In [None]:
# Write the bare identifiers of four namespaces to CSV (one 'source_id' column,
# no index). Presumably these files are the input handed to the external
# PubChem converter whose output the next cell reads -- confirm.
for _ids, _target in (
    (chembl_df['source_id'], 'chembl_df.csv'),
    (wikidata_df['source_id'], 'wikidata_df.csv'),
    (drugbank_df['source_id'], 'drugbank_df.csv'),
    (ligand_df['source_id'], 'ligand_df.csv'),
):
    _ids.to_csv(_target, index=False)

In [None]:
# PubChem IDs produced by the PubChem auto converter, one tab-separated file
# per namespace. The files are matched to `interaction_df` purely by row
# position, so each must have exactly one row per row of its namespace, in
# the same order as the corresponding *_ind index.
def _read_positional_map(path, target_index):
    """Read a converter output file and re-index it onto ``target_index``.

    Raises ValueError with the file name when the row counts differ, because
    a positional re-index of a file of the wrong length cannot be correct.
    """
    mapping = pd.read_csv(path, header=0, sep='\t')
    if len(mapping) != len(target_index):
        raise ValueError(
            f"{path}: {len(mapping)} rows, but {len(target_index)} rows were expected"
        )
    return mapping.set_index(target_index)


# Read every file before touching `interaction_df`, so a missing or malformed
# file fails the cell without leaving 'Pubid' half-filled.
chembl_map = _read_positional_map('chembl_map.txt', chembl_ind)
drugbank_map = _read_positional_map('drugbank_map.txt', drugbank_ind)
ligand_map = _read_positional_map('ligand_map.txt', ligand_ind)
wikidata_map = _read_positional_map('wikidata_map.txt', wikidata_ind)

# The second column of each file has no header name of its own, so pandas
# labels it 'Unnamed: 1'; it is the value stored as the PubChem ID.
interaction_df.loc[chembl_ind, 'Pubid'] = chembl_map['Unnamed: 1']
interaction_df.loc[wikidata_ind, 'Pubid'] = wikidata_map['Unnamed: 1']
interaction_df.loc[drugbank_ind, 'Pubid'] = drugbank_map['Unnamed: 1']
interaction_df.loc[ligand_ind, 'Pubid'] = ligand_map['Unnamed: 1']

# Number of interaction rows that now carry a PubChem ID.
interaction_df['Pubid'].notna().sum()

In [None]:
# NCIt rows of `interaction_df` in its current state (including 'Pubid').
# The labels are index labels, so rows are taken with label-based `.loc`;
# positional `.iloc` is only equivalent on a default 0..n-1 index.
chebi_ind = interaction_df.index[(interaction_df['source'] == 'ncit').to_numpy()]
chebi_df = interaction_df.loc[chebi_ind]

In [None]:
# NCIt code -> ChEBI ID -> PubChem ID, written back onto the NCIt rows.

# Keep one mapping row per NCIt code (column 0) so the left merge below
# returns exactly one row per row of `chebi_df`.
ncit_to_chebi_df = ncit_to_chebi_df.drop_duplicates(subset=ncit_to_chebi_df.columns[0])
ncit_chebi = pd.merge(chebi_df, ncit_to_chebi_df, left_on='source_id', right_on=ncit_to_chebi_df.columns[0], how='left')

# Column 1 of the mapping is split on ':' into a prefix ('2') and the part
# after it ('3') -- presumably "CHEBI:<number>"; confirm against the file.
ncit_chebi[['2', '3']] = ncit_chebi[1].str.split(':', expand=True)

# Rows without a ChEBI mapping get the sentinel -2, which cannot collide with
# the -1 sentinel used on the lookup side below.
ncit_chebi['3'] = ncit_chebi['3'].fillna(-2)

# Read the mapping file
mapping = 'drug_links.csv'
db_map_df = pd.read_csv(mapping, sep=',', header=0)

# Keep only the two ID columns; missing values become -1 so both columns can
# be cast to int for the join.
chebi_map = db_map_df[['PubChem Compound ID', 'ChEBI ID']]
chebi_map = chebi_map.fillna(-1)
# One lookup row per ChEBI ID: a repeated ChEBI ID would make the left merge
# return extra rows, and the positional set_index below would then fail on a
# length mismatch. The index is reset because the join key is passed as an
# array and matched to the frame by position.
chebi_map = chebi_map.drop_duplicates(subset='ChEBI ID', keep='first').reset_index(drop=True)

# Join on the integer ChEBI ID.
chebi_pubid_re = pd.merge(ncit_chebi, chebi_map, left_on=ncit_chebi["3"].astype(int), right_on=chebi_map['ChEBI ID'].astype(int), how='left')
# The merge result is in the same row order as `chebi_df`, so it is re-keyed
# with the same index that is used for the write-back.
chebi_pubid_re = chebi_pubid_re.set_index(chebi_ind)

interaction_df.loc[chebi_ind, 'Pubid'] = chebi_pubid_re['PubChem Compound ID']
interaction_df['Pubid'].notna().sum()

In [None]:
# Second ChEBI -> PubChem lookup for the NCIt rows, this time through the full
# DrugBank table; only rows whose 'Pubid' is still NaN are filled.
db_full_df = pd.read_csv(drug_bank_full_path, header = 0, sep = ',')

# One lookup row per ChEBI ID. Repeated ChEBI IDs would make the left merge
# return extra rows and break the positional set_index below; rows without a
# ChEBI ID can never match the integer left key, so they are dropped as well.
chebi_lookup = db_full_df.dropna(subset=['ChEBI ID']).drop_duplicates(subset=['ChEBI ID'], keep='first')
chebi_map2 = pd.merge(ncit_chebi, chebi_lookup, left_on = ncit_chebi["3"].astype(int), right_on = 'ChEBI ID', how = 'left')
chebi_map2 = chebi_map2.set_index(ncit_ind)

mask0 = interaction_df.loc[ncit_ind, 'Pubid'].isna()  # True where no PubChem ID has been found yet
interaction_df.loc[ncit_ind[mask0.to_numpy()], 'Pubid'] = chebi_map2.loc[mask0, 'PubChem Compound ID']
# Rows with a real PubChem ID (-1 is the "ChEBI matched, no PubChem ID" sentinel).
(interaction_df['Pubid'].dropna() != -1).sum()

In [None]:
# Direct id -> PubChem lookups for the rxcui and ncit namespaces.
# Both files are header-less: column 0 is the source id, column 1 the PubChem ID.
# NOTE(review): no cell in this notebook writes "rxcui_df.csv" / "ncit_df.csv";
# they are presumably prepared outside the notebook -- confirm.
# One row per source id, so the left merges below cannot return extra rows
# (which would break the positional set_index).
rxcui_df2 = pd.read_csv("rxcui_df.csv", sep = ',', header = None).drop_duplicates(subset=[0]).reset_index(drop=True)
ncit_df2 = pd.read_csv("ncit_df.csv", sep = ',', header = None).drop_duplicates(subset=[0]).reset_index(drop=True)

rxcui_map = pd.merge(rxcui_df, rxcui_df2, left_on=rxcui_df['source_id'].astype(int), right_on=rxcui_df2[0].astype(int), how='left')
ncit_map = pd.merge(ncit_df, ncit_df2, left_on='source_id', right_on=0, how='left')

ncit_map = ncit_map.set_index(ncit_ind)
rxcui_map = rxcui_map.set_index(rxcui_ind)

# A direct hit replaces whatever is already stored, but a miss (NaN) must not
# erase a PubChem ID found by an earlier cell -- a plain assignment would
# overwrite the ChEBI-derived ncit results with NaN.
interaction_df.loc[rxcui_ind, 'Pubid'] = rxcui_map[1].combine_first(interaction_df.loc[rxcui_ind, 'Pubid'])
interaction_df.loc[ncit_ind, 'Pubid'] = ncit_map[1].combine_first(interaction_df.loc[ncit_ind, 'Pubid'])
(interaction_df['Pubid'].dropna() != -1).sum()

In [None]:
# DrugBank ID -> PubChem ID through the full DrugBank table.
db_full_df = pd.read_csv(drug_bank_full_path, header = 0, sep = ',')
# One lookup row per DrugBank ID so the left merge returns exactly one row per
# row of `drugbank_df` (required by the positional set_index below).
drugbank_lookup = db_full_df.drop_duplicates(subset=['DrugBank ID'], keep='first')
drugbank_map2 = pd.merge(drugbank_df, drugbank_lookup, left_on = 'source_id', right_on = 'DrugBank ID', how = 'left')
drugbank_map2 = drugbank_map2.set_index(drugbank_ind)
# A value from the DrugBank table replaces the stored one, but a missing value
# must not erase the ID already obtained from the PubChem converter.
interaction_df.loc[drugbank_ind, 'Pubid'] = drugbank_map2['PubChem Compound ID'].combine_first(interaction_df.loc[drugbank_ind, 'Pubid'])
(interaction_df['Pubid'].dropna() != -1).sum()

In [None]:
# Last-resort fill from the drug-mappings table (columns used here:
# pubchem_cid, drugbankId, chembl_id, chebi_id). Only rows whose 'Pubid' is
# still NaN are touched.
mapping_full_df = pd.read_csv('drug-mappings.tsv', header=0, sep = '\t')


def _unique_lookup(table, key):
    """Rows of ``table`` with a non-missing ``key``, one row per distinct key.

    Each join gets its own lookup built from the full table. De-duplicating
    the shared table on one key first would also collapse every row where
    that key is missing, discarding rows the other joins need.
    """
    return table.dropna(subset=[key]).drop_duplicates(subset=[key], keep='first')


def _fill_missing_pubid(labels, merged):
    """Copy ``merged['pubchem_cid']`` into the rows at ``labels`` whose 'Pubid' is NaN.

    ``merged`` must be indexed by ``labels``; rows that already have a
    PubChem ID are left untouched.
    """
    still_missing = interaction_df.loc[labels, 'Pubid'].isna()
    interaction_df.loc[labels[still_missing.to_numpy()], 'Pubid'] = merged.loc[still_missing, 'pubchem_cid']


# -1 is the "matched but no PubChem ID" sentinel from the ChEBI step; turn it
# back into a missing value so those rows are eligible for the fills below.
interaction_df.loc[interaction_df['Pubid'] == -1, 'Pubid'] = None

db_map3 = pd.merge(drugbank_df, _unique_lookup(mapping_full_df, 'drugbankId'), left_on = 'source_id', right_on = 'drugbankId', how = 'left')
db_map3 = db_map3.set_index(drugbank_ind)
_fill_missing_pubid(drugbank_ind, db_map3)

chembl_map2 = pd.merge(chembl_df, _unique_lookup(mapping_full_df, 'chembl_id'), left_on = 'source_id', right_on = 'chembl_id', how = 'left')
chembl_map2 = chembl_map2.set_index(chembl_ind)
_fill_missing_pubid(chembl_ind, chembl_map2)

# Compare ChEBI IDs as numbers on both sides: ncit_chebi['3'] holds strings
# (plus the -2 sentinel), which never equal a numeric chebi_id. A leading
# "CHEBI:" prefix is tolerated in case the file stores the IDs in that form.
chebi_numeric = pd.to_numeric(
    mapping_full_df['chebi_id'].astype(str).str.replace('CHEBI:', '', regex=False),
    errors='coerce',
)
chebi_lookup3 = _unique_lookup(mapping_full_df.assign(chebi_id=chebi_numeric), 'chebi_id')
chebi_map3 = pd.merge(ncit_chebi, chebi_lookup3, left_on = pd.to_numeric(ncit_chebi["3"], errors='coerce'), right_on = 'chebi_id', how = 'left')
chebi_map3 = chebi_map3.set_index(ncit_ind)
_fill_missing_pubid(ncit_ind, chebi_map3)

(interaction_df['Pubid'].dropna() != -1).sum()

In [None]:
# Export the wikidata identifiers (one 'source_id' column, no index).
wikidata_df['source_id'].to_csv('wiki.csv', index=False)

# Manual edit: clear the PubChem ID recorded for wikidata item Q6816906.
# The id is compared with any trailing comma stripped, so the row is found
# whether it is stored as 'Q6816906' or as 'Q6816906,' (the literal
# previously used here, which only matches an id that really ends in a comma).
interaction_df.loc[interaction_df['source_id'].str.rstrip(',') == 'Q6816906', 'Pubid'] = None

In [None]:
# Inspect 'Pubid' for the 'chemidplus' namespace. No cell visible in this
# notebook assigns PubChem IDs for this source, so this is presumably a check
# of what remains unmapped -- confirm.
interaction_df.loc[interaction_df['source'] == 'chemidplus', "Pubid"]

In [None]:
# Inner-join warheads to interactions on the PubChem ID. Missing IDs are
# replaced by two different sentinels (-4 on the left, -3 on the right) so
# that rows without an ID can never match each other.
merged_df = pd.merge(warhead_df, interaction_df, how='inner', left_on= warhead_df['PubChem'].fillna(-4), right_on=interaction_df['Pubid'].fillna(-3))
# Keep the first row per PubChem ID. The result is assigned back because
# drop_duplicates returns a new frame; without the assignment the duplicates
# would still be present in `merged_df` when it is written to disk.
merged_df = merged_df.drop_duplicates(subset = ['Pubid'], keep='first')
merged_df

In [None]:
# Persist the joined table. No index argument is given, so the DataFrame index
# is written as the first, unnamed column of result.csv.
merged_df.to_csv('result.csv')