# Analyzing the SIDER 4.1 data

+ http://sideeffects.embl.de/
+ http://thinklab.com/d/30#4

In [20]:
import csv
import gzip
import collections

In [21]:
import pandas as pd

## STITCH to DrugBank mapping utilities

In [22]:
def stitch_flat_to_pubchem(cid):
    """Convert a STITCH flat compound identifier to an integer PubChem CID.

    The identifier is expected to look like ``'CID100000085'``: the literal
    prefix ``CID`` followed by digits. The numeric part carries an offset of
    100,000,000, which is removed here.

    Parameters
    ----------
    cid : str
        STITCH flat identifier starting with ``'CID'``.

    Returns
    -------
    int
        The numeric part of ``cid`` minus 100,000,000.

    Raises
    ------
    AssertionError
        If ``cid`` does not start with ``'CID'``.
    ValueError
        If the text after the prefix is not a valid integer.
    """
    assert cid.startswith('CID')
    # Subtract an integer offset (not the float literal 1e8) so the result
    # stays an int, matching stitch_stereo_to_pubchem and keeping merge keys
    # integer-typed.
    return int(cid[3:]) - 100000000

def stitch_stereo_to_pubchem(cid):
    """Convert a STITCH stereo compound identifier to an integer PubChem CID.

    The identifier must start with the literal prefix ``CID``; the remaining
    characters are parsed as a base-10 integer (leading zeros are dropped),
    e.g. ``'CID000010917'`` -> ``10917``.

    Raises ``AssertionError`` when the prefix is missing and ``ValueError``
    when the remainder is not a valid integer.
    """
    prefix = 'CID'
    assert cid.startswith(prefix)
    digits = cid[len(prefix):]
    return int(digits)

In [23]:
# DrugBank compound table (pinned to a fixed commit of dhimmel/drugbank):
# keep only the identifier and the name, relabelling the latter so it does
# not clash with other "name" columns after merging.
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'
drugbank_df = pd.read_table(url)
drugbank_df = drugbank_df[['drugbank_id', 'name']]
drugbank_df = drugbank_df.rename(columns={'name': 'drugbank_name'})

# PubChem compound ID -> DrugBank ID mapping from the same pinned commit
url = 'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/pubchem.tsv'
drugbank_map_df = pd.read_table(url)

In [24]:
# Display the mapping table loaded from mapping/pubchem.tsv
drugbank_map_df

Unnamed: 0,drugbank_id,pubchem_id
0,DB00014,11980055
1,DB00014,11981235
2,DB00014,11982741
3,DB00014,16052011
4,DB00014,23581804
...,...,...
204704,DB09028,74070157
204705,DB09028,74834862
204706,DB09028,77513518
204707,DB09028,87355970


## meddra_freq.tsv.gz

In [25]:
# Load the SIDER frequency table. The file is read without a header row, so
# the column labels are supplied directly to the reader.
freq_df = pd.read_table(
    "../Data/Original/meddra_freq.tsv",
    header=None,
    names=[
        'stitch_id_flat',
        'stitch_id_sterio',
        'umls_cui_from_label',
        'placebo',
        'frequency',
        'lower',
        'upper',
        'meddra_type',
        'umls_cui_from_meddra',
        'side_effect_name',
    ],
)
freq_df

Unnamed: 0,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,placebo,frequency,lower,upper,meddra_type,umls_cui_from_meddra,side_effect_name
0,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,LLT,C0000737,Abdominal pain
1,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,,21%,0.21,0.21,PT,C0687713,Gastrointestinal pain
3,CID100000085,CID000010917,C0000737,,5%,0.05,0.05,LLT,C0000737,Abdominal pain
4,CID100000085,CID000010917,C0000737,,5%,0.05,0.05,PT,C0000737,Abdominal pain
...,...,...,...,...,...,...,...,...,...,...
291627,CID171306834,CID071306834,C2830004,,5%,0.00,0.05,PT,C2830004,Somnolence
291628,CID171306834,CID071306834,C2830004,,5%,0.05,0.05,LLT,C2830004,Somnolence
291629,CID171306834,CID071306834,C2830004,,5%,0.05,0.05,PT,C2830004,Somnolence
291630,CID171306834,CID071306834,C2830004,,9%,0.09,0.09,LLT,C2830004,Somnolence


## meddra_all_se.tsv.gz

In [35]:
# Load the SIDER side effect table. The file is read without a header row, so
# the column labels are supplied directly to the reader.
se_df = pd.read_table(
    "../Data/Original/meddra_all_se.tsv",
    header=None,
    names=[
        'stitch_id_flat',
        'stitch_id_sterio',
        'umls_cui_from_label',
        'meddra_type',
        'umls_cui_from_meddra',
        'side_effect_name',
    ],
)

# Derive the integer PubChem CID from the stereo STITCH identifier
se_df['pubchem_id'] = se_df['stitch_id_sterio'].map(stitch_stereo_to_pubchem)

# Inner join on the shared column(s) (pubchem_id), which keeps only the
# compounds that have a DrugBank mapping
se_df = drugbank_map_df.merge(se_df)
se_df

Unnamed: 0,drugbank_id,pubchem_id,stitch_id_flat,stitch_id_sterio,umls_cui_from_label,meddra_type,umls_cui_from_meddra,side_effect_name
0,DB00014,47725,CID100047725,CID000047725,C0000737,LLT,C0000737,Abdominal pain
1,DB00014,47725,CID100047725,CID000047725,C0000737,PT,C0687713,Gastrointestinal pain
2,DB00014,47725,CID100047725,CID000047725,C0000737,PT,C0000737,Abdominal pain
3,DB00014,47725,CID100047725,CID000047725,C0002170,LLT,C0002170,Alopecia
4,DB00014,47725,CID100047725,CID000047725,C0002170,PT,C0002170,Alopecia
...,...,...,...,...,...,...,...,...
301158,DB09020,2391,CID100002391,CID000002391,C0948733,PT,C0159066,Abdominal rigidity
301159,DB09020,2391,CID100002391,CID000002391,C1321898,LLT,C1321898,Blood in stool
301160,DB09020,2391,CID100002391,CID000002391,C1321898,PT,C0018932,Haematochezia
301161,DB09020,2391,CID100002391,CID000002391,C2242737,LLT,C2242737,Anorectal discomfort


In [36]:
# Reduce to the columns of interest, drop incomplete rows, and keep a single
# row per (drug, side effect concept) pair.
se_df = (
    se_df[['drugbank_id', 'pubchem_id', 'umls_cui_from_meddra', 'side_effect_name']]
    .dropna()
    .drop_duplicates(['drugbank_id', 'umls_cui_from_meddra'])
)

# Attach DrugBank names (inner join on the shared drugbank_id column) and
# order the rows for a stable, readable export.
se_df = drugbank_df.merge(se_df).sort_values(['drugbank_name', 'side_effect_name'])

print(len(se_df))
se_df.to_csv("../Data/sider_data.csv",index=False)

153663


In [37]:
# Display the current se_df frame
se_df

Unnamed: 0,drugbank_id,drugbank_name,pubchem_id,umls_cui_from_meddra,side_effect_name
146269,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",6013,C0000729,Abdominal cramps
146270,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",6013,C0000737,Abdominal pain
146524,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",6013,C0232492,Abdominal pain upper
146622,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",6013,C0740651,Abdominal symptom
146645,DB07768,"(10ALPHA,13ALPHA,14BETA,17ALPHA)-17-HYDROXYAND...",6013,C0877331,Abnormal clotting factor
...,...,...,...,...,...
138639,DB05738,vapitadine dihydrochloride,312,C0015230,Rash
138654,DB05738,vapitadine dihydrochloride,312,C0234233,Tenderness
138650,DB05738,vapitadine dihydrochloride,312,C0041582,Ulcer
138651,DB05738,vapitadine dihydrochloride,312,C0042487,Venous thrombosis
