# Collect Driver Genes from Various Sources

Collect the various driver genes from the various sources listed in the README.md, placing them into
the predifined json-schema outlined in "driversValidationSchemaLight.json". Start
at the top of the list in the README, and just collect...

When collecting, try to stay with a particular column naming

"gene_symbol","driver_type","pmid","source_name"

In [1]:
import pandas as pd
import os
import datetime
import numpy as np

In [2]:
import lxml.etree as ET
import codecs
import subprocess
from rdkit import Chem
import gzip
import requests
from functools import reduce

In [3]:
def combine_cols(series, colname):
    ret_val = np.nan
    for col in colname:
        if not pd.isnull(series[col]):
            ret_val = series[col]
            break
    return ret_val

In [4]:
#set data path with your own
data_path = "../../clinicalreporting/drivers/Data/"

In [146]:
all_genes = pd.read_pickle(data_path + "all_genes.pkl")

In [116]:
genes2uniprot = pd.read_pickle(data_path + "genes2uniprot.pkl")

In [147]:
all_genes['3236']

{'edges': {},
 'meta_information': {'alias_symbol': 'ERBB1',
  'date_approved_reserved': '1986-01-01',
  'date_modified': '2017-03-24',
  'date_symbol_changed': nan,
  'driver_information': [{'Core pathway': nan,
    'Process': nan,
    'driver_type': 'Unknown',
    'pmid': '14993899',
    'source_name': 'Cosmic'},
   {'Core pathway': 'PI3K; RAS',
    'Process': 'Cell Survival',
    'driver_type': 'Oncogene',
    'pmid': '23539594',
    'source_name': 'Vogelstein'},
   {'Core pathway': nan,
    'Process': nan,
    'driver_type': 'Oncogene',
    'pmid': '14681372',
    'source_name': 'Uniprot'},
   {'Core pathway': nan,
    'Process': nan,
    'driver_type': 'Oncogene',
    'pmid': '25759023',
    'source_name': 'Rubio-Perez'}],
  'ensembl_gene_id': 'ENSG00000146648',
  'entrez_id': 1956.0,
  'gene_family': 'Erb-b2 receptor tyrosine kinases',
  'gene_symbol': 'EGFR',
  'hgnc_id': 3236,
  'location': '7p11.2',
  'locus_group': 'protein-coding gene',
  'locus_type': 'gene with protein pro

# Target stuff

In [27]:
drug_data_containers = {}

## DrugBank

In [28]:
DRUGBANK_DOWNLOAD = "http://www.drugbank.ca/releases/latest/downloads/all-full-database"
DRUGBANK_STRUCTURES_DOWNLOAD = "http://www.drugbank.ca/releases/latest/downloads/all-structures"
DRUGBANK_XREFS_DOWNLAOD = "http://www.drugbank.ca/releases/latest/downloads/all-drug-links"
DRUGBANK_USER = "c.schaerfe@gmail.com"
DRUGBANK_PASSWORD = "themirrorcracked"

In [29]:
DRUGBANK_FILE = os.path.join(data_path, "drugbank.xml.zip")
DRUGBANK_SDF_FILE = os.path.join(data_path, "drugbank_molecules.sdf.zip")
DRUGBANK_XREF_FILE = os.path.join(data_path, "drugbank_xrefs.csv.zip")

In [30]:
DRUGBANK_FILE

'../../clinicalreporting/drivers/Data/drugbank.xml.zip'

In [31]:
i = 1
exit_code_xml, exit_code_sdf, exit_code_xref = 1, 1, 1
while exit_code_xml + exit_code_sdf + exit_code_xref > 0 and i < 11:
    print("trying download now for round number {}".format(i))
    
    if exit_code_xml > 0:
        exit_code_xml = subprocess.call(['curl', '-L', '-H', 
                                         '"Accept: application/xml"', 
                                         '-o', DRUGBANK_FILE,
                                         '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
                                         DRUGBANK_DOWNLOAD])
 
        print("xml", exit_code_xml)
    if exit_code_sdf > 0:
        exit_code_sdf = subprocess.call(['curl', '-L', 
                                         '-o', DRUGBANK_SDF_FILE,
                                         '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
                                         DRUGBANK_STRUCTURES_DOWNLOAD])
        print("sdf", exit_code_sdf)
    if exit_code_xref > 0:
        exit_code_xref= subprocess.call(['curl', '-L',
                                         '-o', DRUGBANK_XREF_FILE,
                                         '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
                                         DRUGBANK_XREFS_DOWNLAOD])
 
        print("xref", exit_code_xref)
    i += 1
    

trying download now for round number 1
('xml', 0)
('sdf', 0)
('xref', 0)


In [33]:
if exit_code_xml == 0:
    exit_code_xml = subprocess.call("unzip -p " + DRUGBANK_FILE +" \*.xml | cat > " + DRUGBANK_FILE.replace(".zip", ""), shell=True)
if exit_code_sdf == 0:
    exit_code_sdf = subprocess.call("unzip -p " + DRUGBANK_SDF_FILE +" \*.sdf | cat > " + DRUGBANK_SDF_FILE.replace(".zip", ""), shell=True)
if exit_code_xref == 0:
    exit_code_xref = subprocess.call("unzip -p " + DRUGBANK_XREF_FILE +" \*.csv | cat > " + DRUGBANK_XREF_FILE.replace(".zip", ""), shell=True)

In [34]:
print(exit_code_sdf)

0


In [35]:
# i = 0
# exit_code_xml, exit_code_sdf, exit_code_xref = 1, 1, 1
# while exit_code_sdf  > 0 or i < 10:
#     if exit_code_sdf > 0:
#         exit_code_sdf = subprocess.call(['curl', '-L', 
#                                          '-o', DRUGBANK_SDF_FILE,
#                                          '-u', DRUGBANK_USER + ':' + DRUGBANK_PASSWORD,
#                                          DRUGBANK_STRUCTURES_DOWNLOAD])
#         if exit_code_sdf == 0:
#             exit_code_sdf = subprocess.call("unzip -p " + DRUGBANK_SDF_FILE +" \*.sdf | cat > " + DRUGBANK_SDF_FILE.replace(".zip", ""), shell=True)
#         print("sdf", exit_code_sdf)
#     i += 1
#     print("trying now for round number {}".format(i))

In [36]:
print( exit_code_xml, exit_code_sdf, exit_code_xref )

(0, 0, 0)


Reading the drug molecules

In [31]:
DRUGBANK_FILE = DRUGBANK_FILE.replace(".zip", "")
DRUGBANK_SDF_FILE = DRUGBANK_SDF_FILE.replace(".zip", "")
DRUGBANK_XREF_FILE = DRUGBANK_XREF_FILE.replace(".zip", "")

In [32]:
DRUGBANK_SDF_FILE

'../../clinicalreporting/drivers/Data/drugbank_molecules.sdf'

In [33]:
drug_smiles = {}
structures = Chem.SDMolSupplier(DRUGBANK_SDF_FILE)
for s in structures:
    if s is not None:
        db_id = s.GetPropsAsDict().get("DRUGBANK_ID")
        smi = Chem.MolToSmiles(s)
        drug_smiles[db_id] = smi
drugbank2smiles = pd.DataFrame(list(drug_smiles.items()), columns=["drugbank_id", "SMILES"])

In [34]:
drugbank2smiles

Unnamed: 0,drugbank_id,SMILES
0,DB12781,Cn1c(COc2ccc(CC3SC(=O)NC3=O)cc2)nc2ccccc2c1=O
1,DB12780,O=C(Oc1ccc2c(c1)CCCN2C(=O)C(Cl)Cl)c1ccco1
2,DB12783,NC(CO)C(O)=NNCc1ccc(O)c(O)c1O
3,DB12782,OC1(c2ncccn2)CC2CCC(C1)N2C(c1ccccc1Cl)c1ccccc1Cl
4,DB12785,CC(C)Oc1ccc(-c2nc(OC3CC4C(=O)NC5(C(=O)NS(=O)(=...
5,DB12784,CC(Nc1cc(-c2cnn(C)c2)cc(Nc2cnccn2)n1)c1ccc(F)cc1
6,DB12039,Oc1cc(O)c2c(c1)OC(c1ccc(O)c(O)c1)C(O)C2
7,DB12037,CCC1CC1(NC(=O)C1CC(Oc2cc(-c3csc(NC(C)C)n3)nc3c...
8,DB12036,C=CC(=O)N1CCC(Oc2nc(Nc3ccc(N4CCC(N5CCN(C)CC5)C...
9,DB12035,CON(C)Cc1ccc(O)c2c1CC1CC3C(N(C)C)C(O)=C(C(N)=O...


In [35]:
def extract_gene_info(target, drugbank_id):
    target_id = target.find('{http://www.drugbank.ca}id').text
    target_name = target.find('{http://www.drugbank.ca}name').text
    target_organism = target.find('{http://www.drugbank.ca}organism').text

    target_action = ""
    if target.find('{http://www.drugbank.ca}actions') is not None:
        target_action = "|".join([act.text for act in
                                  target.find('{http://www.drugbank.ca}actions').
                                  findall('{http://www.drugbank.ca}action')])
    target_known_action = target.find('{http://www.drugbank.ca}known-action').text

    gene_symbol = ""
    hgnc_id = ""
    if target.find('{http://www.drugbank.ca}polypeptide') is not None:
        gene_symbol = target.find('{http://www.drugbank.ca}polypeptide').\
            find('{http://www.drugbank.ca}gene-name').text
        if target.find('{http://www.drugbank.ca}polypeptide').\
                find('{http://www.drugbank.ca}external-identifiers') is not None:
            for external in target.find('{http://www.drugbank.ca}polypeptide').\
                    find('{http://www.drugbank.ca}external-identifiers').\
                    findall('{http://www.drugbank.ca}external-identifier'):
                if external.find('{http://www.drugbank.ca}resource').text ==\
                        "HUGO Gene Nomenclature Committee (HGNC)":
                    hgnc_id = external.find('{http://www.drugbank.ca}identifier').text

    refs = []
    try:
        if target.find('{http://www.drugbank.ca}references') is not None:
            if target.find('{http://www.drugbank.ca}references').\
                    find('{http://www.drugbank.ca}articles') is not None:
                for article in target.find('{http://www.drugbank.ca}references').\
                        find('{http://www.drugbank.ca}articles').\
                        findall('{http://www.drugbank.ca}article'):
                    pmid = article.find('{http://www.drugbank.ca}pubmed-id').text
                    if pmid is not None:
                        refs.append(pmid)
        refs = "|".join(refs)
    except:
        print drugbank_id
        print refs, len(refs)

    row = [drugbank_id, target_id, target_name, target_organism,
           target_action, target_known_action, gene_symbol, hgnc_id, refs]
    row = [r or "" for r in row]
    return row

In [36]:
DRUGBANK_FILE

'../../clinicalreporting/drivers/Data/drugbank.xml'

In [37]:
tree = ET.parse(DRUGBANK_FILE)
root = tree.getroot()
version = root.attrib.get('version')

drug_synonyms = []
drug_atc = {}
cancer_drugs = {}
lists = []
for record in root.iterfind('{http://www.drugbank.ca}drug'):
    if len(record.find('{http://www.drugbank.ca}drugbank-id')) == 1:
        drugbank_id = record.find('{http://www.drugbank.ca}drugbank-id').text
    else:
        drugbank_id = [x.text for x in record.findall('{http://www.drugbank.ca}drugbank-id')
                       if x.attrib.get('primary', 'false') == 'true'][0]
    
    create_date = record.attrib.get('created')
    update_date = record.attrib.get('updated')
    drug_type = record.attrib.get('type')

    drug_name = record.find('{http://www.drugbank.ca}name').text.lower()
    
    drug_synonyms += [(drugbank_id, syn.text.encode('utf-8').lower()) for syn in record.\
                       find('{http://www.drugbank.ca}synonyms').\
                       findall('{http://www.drugbank.ca}synonym')]
    
    groups = "|".join([group.text.encode('utf-8') for group in record.\
                           find('{http://www.drugbank.ca}groups').\
                           findall('{http://www.drugbank.ca}group')])
    
    atc_code_list = []
    is_cancer_drug = 0
    if record.find('{http://www.drugbank.ca}atc-codes') is not None:
        atc_code_list = []
        for atc_code in record.find('{http://www.drugbank.ca}atc-codes').\
                findall('{http://www.drugbank.ca}atc-code'):
            atc_code_list.append(atc_code.attrib.get('code'))
            if atc_code.attrib.get('code').startswith('L01'):
                is_cancer_drug = 1
                cancer_drugs[drugbank_id] = True
            else:
                cancer_drugs[drugbank_id] = False
        drug_atc[drugbank_id] = "|".join(atc_code_list) 
    
    drugbank_data = [drug_name, create_date, update_date, drug_type, is_cancer_drug, groups]
    # extract targets
    if record.find('{http://www.drugbank.ca}targets') is not None:
        for target in record.find('{http://www.drugbank.ca}targets').\
                findall('{http://www.drugbank.ca}target'):
            target_info = extract_gene_info(target, drugbank_id)
            row = target_info + ['target', version]
            lists.append(row + drugbank_data)
    # extract enzymes
    if record.find('{http://www.drugbank.ca}enzymes') is not None:
        for enzyme in record.find('{http://www.drugbank.ca}enzymes').\
                findall('{http://www.drugbank.ca}enzyme'):
            row = extract_gene_info(enzyme, drugbank_id) + ['enzyme', version]
            lists.append(row + drugbank_data)
    # extract carriers
    if record.find('{http://www.drugbank.ca}carriers') is not None:
        for carrier in record.find('{http://www.drugbank.ca}carriers').\
                findall('{http://www.drugbank.ca}carrier'):
            row = extract_gene_info(carrier, drugbank_id) + ['carrier', version]
            lists.append(row + drugbank_data)
    # extract transporters
    if record.find('{http://www.drugbank.ca}transporters') is not None:
        for transporter in record.find('{http://www.drugbank.ca}transporters').\
                findall('{http://www.drugbank.ca}transporter'):
            row = extract_gene_info(transporter, drugbank_id) + ['transporter', version]
            lists.append(row + drugbank_data)

cols = ['drugbank_id', 'target_id', 'target_name', 'target_organism',
        'target_action', 'target_known_action', 'gene_symbol',
        'hgnc_id', 'pmid', 'interaction_type', 'version', 'drug_name',
        'create_date', 'update_date', 'drug_type', 'is_cancer_drug', 'approval_status']
drug2gene_drugbank = pd.DataFrame(lists, columns=cols)

In [38]:
print(str(drug2gene_drugbank.ix[0].drug_name))

lepirudin


In [39]:
drug2gene_drugbank = drug2gene_drugbank.loc[drug2gene_drugbank.target_organism == 'Human']

In [40]:
drug2gene_drugbank["hgnc_id"] = pd.to_numeric(drug2gene_drugbank["hgnc_id"].str.replace("HGNC:", ""))

In [41]:
db_name = "DrugBank"
drug2gene_drugbank = drug2gene_drugbank.assign(source_name=db_name, download_date=datetime.date.today().strftime("%Y-%m-%d"))

In [42]:
drug_data_containers[db_name] = drug2gene_drugbank

In [43]:
drugbank_xrefs = pd.read_csv(DRUGBANK_XREF_FILE)
drugbank_xrefs = drugbank_xrefs[["DrugBank ID", "Name", "PubChem Compound ID", "PharmGKB ID", "TTD ID"]]
drugbank_xrefs["name"] = drugbank_xrefs["Name"].str.lower()

In [44]:
drugbank_synonyms = pd.DataFrame(drug_synonyms, columns=["DrugBank ID", "name"])
drugbank_synonyms = pd.concat([drugbank_xrefs[["DrugBank ID", "name"]], drugbank_synonyms])

In [46]:
drug2gene_drugbank.loc[drug2gene_drugbank.drug_name.isnull()]

Unnamed: 0,drugbank_id,target_id,target_name,target_organism,target_action,target_known_action,gene_symbol,hgnc_id,pmid,interaction_type,version,drug_name,create_date,update_date,drug_type,is_cancer_drug,approval_status,download_date,source_name


## TTD

In [47]:
TTD_TARGETS_DOWNLOAD = "http://database.idrb.cqu.edu.cn/TTD/download/TTD_download.txt"
TTD_DRUGS_XREFS_DOWNLOAD = "http://database.idrb.cqu.edu.cn/TTD/download/TTD_crossmatching.txt"
TTD_XREFS_DOWNLOAD = "http://bidd.nus.edu.sg/BIDD-Databases/TTD/download/TTD_uniprot_all.txt"
TTD_DRUGS_DOWNLOAD = "http://database.idrb.cqu.edu.cn/TTD/download/All.sdf"
TTD_SDF_FILE = data_path + "TTD_molecules.sdf"

Read TTD molecules

In [49]:
exit_code_ttd_sdf = 1
i = 0
while exit_code_ttd_sdf > 0 and i < 10:
    exit_code_ttd_sdf = subprocess.call("wget -O " + TTD_SDF_FILE + " " +  TTD_DRUGS_DOWNLOAD, shell=True)
    print(exit_code_ttd_sdf)
    i += 1

0


In [48]:
ttd2cid = {}
ttd_smiles = {}
structures = Chem.SDMolSupplier(TTD_SDF_FILE)
for s in structures:
    if s is not None:
        db_id  = s.GetProp("_Name")
        db_cid = s.GetPropsAsDict().get("PUBCHEM_COMPOUND_CID")
        ttd_smiles[db_id] = Chem.MolToSmiles(s)
        if db_cid is not None:
            ttd2cid[db_id] = db_cid
ttd2smiles = pd.DataFrame(list(ttd_smiles.items()), columns=["ttd_drug_id", 'SMILES'])

In [49]:
ttd_drugs = pd.read_csv(TTD_DRUGS_XREFS_DOWNLOAD, skiprows=12, sep="\t",
                        names=["ttd_drug_id", "ign", "interaction_type", "drug_name"])
ttd_drugs = ttd_drugs.loc[ttd_drugs.interaction_type == 'DrugName'][["ttd_drug_id", "drug_name"]]
ttd2uniprot = pd.read_table(TTD_XREFS_DOWNLOAD, skiprows=12)\
                .rename(columns={"TTD Target ID": "target_id", "Uniprot ID": "uniprot_id"})

ttd_targets = pd.read_csv(TTD_TARGETS_DOWNLOAD, skiprows=12, sep="\t",
                          names=["target_id", "edge_type", "target"])\
    .rename(columns={'ttd_id': 'target_id'})\
    .drop_duplicates()

ttd2drug = ttd_targets.loc[ttd_targets.edge_type == "Drug(s)"]
ttd2drug.rename(columns={"target": "drug_name"}, inplace=True)
ttd2drug = ttd2drug.merge(ttd_drugs)
ttd2drug["drug_name"] = ttd2drug["drug_name"].str.lower()


ttd2drug_subtype = ttd_targets.loc[ttd_targets.edge_type.isin(["Inhibitor", "Agonist", "Antagonist", "Modulator", "Binder"])]
ttd2drug_subtype.rename(columns={"edge_type": "target_action", "target": "drug_name"}, inplace=True)
ttd2drug_subtype["drug_name"] = ttd2drug_subtype["drug_name"].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [50]:
drug2gene_ttd = ttd2drug.merge(ttd2drug_subtype, left_on=["target_id", "drug_name"], right_on=["target_id", "drug_name"], how="outer")\
        .merge(ttd2smiles, left_on="ttd_drug_id", right_on="ttd_drug_id")\
        .merge(drugbank_xrefs[["DrugBank ID", "TTD ID"]], left_on="ttd_drug_id", right_on="TTD ID", how="left")\
        .merge(drugbank_xrefs[["DrugBank ID", "Name"]], left_on="drug_name", right_on="Name", how="left")\
        .merge(drugbank2smiles, left_on="SMILES", right_on="SMILES", how="left")\
        .merge(ttd2uniprot, left_on="target_id", right_on="target_id")\
        .drop_duplicates()

In [51]:
drug2gene_ttd["drugbank_id"] = drug2gene_ttd.apply(combine_cols, args=(["DrugBank ID_x", "DrugBank ID_y", 'drugbank_id'],), axis=1)
drug2gene_ttd = drug2gene_ttd[['drugbank_id', "drug_name", "target_id", "uniprot_id", "target_action", "ttd_drug_id"]]
drug2gene_ttd = drug2gene_ttd.dropna(subset=['drugbank_id'])

In [52]:
uniprot_split = pd.DataFrame([(i, x.split(" (")[0].strip()) for i, row in drug2gene_ttd.iterrows()
                              for x in row['uniprot_id'].split(';')], columns=['df_index', 'uniprot_id'])
uniprot_split.set_index('df_index', inplace=True)
drug2gene_ttd = drug2gene_ttd.join(uniprot_split, lsuffix="s")\
                             .drop("uniprot_ids", axis=1)\
                             .merge(genes2uniprot, left_on="uniprot_id", right_on="uniprot_ids")\
                             .drop("uniprot_ids", axis=1)

In [53]:
drug2gene_ttd = drug2gene_ttd.merge(genes[["hgnc_id", "gene_symbol"]], left_on="hgnc_id", right_on="hgnc_id",
                                    suffixes=["", "_dup"])

In [54]:
db_name = "TTD"
db_pmid = "PMC4702870"
interaction_type = 'target'

drug2gene_ttd = drug2gene_ttd.assign(source_name=db_name, pmid=db_pmid,
                                     interaction_type=interaction_type,
                                     download_date=datetime.date.today().strftime("%Y-%m-%d"))

In [56]:
drug_data_containers[db_name] = drug2gene_ttd

## IUPHAR

In [57]:
IUPHAR_INTERACTIONS_DOWNLOAD="http://www.guidetopharmacology.org/DATA/interactions.csv"
IUPHAR_HGNC_MAPPING_DOWNLOAD="http://www.guidetopharmacology.org/DATA/GtP_to_HGNC_mapping.csv"
IUPHAR_LIGANDS_DOWNLOAD="http://www.guidetopharmacology.org/DATA/ligands.csv"

In [58]:
# this section may take a while because it queries IUPHAR db links
iuphar_ligands = pd.read_csv(IUPHAR_LIGANDS_DOWNLOAD)
drug_ids = iuphar_ligands["Ligand id"].tolist()

xrefs_drugs = []
url_db_xlink_base = "http://www.guidetopharmacology.org/services/ligands/{}/databaseLinks"
missed = []
iuphar2drugbank = []
print(len(drug_ids), "ligands in IUPHAR.")

for i, x in enumerate(drug_ids):
    r = requests.get(url_db_xlink_base.format(x))
    if r.status_code == 200:
        if i % 500 == 0:
            print(i)
        drugbank_ids = [d["accession"] for d in r.json() if d['database'] == 'DrugBank Ligand']
        if len(drugbank_ids) > 0:
            for d in drugbank_ids:
                iuphar2drugbank.append([x, d])
        else:
            missed.append(x)

iuphar2drugbank = pd.DataFrame(iuphar2drugbank, columns=["iuphar_id", "drugbank_id"])
print(len(set(missed)), " ligands without drugbank id.")

(8900, 'ligands in IUPHAR.')
0
1000
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
8000
8500
(6667, ' ligands without drugbank id.')


In [59]:
iuphar2drugbank.to_csv(data_path + "\iuphar2drugbank.csv", sep="\t")

In [60]:
iuphar2hgnc = pd.read_csv(IUPHAR_HGNC_MAPPING_DOWNLOAD)

In [61]:
iuphar_interactions = pd.read_csv(IUPHAR_INTERACTIONS_DOWNLOAD)\
                        .rename(columns={'type': 'target_action',
                                         'action': 'target_action_detailed',
                                         'pubmed_id': 'pmid',
                                         'target': 'target_name',
                                         'target_species': 'target_organism',
                                         'target_gene_symbol': 'gene_symbol',
                                         'target_uniprot': 'uniprot_id',
                                         'ligand': 'drug_name',
                                         'target_id': 'iuphar_id'})

  interactivity=interactivity, compiler=compiler, result=result)


In [62]:
drug2gene_iuphar = iuphar_interactions.merge(iuphar2hgnc[['iuphar_id', 'hgnc_id']],
                                             left_on="iuphar_id", right_on="iuphar_id")\
                                      .merge(iuphar2drugbank,
                                             left_on="ligand_id", right_on="iuphar_id")

In [63]:
drug2gene_iuphar = drug2gene_iuphar.loc[drug2gene_iuphar.target_organism == 'Human']

In [64]:
drug2gene_iuphar = drug2gene_iuphar[[x for x in drug2gene_iuphar.columns
                                     if not x.startswith("target_ligand") and not x.endswith("_x")
                                     and not x.endswith("_y") and not "original" in x
                                     and not x in ['ligand_gene_symbol', 'ligand_species', 'ligand_pubchem_sid']]]

In [65]:
db_name = "IUPHAR"
interaction_type = 'target'
drug2gene_iuphar = drug2gene_iuphar.assign(source_name=db_name, interaction_type=interaction_type,
                                           download_date=datetime.date.today().strftime("%Y-%m-%d"))


In [66]:
drug_data_containers[db_name] = drug2gene_iuphar

## Santos dataset
this is the supplement from the following paper. A frequently maintained version should be available on Pharos, but downloading the relevant data would require downloading and extracting the database dump first.


In [67]:
SANTOS_SUPPLEMENT_DOWNLOAD="http://www.nature.com/nrd/journal/v16/n1/extref/nrd.2016.230-s2.xlsx"

In [68]:
rename_cols = {"ACCESSION": "uniprot_id",
               "ORGANISM": "target_species",
               "PROTEIN_NAME": "target_name",
               "PARENT_PREF_NAME": "drug_name",
#                                       "PARENT_PREF_NAME": "drug_name",
               "MECHANISM_OF_ACTION": "action_comment"}
drug2gene_santos = pd.read_excel(SANTOS_SUPPLEMENT_DOWNLOAD)\
                     .rename(columns=rename_cols)

In [69]:
drug2gene_santos = drug2gene_santos.loc[drug2gene_santos["target_species"] == "Homo sapiens"]
drug2gene_santos = drug2gene_santos.dropna(subset=["uniprot_id"])
drug2gene_santos["drug_name"] = drug2gene_santos["drug_name"].str.lower()

In [70]:
drug2gene_santos = drug2gene_santos.merge(drugbank_synonyms, left_on="drug_name", right_on="name")\
                                   .rename(columns={"DrugBank ID": "drugbank_id"})

  rlab = rizer.factorize(rk)


In [71]:
list(rename_cols.values())

['action_comment', 'drug_name', 'target_species', 'uniprot_id', 'target_name']

In [72]:
drug2gene_santos = drug2gene_santos[list(rename_cols.values()) + ["drugbank_id"]]\
    .merge(genes2uniprot, left_on="uniprot_id", right_on="uniprot_ids")\
    .drop("uniprot_ids", axis=1)\
    .merge(genes[["hgnc_id", "gene_symbol"]], left_on="hgnc_id", right_on="hgnc_id",
                  suffixes=["", "_dup"])


In [73]:
db_name = "Santos"
pmid = "27910877"
interaction_type = 'target'
drug2gene_santos = drug2gene_santos.assign(source_name=db_name, pmid=pmid,
                                           download_date=datetime.date.today().strftime("%Y-%m-%d"),
                                           interaction_type=interaction_type)

In [74]:
drug_data_containers[db_name] = drug2gene_santos

In [75]:
include_cols = [u'hgnc_id', u'gene_symbol', u'drugbank_id', u'drug_name',
                u'target_action', 'target_id', u'interaction_type',
                #u'target_action_detailed',
                #u'primary_target', 
                u'target_known_action',
                u'pmid', u'source_name', u'download_date' ]
drug_df_final = reduce(lambda left,right: pd.concat([left,right], ignore_index=True,
                                                    join='outer'), drug_data_containers.values())[include_cols]
drug_df_final = drug_df_final.replace('', np.nan)

In [76]:
drug_df_final = drug_df_final.merge(pd.DataFrame(list(drug_atc.items()), columns=["drugbank_id", "ATC_code"]))\
                             .merge(drug_data_containers["DrugBank"][["drugbank_id", "approval_status"]], how="outer")\
                             .merge(pd.DataFrame(list(cancer_drugs.items()), columns=["drugbank_id", "is_cancer_drug"]))\
                             .drop_duplicates()

In [77]:
drug_df_final

Unnamed: 0,hgnc_id,gene_symbol,drugbank_id,drug_name,target_action,target_id,interaction_type,target_known_action,pmid,source_name,download_date,ATC_code,approval_status,is_cancer_drug
0,3535.0,F2,DB00001,lepirudin,inhibitor,BE0000048,target,yes,10505536|10912644|11055889|11467439|11807012|1...,DrugBank,2017-07-22,B01AE02,approved,False
1,3535.0,F2,DB00001,lepirudin,,,target,,27910877,Santos,2017-07-22,B01AE02,approved,False
2,3535.0,F2,DB00001,lepirudin,Inhibitor,,target,,16363236,IUPHAR,2017-07-22,B01AE02,approved,False
3,3535.0,F2,DB00001,lepirudin,Inhibitor,TTDS00202,target,,PMC4702870,TTD,2017-07-22,B01AE02,approved,False
4,3236.0,EGFR,DB00002,cetuximab,antagonist,BE0000767,target,yes,10480573|10601294|10628369|11408594|11431346|1...,DrugBank,2017-07-22,L01XC06,approved,True
16,3620.0,FCGR3B,DB00002,cetuximab,,BE0000901,target,unknown,16336752,DrugBank,2017-07-22,L01XC06,approved,True
28,1246.0,C1R,DB00002,cetuximab,,BE0002093,target,unknown,17139284|17016423,DrugBank,2017-07-22,L01XC06,approved,True
40,1241.0,C1QA,DB00002,cetuximab,,BE0002094,target,unknown,17139284|17016423,DrugBank,2017-07-22,L01XC06,approved,True
52,1242.0,C1QB,DB00002,cetuximab,,BE0002095,target,unknown,17139284|17016423,DrugBank,2017-07-22,L01XC06,approved,True
64,1245.0,C1QC,DB00002,cetuximab,,BE0002096,target,unknown,17139284|17016423,DrugBank,2017-07-22,L01XC06,approved,True


In [None]:
# drug_df_final['drug_name'] = drug_df_final['drug_name'].to_string()

In [131]:
# drug_info = ['drug_name', 'drugbank_id', 'approval_status', 'ATC_code', 'is_cancer_drug']

In [150]:
group_cols = ["hgnc_id"]
exclude_cols = group_cols + ["gene_symbol"]
# drug_info = ['drug_name', 'approval_status', 'ATC_code', 'is_cancer_drug']
groups = drug_df_final.groupby(group_cols)
for n, g in groups:
    all_genes[str(int(n))]['meta_information']['drug_information'] = g[[c for c in g.columns if c not in exclude_cols]].to_dict('records')
    all_genes[str(int(n))]['nodes']['drug_score'] = g.groupby(['drug_name'])\
        .agg({'source_name': lambda x: len(x)})\
        .reset_index()\
        .rename(columns={'source_name': 'drug_evidence'})\
        .to_dict('records')
    all_genes[str(int(n))]['nodes']['is_cancer_drug_target'] = int(g['is_cancer_drug'].any())

# Dump json to file - for fun

In [151]:
all_genes['3236']

{'edges': {},
 'meta_information': {'alias_symbol': 'ERBB1',
  'date_approved_reserved': '1986-01-01',
  'date_modified': '2017-03-24',
  'date_symbol_changed': nan,
  'driver_information': [{'Core pathway': nan,
    'Process': nan,
    'driver_type': 'Unknown',
    'pmid': '14993899',
    'source_name': 'Cosmic'},
   {'Core pathway': 'PI3K; RAS',
    'Process': 'Cell Survival',
    'driver_type': 'Oncogene',
    'pmid': '23539594',
    'source_name': 'Vogelstein'},
   {'Core pathway': nan,
    'Process': nan,
    'driver_type': 'Oncogene',
    'pmid': '14681372',
    'source_name': 'Uniprot'},
   {'Core pathway': nan,
    'Process': nan,
    'driver_type': 'Oncogene',
    'pmid': '25759023',
    'source_name': 'Rubio-Perez'}],
  'drug_information': [{'ATC_code': 'L01XC06',
    'approval_status': 'approved',
    'download_date': '2017-07-22',
    'drug_name': 'cetuximab',
    'drugbank_id': 'DB00002',
    'interaction_type': 'target',
    'is_cancer_drug': True,
    'pmid': '10480573|1

In [138]:
import json

In [152]:
all_genes_json = json.dumps(all_genes.values(), indent=4, sort_keys=True)
# True and False are not json serializable --- e need to be e.g. string

In [153]:
# !pwd ..

In [154]:
with open("../driver_db_dump.json", 'w') as f:
    f.write(all_genes_json)

# MongoDB stuff

Install MongoDB: https://docs.mongodb.com/getting-started/shell/tutorial/install-mongodb-on-os-x/
You could just save everyting to a .json and import it at the command line: https://docs.mongodb.com/getting-started/shell/import-data/

Or,Use pymongo to place the documents into the database one by one. here is a link to a helpful tutorial: http://api.mongodb.com/python/current/tutorial.html

You can use ```conda install pymongo```

Although the already mentioned tutorial seems quicker, for completeness: https://docs.mongodb.com/getting-started/python/introduction/

### MongoDB Compass
It's a seperate install. Intall after you have mongoDB installed. You'll need a running instance of your db up and running. You can do this by calling ```mongod``` on the command line. Then open up Compass and you'll see all the databases that you currently have running. After running the code below, you won't see the changes immediatly in a running Compass session, you'll have to press the sync button somewhere on the top left hand side of the interface.

In [140]:
# need pymongo
import pymongo
from pymongo import MongoClient

In [155]:
client = MongoClient()
# you can call the database what you want; I chose "drivers"
db = client['drivers_new2']
# the collection will be called posts; change accordingly
posts = db.cancer_genes

# alright pop it into the database

In [156]:
posts.insert_many(all_genes.values())

<pymongo.results.InsertManyResult at 0x270ccbfa0>

In [None]:
# for i in range(len(df_scored)):
#     posts.insert_one(df_scored.iloc[i].to_dict())

# Check that it happened with example query

In [159]:
for post in posts.find({"meta_information": {"$elemMatch": {"gene_symbol": "BRAF"}}}):
    print(post)

In [161]:
posts.find({"meta_information": {"$elemMatch": {"gene_symbol": "BRAF"}}})

<pymongo.cursor.Cursor at 0x25c487610>

In [None]:
#post["cancer"]

In [163]:
found_targets = []
for post in posts.find({"drugs":  {"$elemMatch": {"drugbank_id": "DB00002"}}}):
    found_targets.append((post["gene_symbol"]))
    print(post)

In [None]:
#post["gene_symbol"]

In [None]:
#found_targets

In [None]:
#len(post["drugs"]), post["drugs"]