# Import Driver gene tables

Collect the driver genes from the various sources and place them into the
predefined JSON schema outlined in "driversValidationSchemaLight.json". Start
at the top of the list in the README and work down, collecting each source in turn.

When collecting, stick to one consistent set of column names:

"gene_symbol","driver_type","pmid","source_name"

In [1]:
import pandas as pd

In [7]:
# Directory prefix for the downloaded source tables. Paths below are built by
# plain string concatenation, so the trailing slash is required.
data_path = "../../clinicalreporting/drivers/Data/"

# One normalised DataFrame per source, keyed by the source's display name.
source_container = dict()

# Columns every per-source table is reduced to before the tables are combined.
columns_to_db = [
    "hgnc_id",
    "gene_symbol",
    "driver_type",
    "pmid",
    "source_name",
]

In [8]:
"""
HGNC Genes
"""
# Download the complete HGNC gene set and derive the lookup tables used by the
# per-source cells:
#   genes            - the raw table, with `symbol` renamed to `gene_symbol`
#   gene_synonym_df  - (hgnc_id, gene_symbol) for approved symbols plus every
#                      alias / previous symbol that is not itself an approved symbol
#   all_genes        - dict of per-gene records keyed by hgnc_id
#   genes2entrez / genes2uniprot - hgnc_id to external-id mappings (nulls dropped)
HGNC_DOWNLOAD = "ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt"
# Not referenced anywhere else in the visible notebook; kept for reference.
UNIPROT_DOWNLOAD = "http://www.uniprot.org/uniprot/?query=reviewed:yes+AND+organism:9606&format=tab&compress=yes&columns=id,entry%20name,reviewed,protein%20names,genes,organism,length,sequence,ec,feature(ACTIVE%20SITE),feature(BINDING%20SITE),comment(TISSUE%20SPECIFICITY),go(biological%20process),go(molecular%20function),go(cellular%20component),go-id,comment(BIOTECHNOLOGY),comment(DISEASE),comment(PHARMACEUTICAL),feature(MUTAGENESIS),comment(DISRUPTION%20PHENOTYPE),last-modified,created,database(CCDS),database(EMBL),database(RefSeq),database(HGNC)"
genes = pd.read_csv(HGNC_DOWNLOAD, sep="\t").rename(columns={'symbol': 'gene_symbol'})
# Strip the "HGNC:" prefix so the id becomes a plain number.
genes["hgnc_id"] = pd.to_numeric(genes["hgnc_id"].str.replace("HGNC:", ""))

# Build the approved-symbol lookup once. The previous version called
# genes["gene_symbol"].tolist() for every single synonym, turning this into an
# O(n^2) scan over the whole table.
approved_symbols = set(genes["gene_symbol"])

# Aliases first, then previous symbols - the same order as before.
gene_synonyms = []
for synonym_col in ("alias_symbol", "prev_symbol"):
    with_synonyms = genes[["hgnc_id", synonym_col]].dropna(subset=[synonym_col])
    gene_synonyms += [(hgnc_id, synonym)
                      for hgnc_id, packed in zip(with_synonyms["hgnc_id"], with_synonyms[synonym_col])
                      for synonym in packed.split("|")  # multi-valued fields are "|"-separated
                      if synonym not in approved_symbols]

gene_synonym_df = pd.concat([genes[["hgnc_id", "gene_symbol"]],
                             pd.DataFrame(gene_synonyms, columns=["hgnc_id", "gene_symbol"])])
all_genes = genes[['hgnc_id', 'gene_symbol', 'name', 'status', 'locus_group', 'locus_type',
                   'gene_family', 'alias_symbol', 'prev_symbol', 'location', 'uniprot_ids',
                   'entrez_id', 'ensembl_gene_id', 'date_approved_reserved', 'date_modified',
                   'date_symbol_changed']].set_index('hgnc_id', drop=False).to_dict('index')
genes2entrez = genes[["hgnc_id", "entrez_id"]].dropna()
# NOTE(review): uniprot_ids is kept as the raw field here; if it can hold several
# "|"-separated accessions (like the alias fields above), exact-match merges on it
# will miss those genes - confirm against the HGNC file.
genes2uniprot = genes[["hgnc_id", "uniprot_ids"]].dropna()

In [9]:
"""
Vogelstein et al.
"""
# Identity of this source, stamped onto every row below.
vogelstein_name = "Vogelstein"
vogelstein_pmid = '23539594'

path_name = data_path + 'Vogelstein_CancerGenomeLandscape_Science_2013.csv'
vogelstein_df = (
    pd.read_csv(path_name, delimiter=';')
    .assign(pmid=vogelstein_pmid, source_name=vogelstein_name)
    # bring the column names in line with the shared schema before merging
    .rename(index=str, columns={"Gene Symbol": "gene_symbol", "Classification*": "driver_type"})
    # attach hgnc_id; the merge keys are whatever columns the two frames share
    .merge(gene_synonym_df)
)
# Keep the shared columns plus the two Vogelstein-specific annotations.
vogelstein_df = vogelstein_df[columns_to_db + ["Process", "Core pathway"]]

source_container[vogelstein_name] = vogelstein_df


"""
Rubio-Perez
"""

# Identity of this source, stamped onto every row below.
rubiop_name = "Rubio-Perez"
rubiop_pmid = "25759023"

path_name = data_path + 'Drivers_type_role.tsv'
rubiop_df = (
    pd.read_csv(path_name, delimiter='\t', comment='#')
    .assign(pmid=rubiop_pmid, source_name=rubiop_name)
    .rename(index=str, columns={"geneHGNCsymbol": "gene_symbol", "Driver_type": "driver_type"})
    # attach hgnc_id; the merge keys are whatever columns the two frames share
    .merge(gene_synonym_df)
)

# driver_type is overwritten from the 'Role' column.
# NOTE(review): the labels are paired with Role values purely by the order in
# which .unique() first encounters them, so the mapping depends on row order in
# the merged table - verify against the actual Role values.
mapper = dict(zip(rubiop_df['Role'].unique(), ["Oncogene", "TSG", "Unknown", "Oncogene"]))
rubiop_df["driver_type"] = rubiop_df["Role"].map(mapper)

# Keep the shared columns plus the source-specific role probability.
rubiop_df = rubiop_df[columns_to_db + ["OncodriveROLE_prob"]]
source_container[rubiop_name] = rubiop_df

"""
Uniprot
"""

# Two keyword exports: one lists proto-oncogenes, the other tumor suppressors.
# The file each entry came from decides its driver_type.
oncogene_path_name = data_path + '/uniprot-keyword%3A%22Proto-oncogene+%5BKW-0656%5D%22.tab'
tsg_path_name = data_path + '/uniprot-keyword%3A%22Tumor+suppressor+%5BKW-0043%5D%22.tab'

# The separator is passed by keyword: recent pandas no longer accepts it as a
# second positional argument.
uniprot_oncogene_df = pd.read_csv(oncogene_path_name, sep='\t')
uniprot_oncogene_df["driver_type"] = "Oncogene"
uniprot_tsg_df = pd.read_csv(tsg_path_name, sep='\t')
uniprot_tsg_df["driver_type"] = "TSG"

uniprot_df = pd.concat([uniprot_tsg_df, uniprot_oncogene_df], axis=0)
# The first space-separated token of "Gene names" is used as the symbol. The
# .str accessor propagates missing values instead of raising on them, which the
# previous per-row lambda did.
uniprot_df["gene_symbol"] = uniprot_df["Gene names"].str.split(" ").str[0]
# Resolve hgnc_id through the UniProt accession in the "Entry" column.
uniprot_df = uniprot_df.merge(genes2uniprot, left_on="Entry", right_on="uniprot_ids").drop("uniprot_ids", axis=1)
uniprot_name = "Uniprot"
uniprot_pmid = "14681372"
uniprot_df = uniprot_df.assign(pmid=uniprot_pmid, source_name=uniprot_name)
uniprot_df = uniprot_df[columns_to_db]

source_container[uniprot_name] = uniprot_df


"""
Cosmic Census

"""

path_name = data_path + 'Census_allTue Mar 14 14_33_17 2017.tsv'
# The separator is passed by keyword: recent pandas no longer accepts it as a
# second positional argument.
cosmic_df = pd.read_csv(path_name, sep="\t")
cosmic_name = "Cosmic"
cosmic_pmid = "14993899"
cosmic_df = cosmic_df.assign(source_name=cosmic_name, pmid=cosmic_pmid)

# NOTE(review): the labels are paired with "Role in Cancer" values purely by the
# order in which .unique() first encounters them, so the mapping depends on row
# order in the census file - verify against the actual values.
mapper = dict(zip(cosmic_df["Role in Cancer"].unique(), ["TSG", "Oncogene", "Unknown", "Oncogene/TSG"]))

cosmic_df["driver_type"] = cosmic_df["Role in Cancer"].map(mapper)
cosmic_df = cosmic_df.rename(index=str, columns={"Gene Symbol": "gene_symbol"})
# Resolve hgnc_id through the Entrez gene id.
cosmic_df = cosmic_df.merge(genes2entrez, left_on="Entrez GeneId", right_on="entrez_id").drop("entrez_id", axis=1)
cosmic_df = cosmic_df[columns_to_db]

source_container[cosmic_name] = cosmic_df

"""
TSgene

"""
# Identity of this source, stamped onto every row below.
tsgene_name = "TSgene"
tsgene_pmid = "23066107"

tsgene_pathname = data_path + 'Human_TSGs.txt'
tsgene_df = (
    pd.read_csv(tsgene_pathname, delimiter="\t")
    # every row from this source is labelled a tumor suppressor
    .assign(source_name=tsgene_name, pmid=tsgene_pmid, driver_type="TSG")
    .rename(index=str, columns={"GeneSymbol": "gene_symbol"})
    # resolve hgnc_id through the Entrez gene id, then drop the join helper column
    .merge(genes2entrez, left_on="GeneID", right_on="entrez_id")
    .drop("entrez_id", axis=1)
)
tsgene_df = tsgene_df[columns_to_db]

source_container[tsgene_name] = tsgene_df

"""
putting it all together
"""
# Stack all per-source tables with a single concat. The previous reduce() over
# pairwise concats re-copied the accumulated frame at every step and relied on
# `reduce` being a builtin, which it is not on Python 3.
df_final = pd.concat(list(source_container.values()), ignore_index=True)
df_final = df_final.drop_duplicates()

# score = number of (de-duplicated) rows carrying the same gene_symbol.
# NOTE(review): this counts rows, not distinct sources, and groups by symbol
# rather than hgnc_id - confirm that is the intended meaning of "score".
gene_counts = df_final.groupby("gene_symbol")["pmid"].count().to_frame("score")
df_scored = pd.merge(df_final, gene_counts, left_on="gene_symbol", right_index=True)
# The index labels are converted to strings, as the saved table has always had them.
df_scored = df_scored.rename(index=str)

In [10]:
# Quick sanity check: which scores, driver types and gene symbols ended up in
# the combined table. Single-argument print(...) behaves the same on Python 2
# and Python 3, unlike the bare print statement.
print(df_scored["score"].unique())
print(df_scored["driver_type"].unique())
print(df_final["gene_symbol"].unique())

[1 4 2 3 5]
['TSG' 'Oncogene' 'Unknown' 'Oncogene/TSG']
['ABI1' 'ABL1' 'ABL2' ..., 'ZNF638' 'ZNF750' 'ZNF814']


# Take a look at the finished table

In [11]:
# Show the first 100 rows of the scored table.
df_scored.head(100)

Unnamed: 0,Core pathway,OncodriveROLE_prob,Process,driver_type,gene_symbol,hgnc_id,pmid,source_name,score
0,,,,TSG,ABI1,11320,14993899,Cosmic,1
1,,,,Oncogene,ABL1,76,14993899,Cosmic,4
614,Cell Cycle/Apoptosis,,Cell Survival,Oncogene,ABL1,76,23539594,Vogelstein,4
2136,,,,Oncogene,ABL1,76,14681372,Uniprot,4
2361,,,,Oncogene,ABL1,76,25759023,Rubio-Perez,4
2,,,,Oncogene,ABL2,77,14993899,Cosmic,2
2362,,0.811,,Oncogene,ABL2,77,25759023,Rubio-Perez,2
3,,,,Oncogene,ACKR3,23692,14993899,Cosmic,1
4,,,,Unknown,ACSL3,3570,14993899,Cosmic,2
2366,,,,Unknown,ACSL3,3570,25759023,Rubio-Perez,2


# Save for merging and posting into database with myDrug

In [12]:
# Attach each gene's driver annotations to its HGNC record under the 'cancer' key.
group_cols = ["hgnc_id"]
exclude_cols = group_cols + ["gene_symbol"]
# hgnc_id and gene_symbol are already fields of every all_genes record, so only
# the remaining columns are copied. The list is the same for every group, so it
# is computed once.
cancer_cols = [c for c in df_scored.columns if c not in exclude_cols]
# Group by the bare column name rather than a one-element list: with a list,
# recent pandas yields 1-tuples as group keys and int(key) would raise.
groups = df_scored.groupby(group_cols[0])
for hgnc_id, gene_rows in groups:
    all_genes[int(hgnc_id)]['cancer'] = gene_rows[cancer_cols].to_dict('records')

In [14]:
# Persist the scored driver-gene table next to the source data.
path_to_save = data_path + "driver_genes_DataFrame.pkl"
df_scored.to_pickle(path_to_save)

In [15]:
# Persist the per-gene record dict (keyed by hgnc_id) next to the source data.
# all_genes is a plain dict, so the module-level pd.to_pickle is used.
path_to_save = data_path + "all_genes.pkl"
pd.to_pickle(all_genes,path_to_save)

In [16]:
# Persist the hgnc_id to UniProt-accession mapping next to the source data.
path_to_save = data_path + "genes2uniprot.pkl"
genes2uniprot.to_pickle(path_to_save)