# Preprocessing

## Uniprot data

### IDs

Map uniprot IDs with Hugo symbols

Download a tab separated file from https://www.uniprot.org/uniprot/?query=*&fil=organism%3A%22Homo+sapiens+%28Human%29+%5B9606%5D%22+AND+reviewed%3Ayes using only the following columns:

    Entry	Protein names	Cross-reference (GeneCards)	Cross-reference (CCDS)

Uncompress the file and rename it as ``uniprots_raw.tsv``

The data was downloaded on 03/07/2018

In [3]:
import pandas as pd

info_isoforms = pd.read_csv("external/uniprots_raw.tsv", sep="\t")

# For each isoform get its CCDS.
# `l` collects the output rows; `d` remembers the isoform identifiers already
# emitted, so that an empty CCDS fragment does not add a second "<entry>-1" row.
l = []
d = set()
for index, row in info_isoforms.iterrows():
    entry = row["Entry"]
    try:
        # Drop the last character of the GeneCards cross-reference
        # (presumably a trailing ";" -- TODO confirm against the raw file).
        hugo = row["Cross-reference (GeneCards)"][:-1]
    except TypeError:
        # A missing cross-reference is read as NaN (a float), which cannot be
        # sliced: entries without a symbol are skipped. Any other error
        # (e.g. a missing column) is no longer silently swallowed.
        continue

    for ccds in str(row["Cross-reference (CCDS)"]).split(";"):
        v = ccds.strip().split(" ")
        if len(v) == 2:
            # Two tokens: the CCDS and an isoform identifier whose first and
            # last characters are stripped.
            cds = v[0]
            entry_id = str(v[1])[1:-1]
        elif v[0] == "":
            # Empty fragment: record a placeholder row only if this entry has
            # no "-1" isoform yet.
            cds = "-"
            entry_id = entry + "-1"
            if entry_id in d:
                continue
        else:
            # Single token: attach it to the "-1" isoform of the entry.
            cds = v[0]
            entry_id = entry + "-1"
        d.add(entry_id)
        l.append([entry, hugo, cds, entry_id])

df_uniprot = pd.DataFrame(l, columns=["Entry", "Hugo_Symbol", "CCDS", "Entry_Isoform"])
df_uniprot.to_csv("data/uniprot_isoforms.tsv", sep="\t", index=False)

### Sequences

Download and uncompress the fasta files with the reference proteome of Eukaryota (files UP000005640_9606.fasta and UP000005640_9606_additional.fasta.gz)

Check that each sequence has an annotated row in the mapping dataframe; if not, do not include the sequence

*TODO find the right release*

In [None]:
import itertools
import pandas as pd
from Bio import SeqIO # Requires biopython

# Build one row per FASTA record: entry, isoform identifier and sequence
l_data = []
fasta_records = itertools.chain(
    SeqIO.parse('external/UP000005640_9606.fasta', "fasta"),
    SeqIO.parse('external/UP000005640_9606_additional.fasta', "fasta"),
)
for record in fasta_records:
    # The accession is the second "|"-separated field of the record id
    accession = record.id.split("|")[1]
    if "-" in accession:
        # Already an isoform identifier: the entry is the part before the dash
        entry_isoform = accession
        entry = accession.split("-")[0]
    else:
        # No isoform suffix: label the record as isoform 1
        entry = accession
        entry_isoform = accession + "-1"
    l_data.append([entry, entry_isoform, "".join(record.seq)])

df_seqs = pd.DataFrame(l_data, columns=["Entry", "Entry_Isoform", "Sequence"])

# Keep only the sequences whose identifiers appear in the uniprot mapping
# (inner merge on the columns the two frames share)
df_uniprot = pd.read_csv('data/uniprot_isoforms.tsv', sep="\t")
df_seqs_info = df_seqs.merge(df_uniprot)
df_seqs_info.to_csv('data/sequences_isoforms.tsv', sep="\t", index=False)

## TCGA data

### RPPA

Download level 4 cohort-specific RPPA data from https://tcpaportal.org/tcpa/download.html in a folder named ``tcga_rppa``  
Concatenate the data and merge COAD and READ

In [96]:
import os
import re
from zipfile import ZipFile

import pandas as pd


def read_file(path, name):
    """Load the level-4 CSV of one cohort from its zip archive.

    path -- path of the zip archive
    name -- cohort name used to build the member name
            ``tmp/TCGA-<name>-L4.csv`` inside the archive

    Returns the comma-separated member parsed as a pandas DataFrame.
    """
    member = 'tmp/TCGA-{}-L4.csv'.format(name)
    with ZipFile(path) as zipfolder, zipfolder.open(member) as f:
        return pd.read_csv(f, sep=',')


dfs = []

# Archive names look like "TCGA-<COHORT>-L4.zip"; the group captures the cohort.
# Raw string: "\-" and "\." are regex escapes, not Python string escapes
# (non-raw invalid escapes trigger a warning on recent Python versions).
regex = re.compile(r'TCGA\-([A-Z]+)\-L4\.zip')

for file in os.listdir('external/tcga_rppa'):
    match = regex.match(file)
    if match:
        file_path = os.path.join('external/tcga_rppa', file)
        cancer_type = match.group(1)

        df = read_file(file_path, cancer_type)
        # Samples are matched across data types on the first 12 characters
        # of their identifier
        df["Matchable_Sample_ID"] = (df["Sample_ID"]).str[0:12]
        df.drop(columns=["Sample_Type", "SetID", "Sample_ID"], inplace=True)

        # COAD and READ are merged into a single cohort. Other cohorts keep
        # whatever "Cancer_Type" value the CSV itself provides
        # (presumably the column is always present -- TODO confirm).
        if cancer_type in ["COAD", "READ"]:
            cancer_type = "COADREAD"
            df["Cancer_Type"] = cancer_type

        dfs.append(df)

df = pd.concat(dfs, sort=True)
df.drop_duplicates(inplace=True)
df.to_csv('data/tcga_rppa.tsv', sep="\t", index=False)

print('Samples: ', len(df["Matchable_Sample_ID"].unique()))
# Every column except two (Matchable_Sample_ID and Cancer_Type) is counted
# as an antibody
print('Antibodies: ', len(df.columns.values)-2)

Samples:  7663
Antibodies:  258


### RNASeq

Download cohort specific RNASeq data from https://gdc.cancer.gov/about-data/publications/pancanatlas in a folder named ``tcga_rna``
Files required are: ``EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv`` and ``merged_sample_quality_annotations.tsv``

In [4]:
import numpy as np
import pandas as pd

def coadread_mapping(ctype):
    """Return "COADREAD" for COAD or READ; any other value is returned as is."""
    return "COADREAD" if ctype in ("COAD", "READ") else ctype
    
# Read antibodies and their associated hugo
hugos = pd.read_csv('internal/antibody_parsed.csv', sep=",")["Hugo_Symbol"].unique()

# Keep the RNASeqV2 aliquots that are not flagged "Do_not_use" and have a cancer type
df_info_samples = pd.read_csv("external/tcga_rna/merged_sample_quality_annotations.tsv", sep="\t")
usable = (
    df_info_samples["platform"].str.contains("_RNASeqV2")
    & ~df_info_samples["Do_not_use"]
    & ~pd.isnull(df_info_samples["cancer type"])
)
samples_filtered = df_info_samples[usable][["aliquot_barcode", "cancer type"]].drop_duplicates()

df_rna = pd.read_csv("external/tcga_rna/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv", sep="\t")
# The symbol is the part of gene_id before the first "|" (vectorised instead
# of a row-wise apply over the whole matrix)
df_rna["Hugo_Symbol"] = df_rna["gene_id"].str.split("|").str[0]
df_rna = df_rna[df_rna["Hugo_Symbol"] != "?"]
# Restrict to the antibody genes BEFORE melting: the long table then holds
# only the genes of interest instead of every gene x aliquot combination.
df_rna = df_rna[df_rna["Hugo_Symbol"].isin(hugos)]
df_rna = df_rna.drop("gene_id", axis=1)

df_rna = df_rna.melt(id_vars="Hugo_Symbol", value_name="RSEM", var_name="aliquot_barcode").merge(samples_filtered, how="right")
# The right merge keeps annotated aliquots without expression rows; this
# filter is kept after the merge exactly as before.
df_rna = df_rna[df_rna['Hugo_Symbol'].isin(hugos)]
df_rna.rename(columns={"aliquot_barcode": "Tumor_Sample_Barcode", "cancer type": "Cancer_Type"}, inplace=True)
df_rna["Cancer_Type"] = df_rna["Cancer_Type"].map(coadread_mapping)
df_rna["Matchable_Sample_ID"] = df_rna["Tumor_Sample_Barcode"].str[0:12]
df_rna.drop(columns=["Tumor_Sample_Barcode"], inplace=True)
# Several aliquots can share a Matchable_Sample_ID: average them, ignoring NaN
df_rna = df_rna.groupby(["Hugo_Symbol", "Matchable_Sample_ID", "Cancer_Type"], as_index=False).agg({"RSEM": np.nanmean})
# Small offset so that RSEM == 0 does not produce -inf
df_rna["log2(RSEM)"] = np.log2(df_rna["RSEM"] + 0.0001)

df_rna.to_csv('data/tcga_rna.tsv.gz', sep="\t", compression="gzip", index=False)

print('Samples: ', len(df_rna["Matchable_Sample_ID"].unique()))

Samples:  10087


### CNA

Download cohort specific CNA data from http://gdac.broadinstitute.org/runs/analyses__latest/data/ (version 2016_01_28) in a folder named ``tcga_cna``. Download the files ending in ``CopyNumber_Gistic2.Level_4.2016012800.0.0.tar.gz``

Limit to the same cancer types as the ones for RPPA data (skip COAD and READ and get only COADREAD)

In [22]:
import os
import tarfile
import pandas as pd

# Read antibodies and their associated hugo
hugos = [str(x) for x in pd.read_csv('internal/antibody_parsed.csv', sep=",")["Hugo_Symbol"].unique()]

# Read data
list_values = []
for file in os.listdir('external/tcga_cna/'):
    # The cohort is the text before the first "-" without the GDAC prefix
    cancer_type = file.split('-')[0].replace('gdac.broadinstitute.org_', '')
    member = '{}/all_thresholded.by_genes.txt'.format(file.replace('.tar.gz', ''))
    # Context manager: the archive is closed once the table has been parsed
    # (it was previously left open for every cohort).
    with tarfile.open("external/tcga_cna/{}".format(file), "r:gz") as tar:
        f = tar.extractfile(member)
        # NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and
        # removed in 2.0 (replaced by on_bad_lines="skip"); kept for
        # compatibility with the pandas release this notebook was written for.
        df = pd.read_csv(f, sep="\t", error_bad_lines=False)

    # One row per gene and sample
    df_cna = pd.melt(df, id_vars=["Gene Symbol", "Locus ID", "Cytoband"], var_name="Tumor_Sample_Barcode", value_name="CNA")
    df_cna["Matchable_Sample_ID"] = df_cna["Tumor_Sample_Barcode"].str[0:12]
    df_cna["Cancer_Type"] = cancer_type
    df_cna.drop("Tumor_Sample_Barcode", axis=1, inplace=True)
    df_cna = df_cna[df_cna["Gene Symbol"].isin(hugos)]
    list_values.append(df_cna)

df_cna = pd.concat(list_values).drop_duplicates()
df_cna.rename(columns={"Gene Symbol": "Hugo_Symbol"}, inplace=True)
df_cna.to_csv('data/tcga_cna.tsv.gz', compression="gzip", sep="\t")

print('Samples ', len(df_cna["Matchable_Sample_ID"].unique()))

Samples  10845


### Mutational data

Download cohort specific somatic mutations data from https://gdc.cancer.gov/about-data/publications/pancanatlas (``mc3.v0.2.8.PUBLIC.maf.gz``) with supplementary tables from https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables in a folder named ``tcga_mutations``.

Required tables from the supplementary material are:

- TCGA Study Abbreviations (``diseaseStudy.tsv``)
- Tissue Source Site Codes (``tissueSourceSite.tsv``)

The following filters are applied:
- only alterations that pass the filter "PASS"
- all alterations in the same gene are combined 
- Phenotype is selected from this order of priority
["Splice_Site","Nonsense_Mutation","Frame_Shift_Del","Frame_Shift_Ins","Nonstop_Mutation","Translation_Start_Syte","In_Frame_Del","In_Frame_Ins"
,"Missense_Mutation","Intron","Silent","5'UTR","3'UTR","IGR","5'Flank","3'Flank"]

In [100]:
import json
import pandas as pd

def coadread_mapping(ctype):
    """Merge the COAD and READ cohorts under the single label COADREAD.

    Any other value is returned unchanged.
    """
    merged_cohorts = ("COAD", "READ")
    if ctype not in merged_cohorts:
        return ctype
    return "COADREAD"
    
def list_samples(group, dict_):
    """Store the distinct sample identifiers of a group in ``dict_``.

    group -- object exposing ``name`` and a ``'Matchable_Sample_ID'`` column
             (called through ``groupby(...).apply`` in this cell)
    dict_ -- dictionary updated in place: ``dict_[group.name]`` receives the
             list of distinct identifiers (order not guaranteed)
    """
    distinct_ids = set(group['Matchable_Sample_ID'].unique())
    dict_[group.name] = list(distinct_ids)
    
def get_protein_mutation(row):
    """Build a "<wild type><position><mutant>" label for one mutation row.

    Returns "." when ``row["Protein_position"]`` cannot be converted to an
    integer. When ``row["Amino_acids"]`` has no "/" the wild-type residue is
    reused as the mutant one.
    """
    try:
        pos = int(row["Protein_position"])
    except ValueError:
        return "."
    residues = row["Amino_acids"].split("/")
    wtaa = residues[0]
    mtaa = residues[1] if len(residues) > 1 else wtaa
    return wtaa + str(pos) + mtaa

def concat(grp):
    """Join the values of a group into one comma-separated string.

    Every element is converted with ``str``. The argument is traversed only
    once, so single-pass iterables work as well (the previous version
    materialised it twice and left an unused local behind).
    """
    return ",".join(str(e) for e in grp)

def select_phenotype(row):
    """Pick the highest-priority variant classification of a row.

    ``row["Variant_Classification"]`` is split on "," and the first entry of
    the priority list found among the pieces is returned; "-" is returned
    when none of them is present.
    """
    values = row["Variant_Classification"].split(",")
    # List of priorities from the TCGA data.
    # Fixed: "Translation_Start_Site" was misspelled ("..._Syte"), so that
    # classification could never be selected.
    list_priority = ["Splice_Site", "Nonsense_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins", "Nonstop_Mutation", "Translation_Start_Site", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Intron", "Silent", "5'UTR", "3'UTR", "IGR", "5'Flank", "3'Flank"]

    for priority in list_priority:
        if priority in values:
            return priority
    return "-"


# Load info: tissue source site (TSS) code -> study abbreviation
df_info_samples = pd.read_csv("external/tcga_mutations/tissueSourceSite.tsv", sep="\t", keep_default_na=False)
df_ttypes_names = pd.read_csv("external/tcga_mutations/diseaseStudy.tsv", sep="\t")
df_info_samples = df_info_samples.merge(df_ttypes_names, on=["Study Name"])
df_info_samples = df_info_samples.rename(columns={"Study Abbreviation": "Cancer_Type"})

samples_filtered = df_info_samples[["TSS Code", "Cancer_Type"]].drop_duplicates()
d_ttype = dict(zip(samples_filtered["TSS Code"], samples_filtered["Cancer_Type"]))

# Load mutations
df_muts = pd.read_csv("external/tcga_mutations/mc3.v0.2.8.PUBLIC.maf.gz", sep="\t", low_memory=False)
# The second "-"-separated field of the barcode is looked up as the TSS code
df_muts["Cancer_Type"] = df_muts["Tumor_Sample_Barcode"].map(lambda barcode: d_ttype[barcode.split("-")[1]])

df_muts["Matchable_Sample_ID"] = df_muts["Tumor_Sample_Barcode"].str[0:12]

# list distinct sample IDs per cancer type
# TODO needed?
# TODO merge COADREAD before?, filter PASS before?
d_total_mutations = {}
df_muts.groupby("Cancer_Type").apply(list_samples, dict_=d_total_mutations)
with open("data/tcga_muts_samples.json", 'w') as fd:
    json.dump(d_total_mutations, fd)

# Only alterations that pass the filter "PASS"; COAD and READ are merged
df_muts = df_muts[df_muts["FILTER"] == "PASS"]
df_muts["Cancer_Type"] = df_muts["Cancer_Type"].map(coadread_mapping)

df_muts["protein_mutation"] = df_muts.apply(get_protein_mutation, axis=1)

# Columns kept for the output, then one row per gene / sample / CCDS / cohort
# with every other column concatenated
group_keys = ["Hugo_Symbol", "Matchable_Sample_ID", "CCDS", "Cancer_Type"]
kept_columns = [
    "Matchable_Sample_ID", "Hugo_Symbol", "Cancer_Type", "Chromosome",
    "Start_Position", "End_Position", "Strand", "Reference_Allele",
    "Tumor_Seq_Allele2", "Protein_position", "protein_mutation",
    "Variant_Classification", "CCDS", "Variant_Type",
]
joined_columns = [
    "Chromosome", "Start_Position", "End_Position", "Strand",
    "Variant_Classification", "Reference_Allele", "Tumor_Seq_Allele2",
    "Protein_position", "protein_mutation", "Variant_Type",
]
df_muts = df_muts[kept_columns].drop_duplicates()
df_muts = df_muts.groupby(group_keys, as_index=False).agg({column: concat for column in joined_columns})

df_muts["Phenotype"] = df_muts.apply(select_phenotype, axis=1)

df_muts.drop_duplicates().to_csv("data/tcga_muts.tsv.gz", sep="\t", compression="gzip", index=False)

print('Samples ', len(df_muts["Matchable_Sample_ID"].unique()))

Samples  9104


In [1]:
import json
import pandas as pd

def coadread_mapping(ctype):
    """Map COAD and READ to the combined COADREAD label.

    Values other than "COAD" and "READ" pass through untouched.
    """
    if ctype in ("COAD", "READ"):
        return "COADREAD"
    return ctype
    
def list_samples(group, dict_):
    """Record, under ``group.name``, the distinct sample IDs of ``group``.

    ``dict_`` is modified in place; the stored value is a list built from the
    set of ``group['Matchable_Sample_ID']`` values, so its order is arbitrary.
    """
    sample_ids = group['Matchable_Sample_ID'].unique()
    dict_[group.name] = list(set(sample_ids))
    
def get_protein_mutation(row):
    """Return the protein change of a row as wild type + position + mutant.

    "." is returned when ``row["Protein_position"]`` is not an integer.
    ``row["Amino_acids"]`` is read as "<wild type>/<mutant>"; without a "/"
    the wild-type residue is used on both sides.
    """
    try:
        pos = int(row["Protein_position"])
    except ValueError:
        return "."
    amino_acids = row["Amino_acids"]
    wtaa = amino_acids.split("/")[0]
    if "/" not in amino_acids:
        return wtaa + str(pos) + wtaa
    return wtaa + str(pos) + amino_acids.split("/")[1]

def concat(grp):
    """Return the elements of ``grp`` as a single comma-separated string.

    Elements are stringified with ``str``. The unused intermediate list was
    removed: the input is now read exactly once, which also makes one-shot
    iterators give the expected result.
    """
    return ",".join(str(e) for e in grp)

def select_phenotype(row):
    """Return the most severe classification listed for a row.

    The comma-separated ``row["Variant_Classification"]`` is compared with
    the priority list below, in order; the first match is returned and "-"
    is the fallback when nothing matches.
    """
    values = row["Variant_Classification"].split(",")
    # List of priorities from the TCGA data.
    # "Translation_Start_Site" replaces the misspelled "Translation_Start_Syte",
    # which never matched any classification.
    list_priority = ["Splice_Site", "Nonsense_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins", "Nonstop_Mutation", "Translation_Start_Site", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Intron", "Silent", "5'UTR", "3'UTR", "IGR", "5'Flank", "3'Flank"]

    for priority in list_priority:
        if priority in values:
            return priority
    return "-"


# Load info
df_info_samples = pd.read_csv("external/tcga_mutations/tissueSourceSite.tsv", sep="\t", keep_default_na=False)
df_ttypes_names = pd.read_csv("external/tcga_mutations/diseaseStudy.tsv", sep="\t")
df_info_samples = df_info_samples.merge(df_ttypes_names, on=["Study Name"]).rename(columns={"Study Abbreviation": "Cancer_Type"})
# NOTE(review): df_test is loaded but not used anywhere in this cell
df_test = pd.read_csv("external/TCGA-CDR-SupplementalTableS1.csv", sep="\t")

# Lookup table: TSS code -> cancer type
samples_filtered = df_info_samples[["TSS Code", "Cancer_Type"]].drop_duplicates()
d_ttype = dict(zip(samples_filtered["TSS Code"], samples_filtered["Cancer_Type"]))

# Load mutations, keeping only the alterations flagged "PASS"
df_muts = pd.read_csv("external/tcga_mutations/mc3.v0.2.8.PUBLIC.maf.gz", sep="\t", low_memory=False)
df_muts = df_muts[df_muts["FILTER"] == "PASS"]
# Cancer type from the second "-"-separated field of the barcode
df_muts["Cancer_Type"] = df_muts["Tumor_Sample_Barcode"].map(lambda barcode: d_ttype[barcode.split("-")[1]])
df_muts["Matchable_Sample_ID"] = df_muts["Tumor_Sample_Barcode"].str[0:12]

# list distinct sample IDs per cancer type (before COAD/READ are merged)
d_total_mutations = {}
df_muts.groupby("Cancer_Type").apply(list_samples, dict_=d_total_mutations)
with open("data/tcga_muts_samples.json", 'w') as fd:
    json.dump(d_total_mutations, fd)

df_muts["Cancer_Type"] = df_muts["Cancer_Type"].map(coadread_mapping)
df_muts["protein_mutation"] = df_muts.apply(get_protein_mutation, axis=1)

# Combine all alterations of the same gene / sample / CCDS / cancer type
output_columns = [
    "Matchable_Sample_ID", "Hugo_Symbol", "Cancer_Type", "Chromosome",
    "Start_Position", "End_Position", "Strand", "Reference_Allele",
    "Tumor_Seq_Allele2", "Protein_position", "protein_mutation",
    "Variant_Classification", "CCDS", "Variant_Type",
]
aggregation = dict.fromkeys(
    [
        "Chromosome", "Start_Position", "End_Position", "Strand",
        "Variant_Classification", "Reference_Allele", "Tumor_Seq_Allele2",
        "Protein_position", "protein_mutation", "Variant_Type",
    ],
    concat,
)
df_muts = df_muts[output_columns].drop_duplicates()
df_muts = df_muts.groupby(["Hugo_Symbol", "Matchable_Sample_ID", "CCDS", "Cancer_Type"], as_index=False).agg(aggregation)

df_muts["Phenotype"] = df_muts.apply(select_phenotype, axis=1)

df_muts.drop_duplicates().to_csv("data/tcga_muts.tsv.gz", sep="\t", compression="gzip", index=False)

print('Samples ', len(df_muts["Matchable_Sample_ID"].unique()))

Samples  9104


### Combination

Merge data (RPPA, CNA, RNA and mutations) per sample. Samples not mutated are considered WT.  
Identify which samples have a mutated upstream E3 ligase and annotate them  
Calculate which mutations fall into the epitope region

In [None]:
from collections import defaultdict
import numpy as np
import pandas as pd

# Path of the annotated E3-target interaction table read further down in this cell.
# NOTE(review): this cell imports neither `os` nor defines `base`. `os` is
# imported by other cells of the notebook, but `base` is not assigned in any
# visible cell -- confirm it exists before running this cell.
ppi_e3 = os.path.join(base,"preprocess/ppi/data/E3_target_annotated_interations.csv")

def concat(grp):
    """Join the (string) values of a group with commas.

    Unlike the other ``concat`` helpers of this notebook the elements are not
    stringified, so they must already be strings. The unused intermediate
    list was removed; besides being dead code it exhausted single-pass
    iterables before the join.
    """
    return ",".join(grp)

def find_altered_E3(row, d_mapping):
    """Tell whether any mutated gene of the sample is an E3 ligase of the row's gene.

    row       -- mapping with "Ubiquitinases_Mutated" (comma-separated symbols)
                 and "Hugo_Symbol"
    d_mapping -- mapping from a Hugo symbol to the collection of its ligases;
                 a symbol missing from it raises KeyError, as before

    Returns True as soon as one mutated symbol is found among the ligases.
    """
    mutated = str(row["Ubiquitinases_Mutated"]).split(",")
    ligases = d_mapping[row["Hugo_Symbol"]]
    return any(ub in ligases for ub in mutated)

def is_in_recognition(row):
    """Tell whether a mutated position of the row lies inside its epitope.

    The epitope (``row["Epitope"]``) is either a single position or a
    "start-end" range. ``row["Protein_position"]`` is a comma-separated list
    whose items may themselves be "a-b" pairs; each integer end point is
    tested on its own and non-integer pieces are ignored.

    Returns False when ``protein_mutation`` is "-", when the epitope reads
    "nan" or "0", or when the phenotype is "WT".
    """
    if row["protein_mutation"] == "-":
        return False
    epitope = row["Epitope"]
    if str(epitope) in ("nan", "0") or row["Phenotype"] == "WT":
        return False

    if "-" in epitope:
        start = int(epitope.split("-")[0])
        end = int(epitope.split("-")[1])
    else:
        start = end = int(epitope)

    for chunk in row["Protein_position"].split(","):
        for token in chunk.split("-"):
            try:
                position = int(token)
            except ValueError:
                # e.g. "." or "?": not a usable position
                continue
            if start <= position <= end:
                return True
    return False

# Mapping protein-Hugo: antibody (Protein) name -> list of Hugo symbols
df_antibodies = pd.read_csv("internal/antibody_parsed.csv")
d_mapping = defaultdict(list)
for index, row in df_antibodies[["Hugo_Symbol", "Protein"]].drop_duplicates().iterrows():
    d_mapping[row["Protein"]].append(row["Hugo_Symbol"])

# RPPA
rppa = pd.read_csv("data/tcga_rppa.tsv", sep="\t")
# Fixed: the column list was read from `rppa_raw`, which is never defined
columns = list(rppa.columns.values)
columns.remove("Matchable_Sample_ID")
columns.remove("Cancer_Type")
rppa = rppa.melt(id_vars=["Matchable_Sample_ID", "Cancer_Type"], value_vars=columns, value_name="RPPA_VALUES", var_name="Protein")

# discard samples with non finite RPPA
rppa = rppa[np.isfinite(rppa["RPPA_VALUES"])]
# include Hugo ID
rppa["Hugo_Symbol"] = rppa.apply(lambda row: d_mapping[row["Protein"]][0] if row["Protein"] in d_mapping else "-", axis=1)  # Select the first hugo symbol, only one hugo per antibody
# exclude antibodies without information
rppa = rppa[rppa["Hugo_Symbol"] != "-"]

# RNA
rna = pd.read_csv("data/tcga_rna.tsv.gz", sep="\t", compression="gzip")
# Fixed: the RNASeq cell drops "Tumor_Sample_Barcode" before writing this
# file, so an unconditional drop raised KeyError; tolerate its absence.
rna = rna.drop(columns=["Tumor_Sample_Barcode"], errors="ignore")

df = pd.merge(rna, rppa)

# CNA
cna = pd.read_csv("data/tcga_cna.tsv.gz", sep="\t", compression="gzip")
# Fixed: the column selection was applied to a bare list instead of `cna`
cna = cna[["Hugo_Symbol", "Matchable_Sample_ID", "CNA"]].drop_duplicates()

df = pd.merge(df, cna)

# Mutations
muts = pd.read_csv("data/tcga_muts.tsv.gz", sep="\t", compression="gzip")
muts = muts[["Hugo_Symbol", "CCDS", "Cancer_Type", "Phenotype", "protein_mutation", "Protein_position", "Variant_Classification", "Matchable_Sample_ID", "Variant_Type"]].drop_duplicates()

# Match the CCDS for each samples, if not CCDS discard them
ccds = muts[["Hugo_Symbol", "CCDS"]].drop_duplicates()
df = pd.merge(df, ccds, how="left")

# Include mutations: rows without a matching mutation become wild type
df = df.merge(muts, how="left")
df["Phenotype"] = df["Phenotype"].fillna("WT")
df["Variant_Classification"] = df["Variant_Classification"].fillna("-")

# Load altered E3 ligases
ubiquitins = []
with open("internal/curated_ub_du.lst") as fd:
    ubiquitins = [l.rstrip() for l in fd]

# Fixed: `path_mutations` was never defined. The mutation table written by the
# mutational-data cell is the only visible file providing the "Phenotype"
# column filtered below -- NOTE(review): confirm this is the intended input.
path_mutations = "data/tcga_muts.tsv.gz"
e3_ligases = pd.read_csv(path_mutations, sep="\t", compression="gzip")
e3_ligases = e3_ligases[e3_ligases["Hugo_Symbol"].isin(ubiquitins)]
# Keep the protein-altering phenotypes only
altering_phenotypes = ["Nonsense_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins", "Missense_Mutation", "Nonstop_Mutation", "Splice_Site"]
e3_ligases = e3_ligases[e3_ligases["Phenotype"].isin(altering_phenotypes) | e3_ligases["Phenotype"].str.contains("In_Frame")]
e3_ligases = e3_ligases.groupby(["Matchable_Sample_ID"], as_index=False).agg({"Hugo_Symbol": concat})
e3_ligases.rename(columns={"Hugo_Symbol": "Ubiquitinases_Mutated"}, inplace=True)

df = df.merge(e3_ligases, how="left")
df["Ubiquitinases_Mutated"] = df["Ubiquitinases_Mutated"].fillna("-")

# Annotate samples with upstream mutated E3 ligases
df_ppi = pd.read_csv(ppi_e3, sep="\t")
# From here on d_mapping is rebuilt as: substrate Hugo symbol -> its E3 ligases.
# Fixed: the symbols were read from `rna_rppa_cna_muts`, which is never
# defined; the merged frame of this cell is `df`.
d_mapping = {}
for hugo in df["Hugo_Symbol"].unique():
    d_mapping[hugo] = list(df_ppi[(df_ppi["Hugo_SUB"] == hugo)]["Hugo_E3"].values)

df["Altered_E3_Ligases"] = df.apply(lambda row: find_altered_E3(row, d_mapping), axis=1)

# Include epitopes
df = df.merge(df_antibodies[["Protein", "Epitope"]].drop_duplicates())

# Disrupt epitopes
df["Disrupt_Epitope"] = df.apply(is_in_recognition, axis=1)

df.drop_duplicates().to_csv("data/rppa_matched.tsv.gz", sep="\t", index=False, compression="gzip")
print("Samples:", len(set(df["Matchable_Sample_ID"].unique())))
print("Proteins:", len(set(df["Protein"].unique())))
print("Symbols:", len(set(df["Hugo_Symbol"].unique())))

In [13]:
# Number of distinct samples, antibodies and gene symbols in the merged table
for label, column in (
    ("Samples:", "Matchable_Sample_ID"),
    ("Proteins:", "Protein"),
    ("Symbols:", "Hugo_Symbol"),
):
    print(label, len(set(df[column].unique())))

Samples: 6909
Proteins: 236
Symbols: 193


## CCLE data

Obtained from https://portals.broadinstitute.org/ccle/data and saved in ``ccle`` folder

### Cell line specific RPPA data

Combine the data with manual curation of some antibodies of interest (Included in the MD anderson dataset) and match for each antibody its HUGO.

In [8]:
import pandas as pd

# The first (unnamed) column of the matrix is used as the sample identifier
df = pd.read_csv("external/ccle/CCLE_RPPA_20180123.csv", sep=",")
df = df.rename(columns={"Unnamed: 0": "Matchable_Sample_ID"})

# Melt it: one row per sample and antibody
antibody_columns = df.columns.values[1:]
df_rppa = df.melt(
    id_vars="Matchable_Sample_ID",
    value_vars=antibody_columns,
    value_name="RPPA_VALUES",
    var_name="Protein",
)

# Annotations from Broad
df_antibody = pd.read_csv('internal/antibody_ccle.csv', sep=",")
df_antibody = df_antibody[["Protein", "Hugo_Symbol"]].drop_duplicates()

# Merge with CCLE (inner merge on the shared columns)
df_ccle = df_rppa.merge(df_antibody)
df_ccle.to_csv('data/ccle_rppa.tsv', sep="\t", index=False)

print('Cell lines: ', len(df_ccle["Matchable_Sample_ID"].unique()))
print('Antibodies: ', len(df_ccle["Protein"].unique()))

Cell lines:  899
Antibodies:  214


### RNA data

Filter to keep only information of proteins with RPPA data and compute the log2 of the RPKM

In [9]:
import numpy as np
import pandas as pd

df_rna = pd.read_csv("external/ccle/CCLE_DepMap_18q3_RNAseq_RPKM_20180718.gct", sep="\t", skiprows=2)

# Keep only the genes with RPPA data BEFORE melting, so the long table (and
# the per-row work below) covers the genes of interest rather than every
# gene x sample pair. The written output is unchanged.
hugos = pd.read_csv('data/ccle_rppa.tsv', sep="\t")["Hugo_Symbol"].unique()
df_rna = df_rna[df_rna["Description"].isin(hugos)]

df_rna = df_rna.melt(id_vars=["Name", "Description"], value_vars=df_rna.columns.values[2:], value_name="RPKM", var_name="Sample_ID")
df_rna.rename(columns={"Description": "Hugo_Symbol"}, inplace=True)
# The matchable identifier is the text before the first space of the column
# name (vectorised instead of a row-wise apply)
df_rna["Matchable_Sample_ID"] = df_rna["Sample_ID"].str.split(" ").str[0]

# Small offset so that RPKM == 0 does not produce -inf
df_rna["log2(RPKM)"] = np.log2(df_rna["RPKM"] + 0.0001)
df_rna.drop(["Name", "Sample_ID", "RPKM"], axis=1, inplace=True)
df_rna.to_csv("data/ccle_rna.tsv.gz", sep="\t", compression="gzip", index=False)

### CNA data

Filter to keep only information of proteins with RPPA data.  
Classify the data into 5 categories

In [10]:
import numpy as np
import pandas as pd

def classify(row, mean_high, mean_low):
    """Turn a raw copy-number value into one of five categories.

    row       -- mapping with "CNA_RAW" and the "High" / "Low" cutoffs
    mean_high -- cutoff used when row["High"] is missing
    mean_low  -- cutoff used when row["Low"] is missing

    Returns 2 / 1 for values above the high cutoff / above 0.3, -2 / -1 for
    values below the low cutoff / below -0.3, and 0 otherwise.
    """
    # Fixed: `x is np.nan` is an identity test and is False for the NaN values
    # pandas hands over, so the mean fallback was never applied and rows
    # without cutoffs could never reach +/-2.
    high = mean_high if pd.isnull(row["High"]) else row["High"]
    low = mean_low if pd.isnull(row["Low"]) else row["Low"]
    value = row["CNA_RAW"]

    if value > 0:  # amp
        if value > high:
            return 2
        if value > 0.3:
            return 1
        return 0
    # deletion side (also reached by 0 and NaN, which end up as 0)
    if value < low:
        return -2
    if value < -0.3:
        return -1
    return 0

df_cna = pd.read_csv("external/ccle/CCLE_copynumber_byGene_2013-12-03.txt", sep="\t")
# One row per gene and sample; the first five columns describe the gene
df_cna = df_cna.melt(id_vars=["EGID", "SYMBOL", "CHR", "CHRLOC", "CHRLOCEND"], value_vars=df_cna.columns.values[5:], value_name="CNA", var_name="Matchable_Sample_ID")
df_cna.drop(["EGID", "CHRLOC", "CHR", "CHRLOCEND"], axis=1, inplace=True)
df_cna.rename({"SYMBOL": "Hugo_Symbol", "CNA": "CNA_RAW"}, axis=1, inplace=True)

# Keep only the genes with RPPA data
hugos = pd.read_csv('data/ccle_rppa.tsv', sep="\t")["Hugo_Symbol"].unique()
df_cna = df_cna[df_cna["Hugo_Symbol"].isin(hugos)]

# Load thresholds to set up a ABSOLUTE CNA
# (left merge on the columns shared by both tables; rows without thresholds
# get NaN in "High" / "Low")
df_thresholds = pd.read_csv("internal/CCLE_copynumber_2012-04-05.seg.sample_cutoffs.txt", sep="\t", skiprows=1)
df_cna = pd.merge(df_cna, df_thresholds, how="left")

# Mean cutoffs, handed to classify() as fallback values
mean_high = df_cna["High"].mean()
mean_low = df_cna["Low"].mean()
df_cna["CNA"] = df_cna.apply(lambda row: classify(row, mean_high, mean_low), axis=1)
df_cna.drop(["CNA_RAW", "High", "Low"], axis=1, inplace=True)
# Removed: a row-wise apply that re-assigned "Matchable_Sample_ID" to itself
# (a no-op costing one Python call per row).
df_cna.to_csv('data/ccle_cna.tsv.gz', sep="\t", compression="gzip", index=False)

### Mutational data

All alterations in the same gene are combined   
Phenotype is selected from this order of priority  
CCDS  is included from the transcript ID

The CCDS file is downloaded from http://grch37.ensembl.org/biomart/martview/08bfd74632dad1fba47faf5ad3a5fe22 in TSV format. The downloaded file is named as ``ccds_ensembl.tsv`` and the columns renamed to: ``Gene``, ``Transcript``and ``CCDS``.

In [11]:
import re
import pandas as pd

# Variant classifications ordered from highest to lowest priority.
# Fixed: "Translation_Start_Site" was misspelled "Translation_Start_Syte" and
# therefore never matched.
TCGA_PRIORITIES = ["Splice_Site", "Nonsense_Mutation", "Frame_Shift_Del", "Frame_Shift_Ins", "Nonstop_Mutation", "Translation_Start_Site", "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Intron", "Silent", "5'UTR", "3'UTR", "IGR", "5'Flank", "3'Flank"]
# Patterns extracting the position part (group 1) of a protein change.
# Raw strings: "\*" is a regex escape, not a Python string escape.
DNP_REGEX = re.compile(r"([0-9_]+)[A-Z]+>[A-Z]+")  # 66_66G>GG
DEL_REGEX = re.compile(r"[A-Z\*]*([0-9]+)del")
# The redundant outer "+" of "([0-9_]+)+" was dropped: it matched the same
# text with the same group but allowed needless backtracking.
INS_REGEX = re.compile(r"([0-9_]+)ins[A-Z\*]+")
OTHER_REGEX = re.compile(r"([0-9_]+)[A-Z\*]+>[A-Z\*]+")  # 66_66G>GG

def concat(grp):
    """Concatenate the values of a group, separated by commas.

    Each element goes through ``str``. The dead ``l = list(grp)`` assignment
    was removed, so the input is iterated a single time (one-shot iterators
    are no longer emptied before the join).
    """
    return ",".join(str(e) for e in grp)

def select_phenotype(row):
    """Return the first entry of TCGA_PRIORITIES present in the row.

    ``row["Variant_Classification"]`` is split on ","; "-" is returned when
    none of its pieces appears in TCGA_PRIORITIES.
    """
    values = row["Variant_Classification"].split(",")
    return next((priority for priority in TCGA_PRIORITIES if priority in values), "-")

def get_protein_mutation(row):
    """Return the part of ``row["Protein_Change"]`` after its first ".".

    "." is returned when the value is not a string (e.g. NaN for a missing
    annotation) or contains no ".". The bare ``except:`` was narrowed to
    these two cases so that unrelated errors, such as a missing
    "Protein_Change" column, are no longer hidden.
    """
    try:
        return row["Protein_Change"].split(".")[1]
    except (AttributeError, IndexError):
        # AttributeError: non-string value; IndexError: no "." separator
        return "."

def get_protein_position(row):
    """Extract the position part of ``row["protein_mutation"]``.

    - Missense / silent SNP: the string without its first and last character.
    - Missense / silent DNP: group 1 of DNP_REGEX, "_" replaced by "-".
    - In-frame deletion / insertion: group 1 of the first matching pattern
      among DEL_REGEX, INS_REGEX and OTHER_REGEX ("_" is replaced by "-" for
      the last two only).

    Returns "." in every other case.
    """
    classification = row["Variant_Classification"]

    if classification in ("Missense_Mutation", "Silent"):
        variant_type = row["Variant_Type"]
        if variant_type == "SNP":
            return row["protein_mutation"][1:-1]
        if variant_type == "DNP":
            found = DNP_REGEX.search(row["protein_mutation"])
            return found.group(1).replace("_", "-") if found else "."
        return "."

    if classification in ("In_Frame_Del", "In_Frame_Ins"):
        mutation = row["protein_mutation"]
        found = DEL_REGEX.search(mutation)
        if found:
            return found.group(1)
        for pattern in (INS_REGEX, OTHER_REGEX):
            found = pattern.search(mutation)
            if found:
                return found.group(1).replace("_", "-")
        return "."

    return "."


df_mutations = pd.read_csv("external/ccle/CCLE_DepMap_18q3_maf_20180718.txt", sep="\t", low_memory=False)
df_mutations["Matchable_Sample_ID"] = df_mutations["Tumor_Sample_Barcode"]
df_mutations = df_mutations[["Hugo_Symbol", "Chromosome", "Start_position", "End_position", "Strand", "Variant_Classification", "Reference_Allele", "Tumor_Seq_Allele1", "Matchable_Sample_ID", "Protein_Change", "Codon_Change", "Annotation_Transcript", "Variant_Type"]].drop_duplicates()

# Add CCDS
# The transcript identifier is the annotation transcript without its ".<suffix>"
df_mutations["Transcript"] = df_mutations.apply(lambda row: row["Annotation_Transcript"].split(".")[0], axis=1)
df_cds = pd.read_csv('external/ccds_ensembl.tsv', sep="\t")[["CCDS", "Transcript"]].drop_duplicates()
df_m = pd.merge(df_mutations, df_cds)
df_mutations_cds = df_m[~pd.isnull(df_m["CCDS"])].copy()
# CCDS_BASIC is the key shared with the uniprot table below, where it is the
# CCDS without its ".<suffix>"
df_mutations_cds.rename(columns={"CCDS": "CCDS_BASIC"}, inplace=True)
df_uniprot = pd.read_csv('data/uniprot_isoforms.tsv', sep="\t")
df_uniprot["CCDS_BASIC"] = df_uniprot.apply(lambda row: str(row["CCDS"]).split(".")[0], axis=1)
df_mutations_cds = pd.merge(df_uniprot[["Hugo_Symbol", "CCDS", "CCDS_BASIC", "Entry_Isoform", "Entry"]].drop_duplicates(), df_mutations_cds)
df_mutations_cds["protein_mutation"] = df_mutations_cds.apply(get_protein_mutation, axis=1)
df_mutations_cds["Protein_position"] = df_mutations_cds.apply(get_protein_position, axis=1)

# All alterations of the same gene / sample / CCDS are combined into one row
mutations_sample = df_mutations_cds[["Matchable_Sample_ID", "Hugo_Symbol", "Chromosome", "Start_position", "End_position", "Strand", "Reference_Allele", "Tumor_Seq_Allele1", "Protein_position", "protein_mutation", "Variant_Classification", "CCDS", "Variant_Type"]].drop_duplicates()
mutations_sample_u = mutations_sample.groupby(["Hugo_Symbol", "Matchable_Sample_ID", "CCDS"], as_index=False).agg({"Chromosome": concat, "Start_position": concat, "End_position": concat, "Strand": concat, "Variant_Classification": concat, "Reference_Allele": concat, "Tumor_Seq_Allele1": concat, "Protein_position": concat, "protein_mutation": concat, "Variant_Type": concat})
mutations_sample_u["Phenotype"] = mutations_sample_u.apply(select_phenotype, axis=1)
mutations_sample_u = mutations_sample_u.drop_duplicates()
# Removed: a row-wise apply that re-assigned "Matchable_Sample_ID" to itself
# (a no-op) and a second drop_duplicates() on the already deduplicated frame.
mutations_sample_u.to_csv('data/ccle_muts.tsv.gz', sep="\t", compression="gzip", index=False)

## Mass spectrometry data

Mass-spectrometry data from human high-grade serous ovarian cancer and breast cancer TCGA cohorts

Download the data in ``external/tcga_ms`` using https://github.com/compgenome365/TCGA-Assembler-2 and executing with R:
```R
DownloadCPTACData(cancerType = "BRCA",assayPlatform = "proteome_iTRAQ", saveFolderName = 'external')
DownloadCPTACData(cancerType = "OV",assayPlatform = "proteome_iTRAQ", saveFolderName = 'external')
```

Data was downloaded on 20/10/2018

- Filter for only iTRAQ datasets i.e. BRCA, OV (discard non itraq data)
- Use the "Unshared" measurements

In [5]:
import glob
import re
import pandas as pd

list_dfs = []
# Compiled once; raw string because "\." is a regex escape, not a Python
# string escape. The group captures the cohort prefix of the file name.
ms_file_regex = re.compile(r'external/tcga_ms/([A-Z]+)__.*\.txt')
for filename in glob.iglob('external/tcga_ms/*.txt'):
    m = ms_file_regex.search(filename)
    if m:
        cancer_type = m.group(1)
        df = pd.read_csv(filename, sep="\t")
        df["Cancer_Type"] = cancer_type
        # The first five columns identify the protein; the remaining ones
        # (except the Cancer_Type column just appended) are melted
        df_melted = df.melt(id_vars=list(df.columns.values[0:5])+["Cancer_Type"], value_vars=df.columns.values[5:-1], var_name="SAMPLE_INFO", value_name="log_ratio(iTRAQ)")
        df_melted = df_melted[df_melted["SAMPLE_INFO"].str.contains("Unshared-Log-Ratio")].copy()  # select unshared spectral
        if df_melted.shape[0] > 0:
            # First 12 characters of the column name (vectorised slice
            # instead of a row-wise apply)
            df_melted["Matchable_Sample_ID"] = df_melted["SAMPLE_INFO"].str[0:12]
            df_melted["Hugo_Symbol"] = df_melted["Gene"]
            df_melted = df_melted[["Hugo_Symbol", "Cancer_Type", "Matchable_Sample_ID", "log_ratio(iTRAQ)"]].drop_duplicates()
            list_dfs.append(df_melted)

df = pd.concat(list_dfs)
df.drop_duplicates().to_csv('data/MS-data-parsed.tsv.gz', sep="\t", index=False, compression="gzip")

print('Proteins ', len(df["Hugo_Symbol"].unique()))

Proteins  11064


### RNA data

The input data is the same as for TCGA RNA data

In [7]:
import numpy as np
import pandas as pd


# Samples and cancer types covered by the mass spectrometry data
df_ms = pd.read_csv("data/MS-data-parsed.tsv.gz", sep="\t", compression="gzip")
samples = df_ms["Matchable_Sample_ID"].unique()
ttypes = df_ms["Cancer_Type"].unique()

# Aliquots on the IlluminaHiSeq_RNASeqV2 platform, not flagged "Do_not_use"
# and with a cancer type
df_info_samples = pd.read_csv("external/tcga_rna/merged_sample_quality_annotations.tsv", sep="\t")
on_platform = df_info_samples["platform"] == "IlluminaHiSeq_RNASeqV2"
not_flagged = ~df_info_samples["Do_not_use"]
has_cancer_type = ~pd.isnull(df_info_samples["cancer type"])
samples_filtered = df_info_samples[on_platform & not_flagged & has_cancer_type][["aliquot_barcode", "cancer type"]].drop_duplicates()

df_rna = pd.read_csv("external/tcga_rna/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv", sep="\t")
# The symbol is the part of gene_id before the first "|"; "?" means unknown
df_rna["Hugo_Symbol"] = df_rna["gene_id"].map(lambda gene_id: gene_id.split("|")[0])
df_rna = df_rna[df_rna["Hugo_Symbol"] != "?"]
df_rna = df_rna.drop("gene_id", axis=1)

# Long format, restricted to (and completed with) the selected aliquots
df_rna = df_rna.melt(id_vars="Hugo_Symbol", value_name="RSEM", var_name="aliquot_barcode")
df_rna = df_rna.merge(samples_filtered, how="right")
df_rna = df_rna.rename(columns={"aliquot_barcode": "Tumor_Sample_Barcode", "cancer type": "Cancer_Type"})
df_rna["Matchable_Sample_ID"] = df_rna["Tumor_Sample_Barcode"].str[0:12]
in_ms_data = df_rna["Cancer_Type"].isin(ttypes) & df_rna["Matchable_Sample_ID"].isin(samples)
df_rna = df_rna[in_ms_data]
df_rna = df_rna.drop(columns=["Tumor_Sample_Barcode"])
# Average the aliquots sharing a Matchable_Sample_ID, ignoring NaN
df_rna = df_rna.groupby(["Hugo_Symbol", "Matchable_Sample_ID", "Cancer_Type"], as_index=False).agg({"RSEM": np.nanmean})
df_rna["log2(RSEM)"] = np.log2(df_rna["RSEM"] + 0.0001)
df_rna.to_csv('data/tcga_rna_ms.tsv.gz', sep="\t", compression="gzip", index=False)
print('Samples: ', len(df_rna["Matchable_Sample_ID"].unique()))

Samples:  201


### CNA data

The input data is the same as for TCGA CNA data

In [1]:
import os
import tarfile
import pandas as pd

# Samples and cancer types present in the parsed MS data; the CNA table is
# restricted to them below
df_ms = pd.read_csv('data/MS-data-parsed.tsv.gz', sep="\t", compression="gzip")
samples = df_ms["Matchable_Sample_ID"].unique()
ttypes = df_ms["Cancer_Type"].unique()

# Read CNA data: one .tar.gz archive per cancer type
list_values = []
for file in os.listdir('external/tcga_cna/'):
    # The cancer type is the part of the archive name before the first "-",
    # with the "gdac.broadinstitute.org_" prefix removed
    cancer_type = file.split('-')[0].replace('gdac.broadinstitute.org_', '')
    if cancer_type not in ttypes:
        continue
    # The table lives in a folder named after the archive (without ".tar.gz")
    member = '{}/all_thresholded.by_genes.txt'.format(file.replace('.tar.gz', ''))
    # Context manager closes the archive handle once the table has been read
    with tarfile.open("external/tcga_cna/{}".format(file), "r:gz") as tar:
        f = tar.extractfile(member)
        # NOTE(review): error_bad_lines is deprecated since pandas 1.3 and removed
        # in 2.0; on newer pandas use on_bad_lines="skip" instead.
        df = pd.read_csv(f, sep="\t", error_bad_lines=False)
    # Wide -> long: one row per (gene, sample barcode) with its CNA value
    df_cna = pd.melt(df, id_vars=["Gene Symbol", "Locus ID", "Cytoband"], var_name="Tumor_Sample_Barcode", value_name="CNA")
    # The first 12 characters of the barcode are the identifier matched against the MS data
    df_cna["Matchable_Sample_ID"] = df_cna["Tumor_Sample_Barcode"].str[0:12]
    df_cna.drop("Tumor_Sample_Barcode", axis=1, inplace=True)
    df_cna = df_cna[df_cna["Matchable_Sample_ID"].isin(samples)]
    list_values.append(df_cna)

df_cna = pd.concat(list_values).drop_duplicates()
df_cna.rename(columns={"Gene Symbol": "Hugo_Symbol"}, inplace=True)
# NOTE(review): unlike the other exports this one also writes the DataFrame index
# as an unnamed first column; left unchanged in case downstream readers rely on it.
df_cna.to_csv('data/CNA_filtered_ms.tsv.gz', compression="gzip", sep="\t")

## Proteins involved in ubiquitination

Manually created list: internal/curated_ub_du.lst

In [None]:
%%bash

# Count the lines of the manually curated list and report the total
human_proteins=$(wc -l < internal/curated_ub_du.lst)
echo "Human proteins ${human_proteins}"

## E3 ligase substrate interactions

Download a curated list of E3 ligase-substrate interactions from http://pnet.kaist.ac.kr/e3net/  
Add manual annotations

In [1]:
import pandas as pd

# UniProt entry -> Hugo symbol lookup table
df_uniprot = pd.read_csv('data/uniprot_isoforms.tsv', sep="\t")[["Hugo_Symbol","Entry"]].drop_duplicates()
df_interactions = pd.read_excel('external/E3Net_E3-SubRelation.xlsx')

# Filter human interactions. na=False treats a missing SUB_ID as "not human"
# instead of producing a NaN in the mask, which boolean indexing rejects.
df_interactions_human = df_interactions[df_interactions["SUB_ID"].str.contains("HUMAN", na=False)]

# Add Hugo to E3: the lookup is renamed up front so the merge appends a single
# "Hugo_E3" column keyed on E3_AC (inner join: unmapped E3s are dropped)
df_interactions_human = df_interactions_human.merge(
    df_uniprot.rename(columns={"Hugo_Symbol": "Hugo_E3", "Entry": "E3_AC"}),
    on="E3_AC",
)

# Add Hugo to substrate: same approach, keyed on SUB_AC
df_interactions_human = df_interactions_human.merge(
    df_uniprot.rename(columns={"Hugo_Symbol": "Hugo_SUB", "Entry": "SUB_AC"}),
    on="SUB_AC",
)

# Add manual interactions. The values are positional: they must follow the column
# order of df_interactions_human (source columns, then Hugo_E3, then Hugo_SUB).
df_manual = pd.DataFrame([["APC_HUMAN","P25054","APC","APC","COMPLEX","APC","CTNNB1","P35222","-","-","APC","CTNNB1"]], columns=df_interactions_human.columns.values)
df_final = pd.concat([df_interactions_human,df_manual])

df_final.to_csv('data/E3_target_annotated_interations.tsv', sep="\t", index=False)

# The number printed is the row count of the table, i.e. one per annotated interaction
print('Proteins involved', len(df_final))

Proteins involved 833


## Protein-protein interactions

Download the data from https://stringdb-static.org/download/protein.links.detailed.v10.5/9606.protein.links.detailed.v10.5.txt.gz and https://string-db.org/mapping_files/uniprot_mappings/full_uniprot_2_string.04_2015.tsv.gz      
Map Hugo symbols to UniProt  
Filter out interactions without a protein involved in ubiquitination  
Filter out pairs with a combined score of 300 or below

In [1]:
import pandas as pd

df_annotated_interactions = pd.read_csv('external/9606.protein.links.detailed.v10.5.txt.gz', sep=" ", header=0, compression="gzip")
# Keep only pairs whose combined_score is strictly above 300
df_annotated_interactions = df_annotated_interactions[df_annotated_interactions["combined_score"] > 300]

# Create a mapping of string to uniprot
df_uniprot = pd.read_csv('data/uniprot_isoforms.tsv', sep="\t")[["Hugo_Symbol","Entry"]].drop_duplicates()
df_ids = pd.read_csv('external/full_uniprot_2_string.04_2015.tsv.gz', sep="\t", header=None, names=["SPECIE","UNIPROT_NAME","STRING_ID","IDENTITY","SCORE"])
# Keep only human; .copy() so the column assignments below act on an independent frame
df_ids = df_ids[df_ids['SPECIE'] == 9606].copy()
# UNIPROT_NAME is "|"-delimited; its first field is used as the UniProt entry
df_ids["Entry"] = df_ids["UNIPROT_NAME"].str.split("|").str[0]
# Prefix with the taxon id so the ids match protein1/protein2 in the links file
df_ids["STRING_ID"] = "9606." + df_ids["STRING_ID"]
# Right merge on "Entry" keeps the row order of the STRING mapping; rows without
# a Hugo symbol are removed right after
df_ids = pd.merge(df_uniprot, df_ids, how="right")
df_ids = df_ids[["Hugo_Symbol","Entry","STRING_ID"]].drop_duplicates()
df_ids = df_ids[~pd.isnull(df_ids["Hugo_Symbol"])]

# Annotate protein 1
links = df_annotated_interactions[["protein1","protein2","experimental","combined_score"]].drop_duplicates()
df_annotated_interactions = pd.merge(df_ids, links, left_on=["STRING_ID"], right_on="protein1")
df_annotated_interactions.rename(columns={"Hugo_Symbol":"Hugo1", "Entry":"Entry1"}, inplace=True)
df_annotated_interactions.drop(["STRING_ID","protein1"], axis=1, inplace=True)
df_annotated_interactions.drop_duplicates(inplace=True)

# Annotate protein 2 (its Hugo/Entry columns are only renamed after this merge,
# so they cannot collide with Hugo1/Entry1)
df_annotated_interactions = pd.merge(df_ids, df_annotated_interactions, left_on=["STRING_ID"], right_on="protein2")
df_annotated_interactions = df_annotated_interactions.drop(["STRING_ID","protein2"], axis=1)
df_annotated_interactions.rename(columns={"Hugo_Symbol":"Hugo2","Entry":"Entry2"}, inplace=True)

df_annotated_interactions.to_csv('data/ppi_all_targets.tsv.gz', sep="\t", index=False,compression="gzip")

print('Interactions: ', len(df_annotated_interactions))
print('Proteins: ', len(df_annotated_interactions["Hugo1"].unique()))


# Names from the manually curated list, one per line
with open('internal/curated_ub_du.lst') as f:
    human_ubiquitins_names = [line.strip() for line in f]

# Number of distinct Hugo1 symbols that appear in the curated list
print('E3 ligases: ', len(df_annotated_interactions[df_annotated_interactions["Hugo1"].isin(human_ubiquitins_names)]["Hugo1"].unique()))

  interactivity=interactivity, compiler=compiler, result=result)


Interactions:  2568513
Proteins:  18403
E3 ligases:  566


## PTMs

Download the datasets (phosphorylation sites and ubiquitination sites) from https://www.phosphosite.org/ 
(Version Thu Oct 04 11:40:34 EDT 2018)

For each dataset, only human data is kept and the position of the modification is added as an integer column

In [2]:
import pandas as pd

### Phosphorylation

In [3]:
# Load the phosphorylation site table; the first 3 lines of the file are skipped
df_phospho = pd.read_csv("external/Phosphorylation_site_dataset.gz", sep="\t", skiprows=3)
# Keep human rows; .copy() so the new column is added to an independent frame
# rather than to a slice of the full table (avoids SettingWithCopyWarning)
df_phospho = df_phospho[df_phospho["ORGANISM"] == "human"].copy()
# MOD_RSD is parsed as "<one character><integer>-<suffix>": take the text before
# the first "-", drop its first character and convert the rest to int
df_phospho["Position"] = df_phospho["MOD_RSD"].str.split("-").str[0].str[1:].astype(int)
df_phospho.to_csv("data/phosphorylation_sites_human.tsv.gz", sep="\t", compression="gzip", index=False)

### Ubiquitination

In [4]:
# Load the ubiquitination site table; the first 3 lines of the file are skipped
df_ub = pd.read_csv("external/Ubiquitination_site_dataset.gz", sep="\t", skiprows=3)
# Keep human rows; .copy() so the new column is added to an independent frame
# rather than to a slice of the full table (avoids SettingWithCopyWarning)
df_ub = df_ub[df_ub["ORGANISM"] == "human"].copy()
# MOD_RSD is parsed as "<one character><integer>-<suffix>": take the text before
# the first "-", drop its first character and convert the rest to int
df_ub["Position"] = df_ub["MOD_RSD"].str.split("-").str[0].str[1:].astype(int)
df_ub.to_csv("data/ubiquitination_sites_human.tsv.gz", sep="\t", compression="gzip", index=False)

  interactivity=interactivity, compiler=compiler, result=result)
