In [1]:
import pandas as pd
import numpy as np

## Humsavar Table

In [None]:
# Read the cleaned humsavar variant table and replace spaces in the column
# names with underscores; later cells access columns such as AA_Change and
# Swiss_Prot_AC by those names.
hum = pd.read_csv("../data/interim/humsavar_clean_201711.csv.gz", sep=",")
hum.columns = [column_name.replace(" ", "_") for column_name in hum.columns]

In [None]:
# Mapping compiled by Santi: three-letter amino-acid code -> one-letter code.
# Covers the 20 standard residues plus selenocysteine ("Sec" -> "U").
AMINO_CODE = {
    "Ala": "A",
    "Arg": "R",
    "Asn": "N",
    "Asp": "D",
    "Cys": "C",
    "Gln": "Q",
    "Glu": "E",
    "Gly": "G",
    "His": "H",
    "Ile": "I",
    "Leu": "L",
    "Lys": "K",
    "Met": "M",
    "Phe": "F",
    "Pro": "P",
    "Ser": "S",
    "Thr": "T",
    "Trp": "W",
    "Tyr": "Y",
    "Val": "V",
    "Sec": "U",
}

In [None]:
hum.columns

In [None]:
# Break every AA_Change entry into three pieces. The first two characters are
# skipped and the remainder is split around the run of digits; the capture
# group in the pattern keeps the digits themselves as the middle piece.
aa_pieces = hum["AA_Change"].str[2:].str.split(r"([0-9]+)").tolist()
df = pd.DataFrame(data=aa_pieces, columns=["C1", "C2", "C3"])

# Translate the residue columns (C1 and C3) through AMINO_CODE; codes that
# are not keys of the mapping become NaN.
for residue_col in ("C1", "C3"):
    df[residue_col] = df[residue_col].map(AMINO_CODE)

# Variant key: <Swiss_Prot_AC>-<C2>-<C1>-<C3>.
# (A disabled variant of this key left out the accession: C1 + C2 + C3.)
mutant_ids = hum["Swiss_Prot_AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]
mutant = pd.Series(data=mutant_ids.tolist(), name="MUTANT")

In [None]:
# Put the original table, the MUTANT key and the parsed residue/position
# columns side by side, keep only the columns used downstream, and give the
# parsed columns descriptive names.
# `axis` is passed by keyword: pandas 2.x accepts only `objs` positionally in
# pd.concat, so the former `pd.concat([...], 1)` raises a TypeError there.
combined = pd.concat([hum, mutant, df], axis=1)
hum_final = combined[
    ["MUTANT", "Swiss_Prot_AC", "C2", "C1", "C3", "dbSNP", "Type_of_variant"]
].rename(
    # rename() returns a new frame, which avoids mutating a freshly sliced
    # selection in place.
    columns={"C1": "WildType", "C3": "Mut", "Swiss_Prot_AC": "UID", "C2": "Pos"}
)

In [None]:
hum_final.head()

In [None]:
hum_final.shape

In [None]:
# Keep the first row for every MUTANT key and shorten the label column name
# from Type_of_variant to TYPE.
# hum_final.drop_duplicates("dbSNP", inplace=True)
hum_final = (
    hum_final
    .drop_duplicates(subset="MUTANT")
    .rename(columns={"Type_of_variant": "TYPE"})
)

In [None]:
hum_final.head(3)

In [None]:
# Display the rows whose dbSNP id is present and also appears on at least one
# other row (keep=False flags every member of a duplicated group).
has_rsid = ~hum_final.dbSNP.isnull()
shared_rsid = hum_final.dbSNP.duplicated(keep=False)
hum_final[shared_rsid & has_rsid]

In [None]:
# Checkpoint the de-duplicated variant table as a gzip-compressed CSV without
# the row index.
full_table_path = "../data/interim/humsavar_full.csv.gz"
hum_final.to_csv(full_table_path, index=False, compression="gzip")
# hum_final.to_csv("../data/interim/humsavar_VEST.csv.gz", index=False, compression="gzip")

In [2]:
# Reload the checkpoint written by the previous cell (same path), so the
# feature-merging cells below can start from the saved table.
hum_final = pd.read_csv("../data/interim/humsavar_full.csv.gz")

In [None]:
hum_final.shape

## SNVBox Features 

### AA Features (Protein)

In [3]:
# Load the SNVBox amino-acid substitution features. The preview below shows
# WildType and Mut columns followed by numeric columns such as Charge,
# Volume, Hydrophobicity and Grantham.
AA_Features = pd.read_csv("../data/external/snvbox_features/AA_Features_snvbox.csv")

In [4]:
AA_Features.head()

Unnamed: 0,WildType,Mut,Charge,Volume,Hydrophobicity,Grantham,Polarity,Ex,PAM250,BLOSUM,JM,HGMD2003,VB,Transition,COSMIC,COSMICvsSWISSPROT,HAPMAP,COSMICvsHAPMAP
0,A,C,0,-0.663333,-0.4,195,-2.6,39,0.3,0,-0.2,0,0,0.0014,-10.586231,-13.556968,-9.012255,-1.573976
1,A,D,1,-0.75,10.8,126,4.9,19,-0.3,-2,0.22,111,5,0.0028,-7.408177,-10.378914,-5.183614,-2.224563
2,A,E,1,-1.66,9.8,107,4.2,27,-0.1,-1,0.21,54,0,0.0047,-6.71503,-9.685767,-6.447306,-0.267724
3,A,F,0,-3.376667,-2.1,113,-2.9,31,-2.6,-2,-0.49,0,0,0.0028,-8.976793,-11.94753,-7.913643,-1.06315
4,A,G,0,0.95,0.6,60,0.9,39,0.6,0,1.66,34,10,0.0049,-7.641792,-10.612529,-5.004922,-2.63687


In [5]:
# Left-join the substitution features onto the variants by wild-type and
# mutant residue; variants with no matching (WildType, Mut) row keep NaN in
# the added columns.
hum_final = hum_final.merge(AA_Features, on=['WildType', 'Mut'], how='left')

### Uniprot Features (Protein)

In [6]:
# Load the SNVBox UniProt features and rename the accession column "Acc" to
# "UID" so it matches the join key used by hum_final.
Uniprot_Features = (
    pd.read_csv("../data/external/snvbox_features/uniprot_features.csv")
    .rename(columns={"Acc": "UID"})
)

In [7]:
# Left-join the UniProt features by accession (UID) and position (Pos).
# NOTE(review): Uniprot_Features is not de-duplicated on (UID, Pos) before the
# join; repeated keys there would duplicate variant rows -- confirm the file
# has one row per key.
hum_final = hum_final.merge(Uniprot_Features, on=['UID', 'Pos'], how='left')

### Genomic MSA (Genomic)

In [None]:
# Load the genomic MSA features, drop exact duplicate rows, then average the
# remaining rows per (UID, Pos) so each key appears once with UID and Pos as
# ordinary columns again.
genomic_msa = (
    pd.read_csv("../data/external/snvbox_features/genomic_msa_features.csv")
    .drop_duplicates()
    .groupby(["UID", "Pos"])
    .mean()
    .reset_index()
)

In [None]:
# Left-join the per-position genomic MSA averages by accession (UID) and
# position (Pos); variants without a match keep NaN in the added columns.
hum_final = hum_final.merge(genomic_msa, on=['UID', 'Pos'], how='left')

### Exon Features (Genomic)

In [None]:
# Load the tab-separated exon features and rename "rsid" to "dbSNP", the name
# hum_final uses for that identifier.
# NOTE(review): no cell visible in this notebook merges exon_features into
# hum_final -- confirm whether that is intended.
exon_features = (
    pd.read_csv("../data/external/snvbox_features/exon_features.csv", delimiter="\t")
    .rename(columns={"rsid": "dbSNP"})
)

In [None]:
exon_features.head()

## Protparam (Protein)

In [8]:
# Load the ProtParam-derived features from the interim data directory; the
# next cell joins them on the MUTANT key.
protparam = pd.read_csv("../data/interim/protparam_features.csv.gz")

In [9]:
# Left-join the ProtParam features by the MUTANT key; variants absent from
# protparam keep NaN in the added columns.
hum_final = hum_final.merge(protparam, on="MUTANT", how="left")

## Phylogenetic Features (Genomic)

In [None]:
# Load both 46-way conservation tables, keeping the first row per dbSNP id so
# the joins below cannot multiply variant rows.
phyloP46way = pd.read_csv("../data/interim/phyloP46way.csv").drop_duplicates("dbSNP")
phastCons46way = pd.read_csv("../data/interim/phastCons46way.csv").drop_duplicates("dbSNP")

# Left-join each table by dbSNP id, phyloP first and phastCons second.
hum_final = (
    hum_final
    .merge(phyloP46way, on="dbSNP", how="left")
    .merge(phastCons46way, on="dbSNP", how="left")
)

## Functional Classes (Genomic)

In [None]:
# Columns to take from the dbSNP functional-class table: the identifier
# ("name") followed by one column per functional class.
variables = [
    "name",
    "intron",
    "missense",
    "near-gene",
    "ncRNA",
    "coding-synon",
    "untranslated",
    "nonsense",
    "splice",
    "frameshift",
    "cds-indel",
    "stop-loss",
]

# Read the table, keep those columns, rename the identifier to dbSNP and keep
# the first row per dbSNP id.
dummy = (
    pd.read_csv("../data/interim/dbsnp150_humsavar_funcDummie.tsv", sep="\t")
    .loc[:, variables]
    .rename(columns={"name": "dbSNP"})
    .drop_duplicates(subset="dbSNP")
)

# Left-join the functional-class columns onto the variants by dbSNP id.
hum_final = hum_final.merge(dummy, on="dbSNP", how="left")

## Create Train Test Sets

In [10]:
# Upper-case every column name; the cells below refer to columns by their
# upper-case names (TYPE, UID, POS, DBSNP, ...).
hum_final.columns = hum_final.columns.str.upper()

In [11]:
hum_final.columns

Index(['MUTANT', 'UID', 'POS', 'WILDTYPE', 'MUT', 'DBSNP', 'TYPE', 'CHARGE',
       'VOLUME', 'HYDROPHOBICITY', 'GRANTHAM', 'POLARITY', 'EX', 'PAM250',
       'BLOSUM', 'JM', 'HGMD2003', 'VB', 'TRANSITION', 'COSMIC',
       'COSMICVSSWISSPROT', 'HAPMAP', 'COSMICVSHAPMAP', 'BINDING', 'ACT_SITE',
       'SITE', 'LIPID', 'METAL', 'CARBOHYD', 'DNA_BIND', 'NP_BIND', 'CA_BIND',
       'DISULFID', 'SE_CYS', 'MOD_RES', 'PROPEP', 'SIGNALP', 'TRANSMEM',
       'COMPBIAS', 'REP', 'MOTIF', 'ZN_FING', 'REGIONS', 'PPI', 'RNABD', 'TF',
       'LOC', 'MMBRBD', 'CHROM', 'POSTMODREC', 'POSTMODENZ',
       'AROMATICITY_DIFF', 'AROMATICITY_LOG_RATIO', 'ISOELECTRIC_POINT_DIFF',
       'ISOELECTRIC_POINT_LOG_RATIO', 'GRAVY_DIFF', 'GRAVY_LOG_RATIO',
       'INSTABILITY_INDEX_DIFF', 'INSTABILITY_INDEX_LOG_RATIO',
       'FLEXIBILITY_DIFF', 'FLEXIBILITY_LOG_RATIO'],
      dtype='object')

In [12]:
hum_final.TYPE.value_counts()

Polymorphism    39653
Disease         28855
Unclassified     7261
Name: TYPE, dtype: int64

In [None]:
# Turn +/- infinity into NaN, then write the full feature table as a
# gzip-compressed CSV without the row index.
hum_final = hum_final.replace([np.inf, -np.inf], np.nan)
processed_table_path = "../data/processed/3/humsavar_full_gt.csv.gz"
hum_final.to_csv(processed_table_path, index=False, compression="gzip")

In [None]:
hum_final.columns.shape

In [13]:
# Reload a previously saved full feature table.
# NOTE(review): this reads from ../data/processed/1/, while the save cell
# above writes humsavar_full_gt.csv.gz under ../data/processed/3/ -- confirm
# that loading the "1" version here is intended.
hum_final = pd.read_csv("../data/processed/1/humsavar_full_gt.csv.gz")

In [14]:
hum_final.shape

(75769, 61)

In [None]:
hum_final.isnull().sum()

In [15]:
# Remove the identifier columns that are not used as model inputs; MUTANT is
# kept because it becomes the index below.
hum_final = hum_final.drop(columns=["UID", "POS", "DBSNP", "WILDTYPE", "MUT"])

# Index the remaining table by the MUTANT key.
dataset = hum_final.set_index("MUTANT")

# MUTANT keys for each value of the TYPE label.
unclassified_index = dataset.loc[dataset.TYPE == "Unclassified"].index
dataset_disease_index = dataset.loc[dataset.TYPE == "Disease"].index
dataset_poly_index = dataset.loc[dataset.TYPE == "Polymorphism"].index

In [None]:
# Write the MUTANT-indexed dataset as a gzip-compressed CSV; index=True keeps
# the MUTANT key as the first column of the file.
dataset.to_csv("../data/processed/3/dataset.csv.gz", index=True, compression="gzip")

In [None]:
hum_final.describe()

In [None]:
hum_final.shape

In [None]:
hum_final.TYPE.value_counts()

### Single Train Test

### Multiple Train Tests

In [None]:
# Build 20 train/test splits from the Disease and Polymorphism rows of
# `dataset` (Unclassified rows are not used) and write each split to CSV.
#
# Every random step is seeded with the split number so that re-running the
# cell reproduces the same files, including row order. Previously the final
# shuffles (`.sample(frac=1)`) had no random_state, so row order changed on
# every run.
N_SPLITS = 20
# Training rows drawn per class, giving a class-balanced training set.
N_TRAIN_PER_CLASS = 25000
# Test rows drawn per class. With the class totals printed by
# TYPE.value_counts() above (28855 Disease, 39653 Polymorphism) these equal
# the rows left in each class after the training sample is removed.
N_TEST_DISEASE = 3855
N_TEST_POLY = 14653

for i in range(N_SPLITS):
    train = pd.concat([
        dataset.loc[dataset_disease_index].sample(n=N_TRAIN_PER_CLASS, random_state=i),
        dataset.loc[dataset_poly_index].sample(n=N_TRAIN_PER_CLASS, random_state=i)
    ]).sample(frac=1, random_state=i)  # seeded shuffle so classes are interleaved

    # Test rows come only from keys not already used in this split's training set.
    test = pd.concat([
        dataset.loc[dataset_disease_index.difference(train.index)].sample(n=N_TEST_DISEASE, random_state=i),
        dataset.loc[dataset_poly_index.difference(train.index)].sample(n=N_TEST_POLY, random_state=i)
    ]).sample(frac=1, random_state=i)  # seeded shuffle, as for the training set

    train.to_csv("../data/processed/3/train_test_sets/train_{}.csv".format(i), index=True, index_label="MUTANT")
    test.to_csv("../data/processed/3/train_test_sets/test_{}.csv".format(i), index=True, index_label="MUTANT")