In [38]:
import pandas as pd
import numpy as np

## Humsavar Table

In [39]:
# Load the cleaned humsavar table and replace the spaces in its column labels
# with underscores so the columns can be used with attribute access.
hum = pd.read_csv("../data/interim/humsavar_clean_201711.csv.gz", sep=",").rename(
    columns=lambda label: label.replace(" ", "_")
)

In [40]:
# Mapping compiled by Santi: three-letter amino-acid code -> one-letter code
# (21 entries; "Sec" -> "U" is the last one).
AMINO_CODE = {
    "Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
    "Gln": "Q", "Glu": "E", "Gly": "G", "His": "H", "Ile": "I",
    "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
    "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V",
    "Sec": "U",
}

In [41]:
# Show the column labels after the space-to-underscore renaming.
hum.columns

Index(['Main_gene_name', 'Swiss_Prot_AC', 'FTId', 'AA_Change',
       'Type_of_variant', 'dbSNP', 'Disease_Name'],
      dtype='object')

In [42]:
# Split each AA_Change entry (after dropping its first two characters) around the
# run of digits: C1 = text before the position, C2 = position, C3 = text after it.
# NOTE(review): the two-character prefix is presumably "p." (HGVS style) — confirm
# against the raw file.
# expand=True builds the frame directly from the split: it keeps hum's index, so the
# column-wise concat with hum stays row-aligned, and a missing AA_Change yields a
# NaN row instead of failing during a Python-list conversion.
df = hum["AA_Change"].str[2:].str.split(r"([0-9]+)", expand=True)
df.columns = ["C1", "C2", "C3"]

# Translate three-letter residue codes to one-letter codes; codes that are not in
# AMINO_CODE become NaN.
df["C1"] = df["C1"].map(AMINO_CODE)
df["C3"] = df["C3"].map(AMINO_CODE)

# Variant key of the form "<accession>-<position>-<wild type>-<mutant>".
mutant = (hum["Swiss_Prot_AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]).rename("MUTANT")

In [43]:
# Place the variant key and the parsed pieces next to the humsavar columns, keep
# the selected fields in the listed order, and give the parsed pieces readable names.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it positionally
# in pd.concat.
hum_final = pd.concat([hum, mutant, df], axis=1)[
    ["MUTANT", "Swiss_Prot_AC", "C2", "C1", "C3", "dbSNP", "Type_of_variant"]
].rename(columns={"C1": "WildType", "C3": "Mut", "Swiss_Prot_AC": "UID", "C2": "Pos"})

In [44]:
# Preview the first rows of the assembled variant table.
hum_final.head()

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,Type_of_variant
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism
3,Q9NQ94-558-A-S,Q9NQ94,558,A,S,rs11817448,Polymorphism
4,A8K2U0-207-G-R,A8K2U0,207,G,R,rs11047499,Polymorphism


In [45]:
# Table size before rows with a repeated MUTANT key are removed.
hum_final.shape

(76730, 7)

In [46]:
# Keep the first row for every MUTANT key and shorten the label column name to TYPE.
# De-duplication on dbSNP is not applied here.
hum_final = hum_final.drop_duplicates(subset="MUTANT").rename(
    columns={"Type_of_variant": "TYPE"}
)

In [47]:
# Check the first rows after de-duplication and the TYPE rename.
hum_final.head(3)

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,TYPE
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism


In [11]:
# Rows whose dbSNP identifier is present and occurs more than once in the table.
hum_final.loc[hum_final["dbSNP"].notna() & hum_final["dbSNP"].duplicated(keep=False)]

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,TYPE
97,Q86UK0-459-S-T,Q86UK0,459,S,T,rs7560008,Polymorphism
104,Q86UK0-777-S-T,Q86UK0,777,S,T,rs7560008,Polymorphism
145,O95477-590-W-S,O95477,590,W,S,rs137854496,Disease
191,O95477-590-W-L,O95477,590,W,L,rs137854496,Disease
231,P78363-96-N-D,P78363,96,N,D,rs61748529,Disease
232,P78363-96-N-H,P78363,96,N,H,rs61748529,Disease
249,P78363-572-R-P,P78363,572,R,P,rs61748559,Disease
250,P78363-572-R-Q,P78363,572,R,Q,rs61748559,Disease
332,P78363-2077-R-W,P78363,2077,R,W,rs61750645,Disease
353,P78363-68-P-L,P78363,68,P,L,rs62654397,Disease


In [12]:
# Persist the variant table as gzip-compressed CSV; the row index is not written.
hum_final.to_csv("../data/interim/humsavar_full.csv.gz", index=False, compression="gzip")
# Alternative output name, currently disabled:
# hum_final.to_csv("../data/interim/humsavar_VEST.csv.gz", index=False, compression="gzip")

In [48]:
# Reload the table from disk; read_csv re-infers the column dtypes and assigns a
# fresh default row index.
hum_final = pd.read_csv("../data/interim/humsavar_full.csv.gz")

In [49]:
# Table size after reloading from disk.
hum_final.shape

(75769, 7)

## SNVBox Features 

### Genomic MSA (Genomic)

In [15]:
# Load the SNVBox genomic-MSA features, drop exact duplicate rows, and average the
# remaining rows so that every (UID, Pos) pair appears exactly once.
# numeric_only=True restricts the aggregation to numeric columns on every pandas
# version; without it pandas >= 2.0 raises if the file has a non-numeric column.
genomic_msa = (
    pd.read_csv("../data/external/snvbox_features/genomic_msa_features.csv")
    .drop_duplicates()
    .groupby(["UID", "Pos"])
    .mean(numeric_only=True)
    .reset_index()
)

In [16]:
# Left-join the per-position genomic-MSA features onto the variants by (UID, Pos).
# NOTE(review): this cell carries an older execution count than its neighbours, and
# the column listing printed later in the notebook appears to contain no columns
# from this merge — confirm whether it is still meant to run.
hum_final = hum_final.merge(genomic_msa, on=['UID', 'Pos'], how='left')

### Exon Features (Genomic)

In [50]:
# Read the tab-separated SNVBox exon features and expose the rsid column under the
# name dbSNP, the identifier column name used by the variant table.
exon_features = pd.read_csv(
    "../data/external/snvbox_features/exon_features.csv", delimiter="\t"
).rename(columns={"rsid": "dbSNP"})

In [51]:
# Preview the exon feature table.
exon_features.head()

Unnamed: 0,dbSNP,Cons,snp_den,hapmap_snp_den
0,rs1800433,0.709708,0.045198,0.00565
1,rs669,0.709708,0.045198,0.00565
2,rs3180392,0.648678,0.034934,0.0
3,rs1800434,0.541717,0.017857,0.0
4,rs226405,0.635459,0.018519,0.0


## Phylogenetic Features (Genomic)

In [52]:
# Load the phyloP and phastCons (46-way) score tables, keeping the first row per
# dbSNP id so that the left joins below cannot multiply variant rows.
phyloP46way = pd.read_csv("../data/interim/phyloP46way.csv").drop_duplicates(subset="dbSNP")
phastCons46way = pd.read_csv("../data/interim/phastCons46way.csv").drop_duplicates(subset="dbSNP")

# Attach each score table to the variants by dbSNP id: phyloP first, then phastCons.
for score_table in (phyloP46way, phastCons46way):
    hum_final = hum_final.merge(score_table, how="left", on="dbSNP")

## Genomic Functional Classes

In [53]:
# Columns taken from the dbSNP functional-class table: the SNP identifier ("name")
# followed by one column per functional class.
variables = [
    "name",
    "intron", "missense", "near-gene", "ncRNA", "coding-synon", "untranslated",
    "nonsense", "splice", "frameshift", "cds-indel", "stop-loss",
]

# Keep those columns, rename the identifier to dbSNP, keep the first row per id,
# and left-join the result onto the variants.
dummy = (
    pd.read_csv("../data/interim/dbsnp150_humsavar_funcDummie.tsv", sep="\t")
    .loc[:, variables]
    .rename(columns={"name": "dbSNP"})
    .drop_duplicates(subset="dbSNP")
)
hum_final = hum_final.merge(dummy, how="left", on="dbSNP")

## Create Train Test Sets

In [54]:
# Upper-case every column label so the merged table uses one naming convention.
hum_final = hum_final.rename(columns=str.upper)

In [55]:
# Column labels after upper-casing.
hum_final.columns

Index(['MUTANT', 'UID', 'POS', 'WILDTYPE', 'MUT', 'DBSNP', 'TYPE',
       'PHYLOP46WAY', 'PHASTCONS46WAY', 'INTRON', 'MISSENSE', 'NEAR-GENE',
       'NCRNA', 'CODING-SYNON', 'UNTRANSLATED', 'NONSENSE', 'SPLICE',
       'FRAMESHIFT', 'CDS-INDEL', 'STOP-LOSS'],
      dtype='object')

In [56]:
# Number of variants per TYPE label.
hum_final.TYPE.value_counts()

Polymorphism    39653
Disease         28855
Unclassified     7261
Name: TYPE, dtype: int64

In [57]:
# Turn +/- infinity into NaN, then write the merged table as gzip-compressed CSV
# without the row index.
hum_final = hum_final.replace([np.inf, -np.inf], np.nan)
hum_final.to_csv(
    "../data/processed/genomic/humsavar_full_gt.csv.gz",
    index=False,
    compression="gzip",
)

In [58]:
# Number of columns in the merged table.
hum_final.columns.shape

(20,)

In [59]:
# Reload the merged table from the gzip-compressed CSV on disk.
hum_final = pd.read_csv("../data/processed/genomic/humsavar_full_gt.csv.gz")

In [60]:
# Table size after reloading the merged table.
hum_final.shape

(75769, 20)

In [61]:
# Missing-value count per column.
hum_final.isnull().sum()

MUTANT                0
UID                   0
POS                   0
WILDTYPE              0
MUT                   0
DBSNP             17590
TYPE                  0
PHYLOP46WAY       18603
PHASTCONS46WAY    18603
INTRON            18136
MISSENSE          18136
NEAR-GENE         18136
NCRNA             18136
CODING-SYNON      18136
UNTRANSLATED      18136
NONSENSE          18136
SPLICE            18136
FRAMESHIFT        18136
CDS-INDEL         18136
STOP-LOSS         18136
dtype: int64

In [62]:
# Remove the identifier/helper columns, then key the remaining columns (TYPE plus
# the features) by the MUTANT variant key.
hum_final = hum_final.drop(columns=["UID", "POS", "DBSNP", "WILDTYPE", "MUT"])
dataset = hum_final.set_index("MUTANT")

# MUTANT keys of the rows carrying each TYPE label.
unclassified_index = dataset.index[(dataset["TYPE"] == "Unclassified").values]
dataset_disease_index = dataset.index[(dataset["TYPE"] == "Disease").values]
dataset_poly_index = dataset.index[(dataset["TYPE"] == "Polymorphism").values]

In [63]:
# Save the MUTANT-indexed dataset as gzip-compressed CSV, writing the index as well.
dataset.to_csv("../data/processed/genomic/dataset.csv.gz", index=True, compression="gzip")

In [64]:
# Summary statistics of the numeric feature columns.
hum_final.describe()

Unnamed: 0,PHYLOP46WAY,PHASTCONS46WAY,INTRON,MISSENSE,NEAR-GENE,NCRNA,CODING-SYNON,UNTRANSLATED,NONSENSE,SPLICE,FRAMESHIFT,CDS-INDEL,STOP-LOSS
count,57166.0,57166.0,57633.0,57633.0,57633.0,57633.0,57633.0,57633.0,57633.0,57633.0,57633.0,57633.0,57633.0
mean,2.197496,0.678389,0.136571,0.991099,0.060521,0.179029,0.020041,0.05417,0.005587,0.000486,5.2e-05,6.9e-05,5.2e-05
std,2.288608,0.435642,0.343397,0.093926,0.238451,0.38338,0.14014,0.226356,0.074538,0.022036,0.007215,0.008331,0.007215
min,-8.217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.316,0.07,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.888,0.996,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,4.273,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,6.424,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [65]:
# Table size after the identifier columns were dropped.
hum_final.shape

(75769, 15)

In [66]:
# Class balance: number of variants per TYPE label.
hum_final.TYPE.value_counts()

Polymorphism    39653
Disease         28855
Unclassified     7261
Name: TYPE, dtype: int64

### Multiple Train/Test Splits

In [None]:
# Write 20 train/test splits to CSV. For split i:
#   * train = 25,000 Disease + 25,000 Polymorphism variants, sampled with seed i;
#   * test  = variants of each class that are not in that training set. With the
#     class counts printed above (28,855 Disease / 39,653 Polymorphism) the n values
#     below equal the size of the remainder, so the whole remainder is used.
# The final row shuffles are seeded with i as well, so re-running the cell
# reproduces the written files exactly.
# NOTE(review): the output directory ("processed/3") differs from the
# "processed/genomic" directory used elsewhere in this notebook — confirm it is the
# intended target.
for i in range(20):
    train = pd.concat([
        dataset.loc[dataset_disease_index].sample(n=25000, random_state=i),
        dataset.loc[dataset_poly_index].sample(n=25000, random_state=i)
    ]).sample(frac=1, random_state=i)

    test = pd.concat([
        dataset.loc[dataset_disease_index.difference(train.index)].sample(n=3855, random_state=i),
        dataset.loc[dataset_poly_index.difference(train.index)].sample(n=14653, random_state=i)
    ]).sample(frac=1, random_state=i)
    train.to_csv("../data/processed/3/train_test_sets/train_{}.csv".format(i), index=True, index_label="MUTANT")
    test.to_csv("../data/processed/3/train_test_sets/test_{}.csv".format(i), index=True, index_label="MUTANT")