In [2]:
import pandas as pd
import numpy as np

## Humsavar Table

In [13]:
# Load the cleaned humsavar table, then swap every space in a column name for
# an underscore so the columns can be reached with attribute access.
hum = pd.read_csv("../data/interim/humsavar_clean_201711.csv.gz", sep=",")
hum = hum.rename(columns=lambda col_name: col_name.replace(" ", "_"))

In [14]:
# Mapping made by Santi: three-letter amino-acid name -> one-letter code.
AMINO_CODE = {
    "Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
    "Gln": "Q", "Glu": "E", "Gly": "G", "His": "H", "Ile": "I",
    "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
    "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V",
    "Sec": "U",
}

In [15]:
# List the column names of the loaded humsavar table.
hum.columns

Index(['Main_gene_name', 'Swiss_Prot_AC', 'FTId', 'AA_Change',
       'Type_of_variant', 'dbSNP', 'Disease_Name'],
      dtype='object')

In [16]:
# Split every AA_Change entry into its three parts. The first two characters
# are discarded; what follows is read as <residue name><position><residue name>.
# str.extract keeps the result aligned with `hum` row by row.
df = hum["AA_Change"].str.extract(r"^.{2}([^0-9]*)([0-9]+)([^0-9]*)$", expand=True)
df.columns = ["C1", "C2", "C3"]

# Fail loudly on entries that do not follow that layout, rather than letting
# malformed values surface later as missing identifiers.
unparsed = df["C2"].isnull()
if unparsed.any():
    raise ValueError(
        "Could not parse AA_Change for {} row(s), e.g. {!r}".format(
            int(unparsed.sum()), hum.loc[unparsed, "AA_Change"].head().tolist()
        )
    )

# Translate three-letter residue names to one-letter codes; names that are
# missing from AMINO_CODE become NaN.
df["C1"] = df["C1"].map(AMINO_CODE)
df["C3"] = df["C3"].map(AMINO_CODE)

# Variant identifier: <Swiss_Prot_AC>-<position>-<wild type>-<mutant>.
mutant = (hum["Swiss_Prot_AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]).rename("MUTANT")

In [17]:
# Put the original table, the MUTANT identifier and the parsed change parts
# side by side, keep the columns used downstream, and give them their final
# names. `axis` is passed by keyword because pandas 2.x no longer accepts it
# positionally in pd.concat. The rename is chained rather than done in place
# so it never operates on a column slice of another frame.
hum_final = (
    pd.concat([hum, mutant, df], axis=1)
    [["MUTANT", "Swiss_Prot_AC", "C2", "C1", "C3", "dbSNP", "Type_of_variant"]]
    .rename(columns={"C1": "WildType", "C3": "Mut", "Swiss_Prot_AC": "UID", "C2": "Pos"})
)

In [18]:
# Preview the first rows of the assembled variant table.
hum_final.head()

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,Type_of_variant
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism
3,Q9NQ94-558-A-S,Q9NQ94,558,A,S,rs11817448,Polymorphism
4,A8K2U0-207-G-R,A8K2U0,207,G,R,rs11047499,Polymorphism


In [19]:
# Rows and columns of the variant table at this point.
hum_final.shape

(76730, 7)

In [46]:
# Keep only the first row for each MUTANT identifier and give the label column
# its final name, TYPE.
hum_final = (
    hum_final
    .drop_duplicates(subset="MUTANT")
    .rename(columns={"Type_of_variant": "TYPE"})
)

In [21]:
# Quick look at the first three rows.
hum_final.head(3)

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,TYPE
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism


In [22]:
# Show variants whose dbSNP value is present and also appears on at least one
# other row.
has_rsid = hum_final.dbSNP.notnull()
shared_rsid = hum_final.dbSNP.duplicated(keep=False)
hum_final[shared_rsid & has_rsid]

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,TYPE
97,Q86UK0-459-S-T,Q86UK0,459,S,T,rs7560008,Polymorphism
104,Q86UK0-777-S-T,Q86UK0,777,S,T,rs7560008,Polymorphism
145,O95477-590-W-S,O95477,590,W,S,rs137854496,Disease
191,O95477-590-W-L,O95477,590,W,L,rs137854496,Disease
231,P78363-96-N-D,P78363,96,N,D,rs61748529,Disease
232,P78363-96-N-H,P78363,96,N,H,rs61748529,Disease
249,P78363-572-R-P,P78363,572,R,P,rs61748559,Disease
250,P78363-572-R-Q,P78363,572,R,Q,rs61748559,Disease
332,P78363-2077-R-W,P78363,2077,R,W,rs61750645,Disease
353,P78363-68-P-L,P78363,68,P,L,rs62654397,Disease


In [23]:
# Persist the variant table as a gzip-compressed CSV without the row index.
hum_final.to_csv("../data/interim/humsavar_full.csv.gz", index=False, compression="gzip")
# Alternative output file name, currently disabled:
# hum_final.to_csv("../data/interim/humsavar_VEST.csv.gz", index=False, compression="gzip")

In [44]:
# Reload the interim table from disk; pandas infers column dtypes afresh here.
# NOTE(review): presumably this is what turns Pos into an integer column, which
# the later merges on ['UID', 'Pos'] would rely on — confirm before skipping
# this reload.
hum_final = pd.read_csv("../data/interim/humsavar_full.csv.gz")

In [47]:
# Rows and columns of the reloaded table.
hum_final.shape

(75769, 7)

## SNVBox Features 

### AA Features

In [48]:
# Load the SNVBox amino-acid feature table; it is joined onto the variants
# through its WildType and Mut columns.
AA_Features = pd.read_csv("../data/external/snvbox_features/AA_Features_snvbox.csv")

In [49]:
# Preview the amino-acid feature table.
AA_Features.head()

Unnamed: 0,WildType,Mut,Charge,Volume,Hydrophobicity,Grantham,Polarity,Ex,PAM250,BLOSUM,JM,HGMD2003,VB,Transition,COSMIC,COSMICvsSWISSPROT,HAPMAP,COSMICvsHAPMAP
0,A,C,0,-0.663333,-0.4,195,-2.6,39,0.3,0,-0.2,0,0,0.0014,-10.586231,-13.556968,-9.012255,-1.573976
1,A,D,1,-0.75,10.8,126,4.9,19,-0.3,-2,0.22,111,5,0.0028,-7.408177,-10.378914,-5.183614,-2.224563
2,A,E,1,-1.66,9.8,107,4.2,27,-0.1,-1,0.21,54,0,0.0047,-6.71503,-9.685767,-6.447306,-0.267724
3,A,F,0,-3.376667,-2.1,113,-2.9,31,-2.6,-2,-0.49,0,0,0.0028,-8.976793,-11.94753,-7.913643,-1.06315
4,A,G,0,0.95,0.6,60,0.9,39,0.6,0,1.66,34,10,0.0049,-7.641792,-10.612529,-5.004922,-2.63687


In [50]:
# Attach the amino-acid features to each variant by its (WildType, Mut) pair;
# the left join keeps every variant even when no feature row matches.
hum_final = pd.merge(
    hum_final,
    AA_Features,
    how="left",
    on=["WildType", "Mut"],
)

### Uniprot Features

In [54]:
# Load the Uniprot feature table and rename its "Acc" column to "UID" so that
# it shares a key name with the variant table.
Uniprot_Features = (
    pd.read_csv("../data/external/snvbox_features/uniprot_features.csv")
    .rename(columns={"Acc": "UID"})
)

In [59]:
# Attach the Uniprot features by (UID, Pos); variants without a match are kept.
hum_final = pd.merge(
    hum_final,
    Uniprot_Features,
    how="left",
    on=["UID", "Pos"],
)

### Genomic MSA

In [5]:
# Load the genomic MSA features, discard exact duplicate rows, then average the
# remaining rows of each (UID, Pos) pair so that every pair occurs only once.
genomic_msa_raw = pd.read_csv("../data/external/snvbox_features/genomic_msa_features.csv")
genomic_msa = (
    genomic_msa_raw
    .drop_duplicates()
    .groupby(["UID", "Pos"])
    .mean()
    .reset_index()
)

In [6]:
# Attach the genomic MSA features by (UID, Pos); variants without a match are kept.
# NOTE(review): the recorded output of this cell is a NameError because
# `hum_final` did not exist when it was last run. Re-run it after the cells
# that build `hum_final`, and check whether these features actually reached
# the saved dataset.
hum_final = hum_final.merge(genomic_msa, on=['UID', 'Pos'], how='left')

NameError: name 'hum_final' is not defined

## Protparam

In [60]:
# Load the protparam feature table; it is joined onto the variants by MUTANT.
protparam = pd.read_csv("../data/interim/protparam_features.csv.gz")

In [61]:
# Rows and columns of the protparam table.
protparam.shape

(75760, 11)

In [62]:
# Attach the protparam features by MUTANT identifier; variants without a match
# are kept.
hum_final = pd.merge(
    hum_final,
    protparam,
    how="left",
    on="MUTANT",
)

In [63]:
# Preview the table after the feature merges.
hum_final.head()

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,TYPE,Charge,Volume,Hydrophobicity,...,AROMATICITY_DIFF,AROMATICITY_LOG_RATIO,ISOELECTRIC_POINT_DIFF,ISOELECTRIC_POINT_LOG_RATIO,GRAVY_DIFF,GRAVY_LOG_RATIO,INSTABILITY_INDEX_DIFF,INSTABILITY_INDEX_LOG_RATIO,FLEXIBILITY_DIFF,FLEXIBILITY_LOG_RATIO
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism,0.0,-0.673333,9.3,...,0.004167,-0.064539,0.017212,-0.003946,0.2075,-2.867899,3.968333,0.298545,0.008954,-0.009018
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism,0.0,-0.673333,9.3,...,0.0,,1.154663,-0.192039,0.40125,-0.362373,15.067917,-0.297907,0.017434,-0.01741
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism,0.0,-0.763333,-0.8,...,0.0,,0.0,0.0,0.052083,-0.495321,2.38625,-0.034999,0.003321,0.003306
3,Q9NQ94-558-A-S,Q9NQ94,558,A,S,rs11817448,Polymorphism,0.0,-0.013333,1.0,...,0.0,,0.0,0.0,0.25875,0.69074,14.680833,-0.329532,0.011931,-0.011806
4,A8K2U0-207-G-R,A8K2U0,207,G,R,rs11047499,Polymorphism,-1.0,-3.776667,13.3,...,0.05,0.223144,0.533203,-0.118123,0.460417,-4.313034,7.2975,-0.191561,0.008191,-0.008252


## Phylogenetic Features

In [20]:
# phyloP / phastCons tables are keyed by dbSNP. Rows with a missing dbSNP are
# removed from each lookup table before merging: pandas matches null keys with
# each other, so a single key-less lookup row would otherwise be attached to
# every variant whose dbSNP is missing. After that, one row per dbSNP is kept
# so the left join cannot multiply variant rows.
phyloP46way = (
    pd.read_csv("../data/interim/phyloP46way.csv")
    .dropna(subset=["dbSNP"])
    .drop_duplicates("dbSNP")
)
hum_final = hum_final.merge(phyloP46way, on="dbSNP", how="left")

phastCons46way = (
    pd.read_csv("../data/interim/phastCons46way.csv")
    .dropna(subset=["dbSNP"])
    .drop_duplicates("dbSNP")
)
hum_final = hum_final.merge(phastCons46way, on="dbSNP", how="left")

## Create Train Test Sets

In [64]:
# Switch every column name to upper case.
hum_final = hum_final.rename(columns=str.upper)

In [65]:
# Sanity check: list rows whose MUTANT identifier occurs more than once.
# An empty result means the merges did not duplicate any variant.
hum_final[hum_final.MUTANT.duplicated(keep=False)]

Unnamed: 0,MUTANT,UID,POS,WILDTYPE,MUT,DBSNP,TYPE,CHARGE,VOLUME,HYDROPHOBICITY,...,AROMATICITY_DIFF,AROMATICITY_LOG_RATIO,ISOELECTRIC_POINT_DIFF,ISOELECTRIC_POINT_LOG_RATIO,GRAVY_DIFF,GRAVY_LOG_RATIO,INSTABILITY_INDEX_DIFF,INSTABILITY_INDEX_LOG_RATIO,FLEXIBILITY_DIFF,FLEXIBILITY_LOG_RATIO


In [66]:
# Number of variants per TYPE label.
hum_final.TYPE.value_counts()

Polymorphism    39653
Disease         28855
Unclassified     7261
Name: TYPE, dtype: int64

In [67]:
# Store positive and negative infinities as missing values, then write the
# full table to disk as a gzip-compressed CSV without the row index.
hum_final = hum_final.replace([np.inf, -np.inf], np.nan)
hum_final.to_csv(
    "../data/processed/1/humsavar_full_gt.csv.gz",
    index=False,
    compression="gzip",
)

In [69]:
# Number of columns in the full table.
hum_final.columns.shape

(61,)

In [70]:
# Reload the full feature table from the file written above.
hum_final = pd.read_csv("../data/processed/1/humsavar_full_gt.csv.gz")

In [71]:
# Preview the reloaded table.
hum_final.head()

Unnamed: 0,MUTANT,UID,POS,WILDTYPE,MUT,DBSNP,TYPE,CHARGE,VOLUME,HYDROPHOBICITY,...,AROMATICITY_DIFF,AROMATICITY_LOG_RATIO,ISOELECTRIC_POINT_DIFF,ISOELECTRIC_POINT_LOG_RATIO,GRAVY_DIFF,GRAVY_LOG_RATIO,INSTABILITY_INDEX_DIFF,INSTABILITY_INDEX_LOG_RATIO,FLEXIBILITY_DIFF,FLEXIBILITY_LOG_RATIO
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism,0.0,-0.673333,9.3,...,0.004167,-0.064539,0.017212,-0.003946,0.2075,-2.867899,3.968333,0.298545,0.008954,-0.009018
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism,0.0,-0.673333,9.3,...,0.0,,1.154663,-0.192039,0.40125,-0.362373,15.067917,-0.297907,0.017434,-0.01741
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism,0.0,-0.763333,-0.8,...,0.0,,0.0,0.0,0.052083,-0.495321,2.38625,-0.034999,0.003321,0.003306
3,Q9NQ94-558-A-S,Q9NQ94,558,A,S,rs11817448,Polymorphism,0.0,-0.013333,1.0,...,0.0,,0.0,0.0,0.25875,0.69074,14.680833,-0.329532,0.011931,-0.011806
4,A8K2U0-207-G-R,A8K2U0,207,G,R,rs11047499,Polymorphism,-1.0,-3.776667,13.3,...,0.05,0.223144,0.533203,-0.118123,0.460417,-4.313034,7.2975,-0.191561,0.008191,-0.008252


In [72]:
# Remove the identifier / annotation columns, keeping MUTANT, TYPE and the
# remaining columns.
hum_final = hum_final.drop(columns=["UID", "POS", "DBSNP", "WILDTYPE", "MUT"])

# Same data, indexed by the MUTANT identifier.
dataset = hum_final.set_index("MUTANT")

# MUTANT identifiers belonging to each TYPE label.
unclassified_index = dataset.loc[dataset["TYPE"] == "Unclassified"].index
dataset_disease_index = dataset.loc[dataset["TYPE"] == "Disease"].index
dataset_poly_index = dataset.loc[dataset["TYPE"] == "Polymorphism"].index

In [73]:
# Write the MUTANT-indexed dataset, index included, as a gzip-compressed CSV.
dataset.to_csv("../data/processed/1/dataset.csv.gz", index=True, compression="gzip")

In [74]:
# Summary statistics of the numeric columns; `count` reveals how many values
# are missing per feature.
hum_final.describe()

Unnamed: 0,CHARGE,VOLUME,HYDROPHOBICITY,GRANTHAM,POLARITY,EX,PAM250,BLOSUM,JM,HGMD2003,...,AROMATICITY_DIFF,AROMATICITY_LOG_RATIO,ISOELECTRIC_POINT_DIFF,ISOELECTRIC_POINT_LOG_RATIO,GRAVY_DIFF,GRAVY_LOG_RATIO,INSTABILITY_INDEX_DIFF,INSTABILITY_INDEX_LOG_RATIO,FLEXIBILITY_DIFF,FLEXIBILITY_LOG_RATIO
count,75768.0,75768.0,75768.0,75768.0,75768.0,75768.0,75768.0,75768.0,75768.0,75768.0,...,75760.0,50444.0,75760.0,75760.0,75751.0,68065.0,75751.0,71494.0,74827.0,74827.0
mean,-0.006018,-0.16429,-0.66579,80.05992,-0.270291,29.001676,0.14901,-0.587478,0.801018,189.723115,...,0.016499,-0.024049,0.696105,0.004816,0.229133,0.03693,14.056007,0.088132,0.009325164,0.000932
std,0.712793,1.703196,6.813038,47.977396,2.735583,10.94137,1.682386,1.648722,1.238418,154.357797,...,0.024859,0.260053,0.979716,0.173053,0.170877,1.967065,13.283534,0.685413,0.006776124,0.011499
min,-2.0,-5.59,-16.0,5.0,-8.1,-1.0,-5.4,-4.0,-1.73,0.0,...,0.0,-0.81093,0.0,-0.835349,0.0,-38.096066,0.00125,-7.946853,2.834467e-07,-0.046367
25%,0.0,-1.403333,-3.1,43.0,-2.2,21.0,-1.0,-2.0,-0.5,72.0,...,0.0,-0.064539,0.0,-0.020843,0.092083,-0.243418,4.197917,-0.157569,0.003943452,-0.007167
50%,0.0,-0.16,-0.5,74.0,-0.1,29.0,0.2,-1.0,1.05,130.0,...,0.004167,-0.064539,0.171143,0.0,0.197083,0.02928,10.11375,0.032271,0.007960034,0.001172
75%,0.0,0.963333,1.9,102.0,1.1,35.0,1.4,1.0,1.66,277.0,...,0.0125,-0.064539,1.222412,0.035831,0.330417,0.284512,20.194375,0.328105,0.01339697,0.008731
max,2.0,5.59,15.7,215.0,8.1,61.0,5.3,3.0,3.22,634.0,...,0.222222,1.034074,6.305908,0.872352,1.673611,38.367878,139.144444,7.964569,0.04844643,0.044189


In [75]:
# Rows and columns after the identifier columns were dropped.
hum_final.shape

(75769, 56)

In [76]:
# Number of variants per TYPE label; the split sizes used further down depend
# on these counts.
hum_final.TYPE.value_counts()

Polymorphism    39653
Disease         28855
Unclassified     7261
Name: TYPE, dtype: int64

In [53]:
# Preview the table after the identifier columns were dropped.
hum_final.head()

Unnamed: 0,MUTANT,TYPE,CHARGE,VOLUME,HYDROPHOBICITY,GRANTHAM,POLARITY,EX,PAM250,BLOSUM,...,AROMATICITY_DIFF,AROMATICITY_LOG_RATIO,ISOELECTRIC_POINT_DIFF,ISOELECTRIC_POINT_LOG_RATIO,GRAVY_DIFF,GRAVY_LOG_RATIO,INSTABILITY_INDEX_DIFF,INSTABILITY_INDEX_LOG_RATIO,FLEXIBILITY_DIFF,FLEXIBILITY_LOG_RATIO
0,P04217-52-H-R,Polymorphism,0.0,-0.673333,9.3,29.0,0.1,28.0,1.0,0.0,...,0.004167,-0.064539,0.017212,-0.003946,0.2075,-2.867899,3.968333,0.298545,0.008954,-0.009018
1,P04217-395-H-R,Polymorphism,0.0,-0.673333,9.3,29.0,0.1,28.0,1.0,0.0,...,0.0,,1.154663,-0.192039,0.40125,-0.362373,15.067917,-0.297907,0.017434,-0.01741
2,Q9NQ94-555-V-M,Polymorphism,0.0,-0.763333,-0.8,21.0,-0.2,56.0,1.8,1.0,...,0.0,,0.0,0.0,0.052083,-0.495321,2.38625,-0.034999,0.003321,0.003306
3,Q9NQ94-558-A-S,Polymorphism,0.0,-0.013333,1.0,99.0,1.1,38.0,1.1,1.0,...,0.0,,0.0,0.0,0.25875,0.69074,14.680833,-0.329532,0.011931,-0.011806
4,A8K2U0-207-G-R,Polymorphism,-1.0,-3.776667,13.3,125.0,1.5,18.0,-1.0,-2.0,...,0.05,0.223144,0.533203,-0.118123,0.460417,-4.313034,7.2975,-0.191561,0.008191,-0.008252


In [14]:
# Display the MUTANT-indexed dataset.
# NOTE(review): this shows the whole frame (truncated by the display settings);
# `dataset.head()` would keep the saved notebook smaller.
dataset

Unnamed: 0_level_0,TYPE,CHARGE,VOLUME,HYDROPHOBICITY,GRANTHAM,POLARITY,EX,PAM250,BLOSUM,JM,...,AROMATICITY_DIFF,AROMATICITY_LOG_RATIO,ISOELECTRIC_POINT_DIFF,ISOELECTRIC_POINT_LOG_RATIO,GRAVY_DIFF,GRAVY_LOG_RATIO,INSTABILITY_INDEX_DIFF,INSTABILITY_INDEX_LOG_RATIO,FLEXIBILITY_DIFF,FLEXIBILITY_LOG_RATIO
MUTANT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P04217-52-H-R,Polymorphism,0.0,-0.673333,9.3,29.0,0.1,28.0,1.0,0.0,1.24,...,0.004167,-0.064539,0.017212,-0.003946,0.207500,-2.867899,3.968333,0.298545,0.008954,-0.009018
P04217-395-H-R,Polymorphism,0.0,-0.673333,9.3,29.0,0.1,28.0,1.0,0.0,1.24,...,0.000000,,1.154663,-0.192039,0.401250,-0.362373,15.067917,-0.297907,0.017434,-0.017410
Q9NQ94-555-V-M,Polymorphism,0.0,-0.763333,-0.8,21.0,-0.2,56.0,1.8,1.0,1.24,...,0.000000,,0.000000,0.000000,0.052083,-0.495321,2.386250,-0.034999,0.003321,0.003306
Q9NQ94-558-A-S,Polymorphism,0.0,-0.013333,1.0,99.0,1.1,38.0,1.1,1.0,1.27,...,0.000000,,0.000000,0.000000,0.258750,0.690740,14.680833,-0.329532,0.011931,-0.011806
A8K2U0-207-G-R,Polymorphism,-1.0,-3.776667,13.3,125.0,1.5,18.0,-1.0,-2.0,1.24,...,0.050000,0.223144,0.533203,-0.118123,0.460417,-4.313034,7.297500,-0.191561,0.008191,-0.008252
A8K2U0-970-C-Y,Polymorphism,0.0,-2.836667,2.7,194.0,0.7,35.0,-0.4,-2.0,0.25,...,0.066667,,0.000000,0.000000,0.207083,1.220660,15.952500,0.287148,0.001569,-0.001544
A8K2U0-1131-T-M,Polymorphism,0.0,-1.560000,-2.2,81.0,-2.9,15.0,-0.4,-1.0,-0.92,...,0.004167,-0.064539,0.000000,0.000000,0.186250,0.367038,15.647083,-1.746165,0.011477,0.011408
A8K2U0-1412-T-A,Polymorphism,0.0,0.916667,-0.4,58.0,-0.5,40.0,0.7,0.0,1.61,...,0.008333,-0.064539,0.000000,0.000000,0.374583,4.046335,9.783750,,0.006895,0.006817
A8K2U0-850-D-E,Polymorphism,0.0,-0.910000,-1.0,45.0,-0.7,34.0,2.9,2.0,2.53,...,0.004167,-0.064539,0.816528,-0.147887,0.170417,0.199151,2.962083,-0.052990,0.005122,0.005054
A8K2U0-1229-H-R,Polymorphism,0.0,-0.673333,9.3,29.0,0.1,28.0,1.0,0.0,1.24,...,0.012500,-0.064539,1.424194,-0.153741,0.125000,0.485508,16.992917,0.325852,0.003143,-0.003125


In [77]:
# Build repeated train/test splits. Each split takes a class-balanced training
# set (the same number of Disease and Polymorphism variants) and uses every
# remaining Disease / Polymorphism variant as the test set. Unclassified
# variants are not part of either set.
N_SPLITS = 20
N_TRAIN_PER_CLASS = 25000

for i in range(N_SPLITS):
    # The split number doubles as the seed for both the sampling and the final
    # shuffle, so the written files are reproducible run to run.
    train = pd.concat([
        dataset.loc[dataset_disease_index].sample(n=N_TRAIN_PER_CLASS, random_state=i),
        dataset.loc[dataset_poly_index].sample(n=N_TRAIN_PER_CLASS, random_state=i),
    ]).sample(frac=1, random_state=i)

    # Everything of the two classes that was not drawn for training. Taking the
    # remainder directly, instead of sampling a hard-coded number of rows from
    # it, keeps this working if the class counts change.
    test = pd.concat([
        dataset.loc[dataset_disease_index.difference(train.index)],
        dataset.loc[dataset_poly_index.difference(train.index)],
    ]).sample(frac=1, random_state=i)

    # A variant must never appear in both sets of the same split.
    assert train.index.intersection(test.index).empty, "train/test overlap in split {}".format(i)

    train.to_csv("../data/processed/1/train_test_sets/train_{}.csv".format(i), index=True, index_label="MUTANT")
    test.to_csv("../data/processed/1/train_test_sets/test_{}.csv".format(i), index=True, index_label="MUTANT")