In [9]:
import pandas as pd
import numpy as np

## Humsavar Table

In [10]:
# Load the cleaned humsavar table and make its column names attribute-friendly
# by replacing every space with an underscore.
hum = (
    pd.read_csv("../data/interim/humsavar_clean_201711.csv.gz", sep=",")
    .rename(columns=lambda name: name.replace(" ", "_"))
)

In [11]:
# Three-letter -> one-letter amino-acid code lookup (mapping compiled by Santi).
# Besides the 20 standard residues it contains "Sec" (one-letter code "U").
AMINO_CODE = {
    "Ala": "A",
    "Arg": "R",
    "Asn": "N",
    "Asp": "D",
    "Cys": "C",
    "Gln": "Q",
    "Glu": "E",
    "Gly": "G",
    "His": "H",
    "Ile": "I",
    "Leu": "L",
    "Lys": "K",
    "Met": "M",
    "Phe": "F",
    "Pro": "P",
    "Ser": "S",
    "Thr": "T",
    "Trp": "W",
    "Tyr": "Y",
    "Val": "V",
    "Sec": "U",
}

In [12]:
# List the column names after spaces were replaced with underscores.
hum.columns

Index(['Main_gene_name', 'Swiss_Prot_AC', 'FTId', 'AA_Change',
       'Type_of_variant', 'dbSNP', 'Disease_Name'],
      dtype='object')

In [13]:
# Split every AA_Change value into wild-type residue (C1), position (C2) and
# mutant residue (C3). As before, the first two characters are skipped and the
# single run of digits separates the two residue names.
#
# The pattern is anchored, so a value that does not have exactly one digit run
# is reported explicitly instead of producing a row with the wrong number of
# fields. str.extract also keeps hum's index, so the pieces stay aligned with
# their source rows.
AA_CHANGE_PATTERN = r"^.{2}([^0-9]*)([0-9]+)([^0-9]*)$"

df = hum["AA_Change"].str.extract(AA_CHANGE_PATTERN, expand=True)
df.columns = ["C1", "C2", "C3"]

unparsed = df["C2"].isna()
if unparsed.any():
    raise ValueError(
        "Could not parse {} AA_Change value(s), e.g. {}".format(
            int(unparsed.sum()), hum.loc[unparsed, "AA_Change"].head().tolist()
        )
    )

# Convert three-letter residue names to one-letter codes; names missing from
# AMINO_CODE become NaN.
df["C1"] = df["C1"].map(AMINO_CODE)
df["C3"] = df["C3"].map(AMINO_CODE)

# Variant key: <Swiss_Prot_AC>-<position>-<wild type>-<mutant>.
mutant = (
    hum["Swiss_Prot_AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]
).rename("MUTANT")

In [14]:
# Put the original table, the MUTANT key and the parsed pieces side by side,
# keep only the columns needed downstream and give them their final names.
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally
# in pd.concat.
hum_final = (
    pd.concat([hum, mutant, df], axis=1)
    [["MUTANT", "Swiss_Prot_AC", "C2", "C1", "C3", "dbSNP", "Type_of_variant"]]
    .rename(columns={"C1": "WildType", "C3": "Mut", "Swiss_Prot_AC": "UID", "C2": "Pos"})
)

In [15]:
# Preview the first rows of the assembled variant table.
hum_final.head()

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,Type_of_variant
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism
3,Q9NQ94-558-A-S,Q9NQ94,558,A,S,rs11817448,Polymorphism
4,A8K2U0-207-G-R,A8K2U0,207,G,R,rs11047499,Polymorphism


In [16]:
# (rows, columns) of hum_final at this point in the notebook.
hum_final.shape

(76730, 7)

In [17]:
# Keep one row per MUTANT key (drop_duplicates keeps the first occurrence by
# default) and shorten the label column name to TYPE.
# De-duplicating on dbSNP instead is intentionally left disabled:
# hum_final.drop_duplicates("dbSNP", inplace=True)
hum_final = (
    hum_final
    .drop_duplicates(subset="MUTANT")
    .rename(columns={"Type_of_variant": "TYPE"})
)

In [18]:
# Preview three rows after de-duplication and the TYPE rename.
hum_final.head(3)

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,TYPE
0,P04217-52-H-R,P04217,52,H,R,rs893184,Polymorphism
1,P04217-395-H-R,P04217,395,H,R,rs2241788,Polymorphism
2,Q9NQ94-555-V-M,Q9NQ94,555,V,M,rs9073,Polymorphism


In [19]:
# Show every row whose dbSNP identifier is present and shared with at least
# one other row (keep=False flags all members of a duplicate group).
hum_final.loc[hum_final["dbSNP"].notna() & hum_final["dbSNP"].duplicated(keep=False)]

Unnamed: 0,MUTANT,UID,Pos,WildType,Mut,dbSNP,TYPE
97,Q86UK0-459-S-T,Q86UK0,459,S,T,rs7560008,Polymorphism
104,Q86UK0-777-S-T,Q86UK0,777,S,T,rs7560008,Polymorphism
145,O95477-590-W-S,O95477,590,W,S,rs137854496,Disease
191,O95477-590-W-L,O95477,590,W,L,rs137854496,Disease
231,P78363-96-N-D,P78363,96,N,D,rs61748529,Disease
232,P78363-96-N-H,P78363,96,N,H,rs61748529,Disease
249,P78363-572-R-P,P78363,572,R,P,rs61748559,Disease
250,P78363-572-R-Q,P78363,572,R,Q,rs61748559,Disease
332,P78363-2077-R-W,P78363,2077,R,W,rs61750645,Disease
353,P78363-68-P-L,P78363,68,P,L,rs62654397,Disease


In [20]:
# Write the de-duplicated table as a gzip-compressed CSV without the row index.
hum_final.to_csv(
    "../data/interim/humsavar_full.csv.gz",
    compression="gzip",
    index=False,
)
# Alternative output file, currently disabled:
# hum_final.to_csv("../data/interim/humsavar_VEST.csv.gz", index=False, compression="gzip")

In [55]:
# Reload the table from humsavar_full.csv.gz so the following cells work from
# the on-disk copy.
# NOTE(review): read_csv re-infers dtypes, so Pos (a string when it was parsed
# out of AA_Change) presumably comes back numeric here — confirm it matches
# the Pos dtype of the tables merged on it later.
hum_final = pd.read_csv("../data/interim/humsavar_full.csv.gz")

In [56]:
# (rows, columns) of the table reloaded from humsavar_full.csv.gz.
hum_final.shape

(75769, 7)

## SNVBox Features 

### AA Features (Protein)

In [57]:
# Load the AA feature table from the SNVBox feature directory.
AA_Features = pd.read_csv("../data/external/snvbox_features/AA_Features_snvbox.csv")

In [58]:
# Column names of the AA feature table; WildType and Mut are the keys used
# when it is merged into hum_final.
AA_Features.columns

Index(['WildType', 'Mut', 'Charge', 'Volume', 'Hydrophobicity', 'Grantham',
       'Polarity', 'Ex', 'PAM250', 'BLOSUM', 'JM', 'HGMD2003', 'VB',
       'Transition', 'COSMIC', 'COSMICvsSWISSPROT', 'HAPMAP',
       'COSMICvsHAPMAP'],
      dtype='object')

In [59]:
# Preview the first rows of the AA feature table.
AA_Features.head()

Unnamed: 0,WildType,Mut,Charge,Volume,Hydrophobicity,Grantham,Polarity,Ex,PAM250,BLOSUM,JM,HGMD2003,VB,Transition,COSMIC,COSMICvsSWISSPROT,HAPMAP,COSMICvsHAPMAP
0,A,C,0,-0.663333,-0.4,195,-2.6,39,0.3,0,-0.2,0,0,0.0014,-10.586231,-13.556968,-9.012255,-1.573976
1,A,D,1,-0.75,10.8,126,4.9,19,-0.3,-2,0.22,111,5,0.0028,-7.408177,-10.378914,-5.183614,-2.224563
2,A,E,1,-1.66,9.8,107,4.2,27,-0.1,-1,0.21,54,0,0.0047,-6.71503,-9.685767,-6.447306,-0.267724
3,A,F,0,-3.376667,-2.1,113,-2.9,31,-2.6,-2,-0.49,0,0,0.0028,-8.976793,-11.94753,-7.913643,-1.06315
4,A,G,0,0.95,0.6,60,0.9,39,0.6,0,1.66,34,10,0.0049,-7.641792,-10.612529,-5.004922,-2.63687


In [60]:
# Attach the AA features by (WildType, Mut). The left join keeps every variant
# row; pairs missing from AA_Features get NaN in the new columns.
hum_final = pd.merge(
    hum_final,
    AA_Features,
    how="left",
    on=["WildType", "Mut"],
)

### Uniprot Features (Protein)

In [87]:
# Load the UniProt feature table and rename its accession column (Acc) to UID
# so it shares a key name with hum_final.
Uniprot_Features = (
    pd.read_csv("../data/external/snvbox_features/uniprot_features.csv")
    .rename(columns={"Acc": "UID"})
)

In [99]:
# Display the UniProt feature table (Acc already renamed to UID).
Uniprot_Features

Unnamed: 0,UID,Pos,BINDING,ACT_SITE,SITE,LIPID,METAL,CARBOHYD,DNA_BIND,NP_BIND,...,ZN_FING,REGIONS,PPI,RNABD,TF,LOC,MMBRBD,Chrom,PostModRec,PostModEnz
0,P01023,991,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,P01023,396,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,P01023,1424,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,P01023,277,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,P01023,278,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,P01023,410,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,P01023,70,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,P01023,430,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,P01023,431,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,P01023,690,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [88]:
# (rows, columns) of the UniProt feature table.
Uniprot_Features.shape

(10, 30)

In [92]:
# Upper-cased view of the column names. This returns a new Index and is not
# assigned back, so Uniprot_Features itself keeps its original names.
Uniprot_Features.columns.str.upper()

Index(['UID', 'POS', 'BINDING', 'ACT_SITE', 'SITE', 'LIPID', 'METAL',
       'CARBOHYD', 'DNA_BIND', 'NP_BIND', 'CA_BIND', 'DISULFID', 'SE_CYS',
       'MOD_RES', 'PROPEP', 'SIGNALP', 'TRANSMEM', 'COMPBIAS', 'REP', 'MOTIF',
       'ZN_FING', 'REGIONS', 'PPI', 'RNABD', 'TF', 'LOC', 'MMBRBD', 'CHROM',
       'POSTMODREC', 'POSTMODENZ'],
      dtype='object')

In [64]:
# Attach the UniProt features by (UID, Pos). The left join keeps every variant
# row; positions missing from Uniprot_Features get NaN in the new columns.
hum_final = pd.merge(
    hum_final,
    Uniprot_Features,
    how="left",
    on=["UID", "Pos"],
)

## Protparam (Protein)

In [72]:
# Load the protparam feature table from the interim data directory.
protparam = pd.read_csv("../data/interim/protparam_features.csv.gz")

In [73]:
# Attach the protparam features by the MUTANT key. The left join keeps every
# variant row; keys missing from protparam get NaN in the new columns.
hum_final = pd.merge(hum_final, protparam, how="left", on="MUTANT")

## Create Structural Dataset

In [32]:
# Normalise all column names to upper case.
hum_final.columns = hum_final.columns.map(str.upper)

In [33]:
# List the column names after upper-casing.
hum_final.columns

Index(['MUTANT', 'UID', 'POS', 'WILDTYPE', 'MUT', 'DBSNP', 'TYPE', 'CHARGE',
       'VOLUME', 'HYDROPHOBICITY', 'GRANTHAM', 'POLARITY', 'EX', 'PAM250',
       'BLOSUM', 'JM', 'HGMD2003', 'VB', 'TRANSITION', 'COSMIC',
       'COSMICVSSWISSPROT', 'HAPMAP', 'COSMICVSHAPMAP', 'BINDING', 'ACT_SITE',
       'SITE', 'LIPID', 'METAL', 'CARBOHYD', 'DNA_BIND', 'NP_BIND', 'CA_BIND',
       'DISULFID', 'SE_CYS', 'MOD_RES', 'PROPEP', 'SIGNALP', 'TRANSMEM',
       'COMPBIAS', 'REP', 'MOTIF', 'ZN_FING', 'REGIONS', 'PPI', 'RNABD', 'TF',
       'LOC', 'MMBRBD', 'CHROM', 'POSTMODREC', 'POSTMODENZ',
       'AROMATICITY_DIFF', 'AROMATICITY_LOG_RATIO', 'ISOELECTRIC_POINT_DIFF',
       'ISOELECTRIC_POINT_LOG_RATIO', 'GRAVY_DIFF', 'GRAVY_LOG_RATIO',
       'INSTABILITY_INDEX_DIFF', 'INSTABILITY_INDEX_LOG_RATIO',
       'FLEXIBILITY_DIFF', 'FLEXIBILITY_LOG_RATIO'],
      dtype='object')

In [34]:
# Number of variants per TYPE label.
hum_final.TYPE.value_counts()

Polymorphism    39653
Disease         28855
Unclassified     7261
Name: TYPE, dtype: int64

In [36]:
# Treat +/- infinity as missing values, then write the table as a
# gzip-compressed CSV without the row index.
hum_final = hum_final.replace(to_replace=[np.inf, -np.inf], value=np.nan)
hum_final.to_csv(
    "../data/processed/structural/humsavar_full_gt.csv.gz",
    compression="gzip",
    index=False,
)

In [37]:
# Number of columns in hum_final, as a one-element tuple.
hum_final.columns.shape

(61,)

In [74]:
# Reload the table from humsavar_full_gt.csv.gz so the following cells work
# from the on-disk copy.
hum_final = pd.read_csv("../data/processed/structural/humsavar_full_gt.csv.gz")

In [75]:
# (rows, columns) of the table reloaded from humsavar_full_gt.csv.gz.
hum_final.shape

(75769, 61)

In [76]:
# Count missing values per column.
hum_final.isnull().sum()

MUTANT                             0
UID                                0
POS                                0
WILDTYPE                           0
MUT                                0
DBSNP                          17590
TYPE                               0
CHARGE                             1
VOLUME                             1
HYDROPHOBICITY                     1
GRANTHAM                           1
POLARITY                           1
EX                                 1
PAM250                             1
BLOSUM                             1
JM                                 1
HGMD2003                           1
VB                                 1
TRANSITION                         1
COSMIC                             1
COSMICVSSWISSPROT                  1
HAPMAP                             1
COSMICVSHAPMAP                     1
BINDING                        50638
ACT_SITE                       50638
SITE                           50638
LIPID                          50638
M

In [80]:
# Remove the identifier/bookkeeping columns, discard variants labelled
# "Unclassified", and index what is left by the MUTANT key.
hum_final = hum_final.drop(columns=["UID", "POS", "DBSNP", "WILDTYPE", "MUT"])
hum_final = hum_final.loc[hum_final["TYPE"] != "Unclassified"]
dataset = hum_final.set_index("MUTANT")
# Per-label index objects, currently disabled:
# unclassified_index = dataset[dataset.TYPE == "Unclassified"].index
# dataset_disease_index = dataset[dataset.TYPE == "Disease"].index
# dataset_poly_index = dataset[dataset.TYPE == "Polymorphism"].index

In [82]:
# Write the dataset as a gzip-compressed CSV, keeping the MUTANT index as a column.
dataset.to_csv(
    "../data/processed/structural/dataset.csv.gz",
    compression="gzip",
    index=True,
)

In [96]:
# Reload the dataset from disk, restoring MUTANT as the index.
dataset = pd.read_csv("../data/processed/structural/dataset.csv.gz", index_col="MUTANT")

In [98]:
# Summary statistics restricted to the UniProt feature columns. The merge keys
# (UID, POS) are removed from the column list because `dataset` no longer
# contains them.
# Index.drop takes no axis argument; the former positional `1` was being
# interpreted as its `errors` parameter, so it is omitted here.
dataset[Uniprot_Features.columns.str.upper().drop(["UID", "POS"])].describe()

Unnamed: 0,BINDING,ACT_SITE,SITE,LIPID,METAL,CARBOHYD,DNA_BIND,NP_BIND,CA_BIND,DISULFID,...,ZN_FING,REGIONS,PPI,RNABD,TF,LOC,MMBRBD,CHROM,POSTMODREC,POSTMODENZ
count,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,...,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0,21621.0
mean,0.005689,0.002313,0.003515,0.000139,0.005596,0.005643,0.024189,0.014292,0.001804,0.017622,...,0.032746,0.299616,0.120531,0.027196,0.008788,0.008279,0.013459,0.000601,0.00444,0.07909
std,0.075212,0.048035,0.059185,0.011779,0.074601,0.074907,0.153641,0.118693,0.042434,0.131575,...,0.177975,0.4581,0.325589,0.162657,0.093332,0.090614,0.115233,0.024514,0.066488,0.269885
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [86]:
# Export the summary statistics of every numeric column to CSV, with the
# statistic names as the index column.
# NOTE(review): this is an absolute, user-specific path, unlike the relative
# "../data/..." paths used elsewhere in this notebook — confirm the intended
# project-relative location before running on another machine.
dataset.describe().to_csv("/home/marlan/workspace/master-thesis/results/estructural/describe.csv", index=True)

### Single Train Test

### Multiple Train Tests

In [None]:
# Generate N_SPLITS reproducible train/test splits.
#
# The per-label indexes are derived here so this cell does not depend on
# variables that are only defined in commented-out code elsewhere.
N_SPLITS = 20
TRAIN_PER_CLASS = 25000  # balanced: same number of Disease and Polymorphism rows

dataset_disease_index = dataset[dataset.TYPE == "Disease"].index
dataset_poly_index = dataset[dataset.TYPE == "Polymorphism"].index

for i in range(N_SPLITS):
    # Balanced training set; every random step is seeded with the split number
    # so the written files (including row order) are reproducible.
    train = pd.concat([
        dataset.loc[dataset_disease_index].sample(n=TRAIN_PER_CLASS, random_state=i),
        dataset.loc[dataset_poly_index].sample(n=TRAIN_PER_CLASS, random_state=i),
    ]).sample(frac=1, random_state=i)

    # Every labelled row that was not drawn for training goes to the test set.
    # With the class counts shown earlier in this notebook (28855 Disease,
    # 39653 Polymorphism) this is 3855 + 14653 rows, the sizes that used to be
    # hard-coded here.
    test = pd.concat([
        dataset.loc[dataset_disease_index.difference(train.index)],
        dataset.loc[dataset_poly_index.difference(train.index)],
    ]).sample(frac=1, random_state=i)

    train.to_csv("../data/processed/3/train_test_sets/train_{}.csv".format(i), index=True, index_label="MUTANT")
    test.to_csv("../data/processed/3/train_test_sets/test_{}.csv".format(i), index=True, index_label="MUTANT")