In [None]:
import pandas as pd
import numpy as np

## Humsavar Table

In [None]:
# Load the cleaned humsavar table and replace spaces in the column labels with
# underscores; later cells refer to columns by names such as "AA_Change" and
# "Swiss_Prot_AC".
hum = (
    pd.read_csv("../data/interim/humsavar_clean_201711.csv.gz", sep=",")
    .rename(columns=lambda label: label.replace(" ", "_"))
)

In [None]:
# Mapping compiled by Santi: three-letter amino-acid abbreviation -> one-letter code.
# Includes selenocysteine ("Sec" -> "U") in addition to the 20 standard residues.
AMINO_CODE = dict(
    Ala="A", Arg="R", Asn="N", Asp="D", Cys="C",
    Gln="Q", Glu="E", Gly="G", His="H", Ile="I",
    Leu="L", Lys="K", Met="M", Phe="F", Pro="P",
    Ser="S", Thr="T", Trp="W", Tyr="Y", Val="V",
    Sec="U",
)

In [None]:
# Inspect the column labels after the space -> underscore normalisation.
hum.columns

In [None]:
# Break each AA_Change value into (wild-type residue, position, mutant residue).
# The first two characters are dropped and the remainder is split around the
# run of digits; the capturing group keeps the digits as the middle piece.
# assumes values look like "p.Ala123Gly" -- TODO confirm against the input file
aa_change_parts = hum["AA_Change"].str[2:].str.split(r"([0-9]+)")
df = pd.DataFrame(data=aa_change_parts.tolist(), columns=["C1", "C2", "C3"])

# Convert both residues to one-letter codes; abbreviations missing from
# AMINO_CODE become NaN.
for residue_col in ("C1", "C3"):
    df[residue_col] = df[residue_col].map(AMINO_CODE)

# Variant identifier: "<accession>-<position>-<wild type>-<mutant>".
mutant_ids = hum["Swiss_Prot_AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]
mutant = pd.Series(data=mutant_ids.tolist(), name="MUTANT")
# mutant = pd.Series(data=(df.C1 + df.C2 + df.C3).tolist(), name="MUTANT")

In [None]:
# Assemble the per-variant table: the original humsavar columns, the MUTANT
# identifier and the parsed residue/position columns side by side, then keep
# only the fields used downstream and give them descriptive names.
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
combined = pd.concat([hum, mutant, df], axis=1)
hum_final = combined[
    ["MUTANT", "Swiss_Prot_AC", "C2", "C1", "C3", "dbSNP", "Type_of_variant"]
].rename(
    columns={"C1": "WildType", "C3": "Mut", "Swiss_Prot_AC": "UID", "C2": "Pos"}
)
# hum_final = pd.concat([hum, mutant], axis=1)[["FTId", "MUTANT"]]

In [None]:
# Preview the first rows of the assembled variant table.
hum_final.head()

In [None]:
# (rows, columns) before de-duplication.
hum_final.shape

In [None]:
# Keep the first row for every MUTANT identifier and shorten the label column
# name to "TYPE".
# NOTE(review): rows whose MUTANT is NaN (e.g. a residue that AMINO_CODE could
# not map) compare equal to each other here, so only the first such row survives.
hum_final = (
    hum_final
    .drop_duplicates(subset="MUTANT")
    .rename(columns={"Type_of_variant": "TYPE"})
)
# hum_final.drop_duplicates("dbSNP", inplace=True)

In [None]:
# Preview the table after de-duplication and the TYPE rename.
hum_final.head(3)

In [None]:
# Show every row whose dbSNP id is shared with at least one other row
# (keep=False marks all members of a duplicate group); rows with a missing
# dbSNP id are excluded.
hum_final[hum_final.dbSNP.duplicated(keep=False) & ~hum_final.dbSNP.isnull()]

In [None]:
# Persist the de-duplicated variant table (gzip-compressed CSV, no index column).
hum_final.to_csv("../data/interim/humsavar_full.csv.gz", index=False, compression="gzip")
# hum_final.to_csv("../data/interim/humsavar_VEST.csv.gz", index=False, compression="gzip")

In [None]:
# Reload the table that was just written. Besides acting as a checkpoint, the
# round-trip makes pandas re-infer dtypes: "Pos" was built from string pieces
# above and is parsed as a number here if every value is numeric.
# NOTE(review): the later merge on ["UID", "Pos"] presumably relies on this -- confirm.
hum_final = pd.read_csv("../data/interim/humsavar_full.csv.gz")

In [None]:
# (rows, columns) of the reloaded table.
hum_final.shape

## SNVBox Features 

### AA Features (Protein)

In [None]:
# Load the SNVBox amino-acid substitution features; the table is joined to the
# variants on the ("WildType", "Mut") pair in a later cell.
AA_Features = pd.read_csv("../data/external/snvbox_features/AA_Features_snvbox.csv")

In [None]:
# List the available amino-acid feature columns.
AA_Features.columns

In [None]:
# Preview the first rows of the amino-acid feature table.
AA_Features.head()

In [None]:
# Attach the substitution features to each variant. A left join keeps every
# variant even when its (WildType, Mut) pair has no entry in AA_Features.
hum_final = pd.merge(
    hum_final,
    AA_Features,
    how="left",
    on=["WildType", "Mut"],
)

### Uniprot Features (Protein)

In [None]:
# Load the SNVBox UniProt features and rename the accession column "Acc" to
# "UID" so that it matches the key name used in hum_final.
Uniprot_Features = (
    pd.read_csv("../data/external/snvbox_features/uniprot_features.csv")
    .rename(columns={"Acc": "UID"})
)

In [None]:
# Display the UniProt feature table.
Uniprot_Features

In [None]:
# (rows, columns) of the UniProt feature table.
Uniprot_Features.shape

In [None]:
# Show the column names in upper case; this is display only, the frame itself
# is not modified because the result is not assigned.
Uniprot_Features.columns.str.upper()

In [None]:
# Attach the per-position UniProt features. A left join keeps variants whose
# (UID, Pos) pair is absent from Uniprot_Features.
hum_final = pd.merge(
    hum_final,
    Uniprot_Features,
    how="left",
    on=["UID", "Pos"],
)

## Protparam (Protein)

In [None]:
# Load the protparam feature table; it is joined to the variants on "MUTANT"
# in the next cell.
protparam = pd.read_csv("../data/interim/protparam_features.csv.gz")

In [None]:
# Attach the protparam features by variant identifier. A left join keeps
# variants that have no protparam row.
hum_final = pd.merge(
    hum_final,
    protparam,
    how="left",
    on="MUTANT",
)

## Create Structural Dataset

In [None]:
# Normalise every column label to upper case; later cells use names such as
# "TYPE", "UID" and "POS".
hum_final = hum_final.rename(columns=str.upper)

In [None]:
# List the upper-cased column labels of the merged table.
hum_final.columns

In [None]:
# Number of variants per TYPE label.
hum_final.TYPE.value_counts()

In [None]:
# Treat +/-infinity as missing values, then persist the fully merged table
# (gzip-compressed CSV, no index column).
hum_final = hum_final.replace([np.inf, -np.inf], np.nan)
hum_final.to_csv(
    "../data/processed/structural/humsavar_full_gt.csv.gz",
    index=False,
    compression="gzip",
)

In [None]:
# Number of columns in the merged table, as a 1-tuple.
hum_final.columns.shape

In [None]:
# Reload the merged table from disk so the following cells can start from this
# checkpoint.
hum_final = pd.read_csv("../data/processed/structural/humsavar_full_gt.csv.gz")

In [None]:
# (rows, columns) of the reloaded merged table.
hum_final.shape

In [None]:
# Count of missing values in each column.
hum_final.isnull().sum()

In [None]:
# Build the modelling dataset: remove the identifier/key columns, discard
# variants labelled "Unclassified", and index the remaining rows by MUTANT.
hum_final = hum_final.drop(columns=["UID", "POS", "DBSNP", "WILDTYPE", "MUT"])
hum_final = hum_final.loc[hum_final["TYPE"] != "Unclassified"]
dataset = hum_final.set_index("MUTANT")
# Per-class index selections, currently disabled:
# unclassified_index = dataset[dataset.TYPE == "Unclassified"].index
# dataset_disease_index = dataset[dataset.TYPE == "Disease"].index
# dataset_poly_index = dataset[dataset.TYPE == "Polymorphism"].index

In [None]:
# Persist the dataset; index=True writes the MUTANT index as the first column.
dataset.to_csv("../data/processed/structural/dataset.csv.gz", index=True, compression="gzip")

In [None]:
# Reload the dataset checkpoint, restoring MUTANT as the index.
dataset = pd.read_csv("../data/processed/structural/dataset.csv.gz", index_col="MUTANT")

In [None]:
# Summary statistics for the UniProt feature columns only. The join keys
# ("UID", "POS") are removed from the column list because they were dropped
# from the dataset. Index.drop has no axis parameter, so the labels are the
# only argument it needs.
uniprot_feature_cols = Uniprot_Features.columns.str.upper().drop(["UID", "POS"])
dataset[uniprot_feature_cols].describe()

In [None]:
# Write the summary statistics of the dataset's numeric columns to CSV.
# NOTE(review): this is an absolute path into one user's home directory, unlike
# the relative "../data/..." paths used everywhere else in the notebook; it will
# fail on any other machine. Presumably it should be relative to the project
# root -- confirm the intended location before changing it.
dataset.describe().to_csv("/home/marlan/workspace/master-thesis/results/estructural/describe.csv", index=True)

### Single Train Test

### Multiple Train Tests

In [None]:
# Index labels of each class. These are defined here because nothing earlier
# in the notebook creates them, so the loop would otherwise raise NameError on
# a fresh kernel.
dataset_disease_index = dataset[dataset.TYPE == "Disease"].index
dataset_poly_index = dataset[dataset.TYPE == "Polymorphism"].index

# Write 20 train/test pairs, one per seed. Each train set is class-balanced
# (25000 rows per class); each test set is drawn only from rows that are not in
# the matching train set. The sample sizes are hard-coded, and `sample` raises
# ValueError if a class has fewer rows than requested.
for i in range(20):
    train = pd.concat([
        dataset.loc[dataset_disease_index].sample(n=25000, random_state=i),
        dataset.loc[dataset_poly_index].sample(n=25000, random_state=i)
    ]).sample(frac=1, random_state=i)  # seeded shuffle, so the row order is reproducible

    test = pd.concat([
        dataset.loc[dataset_disease_index.difference(train.index)].sample(n=3855, random_state=i),
        dataset.loc[dataset_poly_index.difference(train.index)].sample(n=14653, random_state=i)
    ]).sample(frac=1, random_state=i)  # seeded shuffle, so the row order is reproducible

    train.to_csv("../data/processed/3/train_test_sets/train_{}.csv".format(i), index=True, index_label="MUTANT")
    test.to_csv("../data/processed/3/train_test_sets/test_{}.csv".format(i), index=True, index_label="MUTANT")