In [2]:
import pandas as pd

### Levanto la tabla de la página HumsaVar sin el prólogo y la coda

In [3]:
# Load the cleaned HumsaVar table (comma-separated; the ".gz" suffix lets
# pandas infer gzip compression).
humsavar_path = "../data/interim/humsavar_clean_201711.csv.gz"
hum = pd.read_csv(humsavar_path, sep=",")

In [4]:
# Normalise the column names so they can be used with attribute access
# (e.g. `hum.Type_of_variant`): every space becomes an underscore.
hum.columns = [column_name.replace(" ", "_") for column_name in hum.columns]

In [5]:
hum.shape

(76730, 7)

In [6]:
hum.Type_of_variant.value_counts()

Polymorphism    39655
Disease         29726
Unclassified     7349
Name: Type_of_variant, dtype: int64

In [7]:
hum.columns

Index(['Main_gene_name', 'Swiss_Prot_AC', 'FTId', 'AA_Change',
       'Type_of_variant', 'dbSNP', 'Disease_Name'],
      dtype='object')

### Los duplicados de la tabla HumsaVar varían solo en su Disease Name

In [8]:
hum[hum[["Main_gene_name", "Swiss_Prot_AC", "AA_Change"]].duplicated(keep=False)].head()

Unnamed: 0,Main_gene_name,Swiss_Prot_AC,FTId,AA_Change,Type_of_variant,dbSNP,Disease_Name
228,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
229,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Stargardt disease 1 (STGD1) [MIM:248200]
234,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
235,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Stargardt disease 1 (STGD1) [MIM:248200]
240,ABCA4,P78363,VAR_008411,p.Ala407Val,Disease,rs61751264,Cone-rod dystrophy 3 (CORD3) [MIM:604116]


In [9]:
# Three-letter -> one-letter amino-acid codes (mapping compiled by Santi).
# Holds 21 entries; the last one is "Sec" -> "U".
AMINO_CODE = {
    "Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
    "Gln": "Q", "Glu": "E", "Gly": "G", "His": "H", "Ile": "I",
    "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
    "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V",
    "Sec": "U",
}

Formateo la tabla para coincidir con formato Uniprot-Pos-Amino1-Amino2

In [10]:
# Split e.g. "p.His52Arg" into ("His", "52", "Arg"): drop the leading "p."
# and split on the digit run, keeping it thanks to the capture group.
aa_change_parts = hum["AA_Change"].str[2:].str.split(r"([0-9]+)")
df = pd.DataFrame(data=aa_change_parts.tolist(), columns=["C1", "C2", "C3"])

# Translate both residues from three-letter to one-letter code; codes that
# are missing from AMINO_CODE become NaN.
for residue_col in ("C1", "C3"):
    df[residue_col] = df[residue_col].map(AMINO_CODE)

# Identifier in the form <accession>-<position>-<reference>-<variant>.
mutant = pd.Series(
    data=(hum["Swiss_Prot_AC"] + "-" + df.C2 + "-" + df.C1 + "-" + df.C3).tolist(),
    name="MUTANT",
)

In [11]:
# Append the MUTANT identifier column-wise to the HumsaVar rows and keep only
# the columns used downstream.
# `axis` is passed by keyword: positional arguments other than `objs` are
# deprecated in pandas 1.3+ and rejected with a TypeError by pandas 2.x.
hum_final = pd.concat([hum, mutant], axis=1)[
    ["Main_gene_name", "Type_of_variant", "MUTANT", "AA_Change", "dbSNP"]
]

In [12]:
hum_final.head()

Unnamed: 0,Main_gene_name,Type_of_variant,MUTANT,AA_Change,dbSNP
0,A1BG,Polymorphism,P04217-52-H-R,p.His52Arg,rs893184
1,A1BG,Polymorphism,P04217-395-H-R,p.His395Arg,rs2241788
2,A1CF,Polymorphism,Q9NQ94-555-V-M,p.Val555Met,rs9073
3,A1CF,Polymorphism,Q9NQ94-558-A-S,p.Ala558Ser,rs11817448
4,A2ML1,Polymorphism,A8K2U0-207-G-R,p.Gly207Arg,rs11047499


In [13]:
hum_final.shape

(76730, 5)

In [18]:
# Sanity check on the rows that are exact duplicates across all columns:
# within each MUTANT, report any group that holds more than one distinct
# dbSNP value.
duplicated_rows = hum_final[hum_final.duplicated(keep=False)]
for _mutant_id, snp_ids in duplicated_rows.groupby("MUTANT")["dbSNP"]:
    snp_counts = snp_ids.value_counts()
    if len(snp_counts) > 1:
        print(snp_counts)
        print("ERROR!")

In [16]:
hum_final.Type_of_variant.isnull().sum()

0

In [17]:
# Keep a single copy of every row that is identical across all columns
# (the original row labels of the survivors are preserved).
hum_final = hum_final.drop_duplicates()

In [18]:
hum_final.shape

(75802, 5)

### Levanto tabla clinvar de Santi

In [19]:
# Load Santi's ClinVar table: tab-separated, no header row, so the two
# column names are supplied here.
clinvar_path = "../data/interim/mut-clinvar.tab.gz"
clinvar = pd.read_csv(
    clinvar_path,
    sep="\t",
    header=None,
    names=["MUTANT", "Type_of_variant"],
)

In [20]:
clinvar.shape

(7631, 2)

In [21]:
clinvar.head()

Unnamed: 0,MUTANT,Type_of_variant
0,Q14896-432-A-T,Uncertain significance
1,Q86YC2-862-K-N,Uncertain significance
2,Q07889-663-I-V,Likely benign
3,Q14896-429-A-E,Uncertain significance
4,P12883-204-R-C,Pathogenic


In [22]:
clinvar.Type_of_variant.value_counts().head()

Uncertain significance    2722
Pathogenic                2435
not provided              1029
Likely pathogenic          627
Likely benign              154
Name: Type_of_variant, dtype: int64

In [23]:
# Remove exact (MUTANT, Type_of_variant) repeats; a MUTANT that appears with
# different classifications is still kept once per classification.
clinvar = clinvar.drop_duplicates()

In [24]:
clinvar.shape

(7580, 2)

In [25]:
clinvar.Type_of_variant.isnull().sum()

0

### Combino la tabla humsavar y clinvar para ver clasificaciones

In [26]:
# Full outer join on MUTANT so that variants present in only one of the two
# sources are kept; the shared "Type_of_variant" column is disambiguated per
# source through the suffixes.
combined = pd.merge(
    clinvar,
    hum_final,
    on="MUTANT",
    how="outer",
    suffixes=("_ClinVar", "_HumsaVar"),
)

In [27]:
combined.head()

Unnamed: 0,MUTANT,Type_of_variant_ClinVar,Main_gene_name,Type_of_variant_HumsaVar,AA_Change,dbSNP
0,Q14896-432-A-T,Uncertain significance,,,,
1,Q86YC2-862-K-N,Uncertain significance,,,,
2,Q07889-663-I-V,Likely benign,,,,
3,Q14896-429-A-E,Uncertain significance,,,,
4,P12883-204-R-C,Pathogenic,,,,


#### Intersección

In [28]:
combined[~combined.Type_of_variant_ClinVar.isnull() & ~combined.Type_of_variant_HumsaVar.isnull()].shape

(2831, 6)

In [29]:
combined[combined.Type_of_variant_ClinVar == "not provided"][["Main_gene_name","Type_of_variant_HumsaVar"]].isnull().sum()

Main_gene_name              684
Type_of_variant_HumsaVar    684
dtype: int64

### Merge con tabla VarQ

In [30]:
# Load the VarQ properties table (tab-separated, header in the first row).
varq_path = "../data/processed/properties-varq.tab.gz"
varq = pd.read_csv(varq_path, sep="\t")

In [31]:
varq.shape

(17869, 12)

In [32]:
# Left join: keep every VarQ row and attach the ClinVar / HumsaVar
# annotations where the MUTANT identifier matches.  A MUTANT that occurs
# several times in `combined` yields several output rows.
combined_varq = pd.merge(varq, combined, on="MUTANT", how="left")

In [33]:
combined_varq.columns

Index(['MUTANT', 'SASA', 'SASA_PERCENTAGE', 'BFACTOR', 'SWITCHBILITY',
       'AGGREGABILITY', 'CONSERVATION', '3DID', 'PDB', 'ACTIVE_SITE',
       'VARIATION_ENERGY', 'TYPE', 'Type_of_variant_ClinVar', 'Main_gene_name',
       'Type_of_variant_HumsaVar', 'AA_Change', 'dbSNP'],
      dtype='object')

In [34]:
combined_varq.shape

(17893, 17)

In [35]:
combined_varq[~combined_varq.Type_of_variant_HumsaVar.isnull()][["Type_of_variant_HumsaVar", "TYPE"]].groupby(["Type_of_variant_HumsaVar", "TYPE"]).size().unstack(fill_value=0)

TYPE,Benign,Pathogenic
Type_of_variant_HumsaVar,Unnamed: 1_level_1,Unnamed: 2_level_1
Disease,0,4675
Polymorphism,1917,66
Unclassified,553,60


In [36]:
combined_varq[~combined_varq.Type_of_variant_ClinVar.isnull()][["Type_of_variant_ClinVar", "TYPE"]].groupby(["Type_of_variant_ClinVar", "TYPE"]).size().unstack(fill_value=0).head()

TYPE,Benign,Pathogenic
Type_of_variant_ClinVar,Unnamed: 1_level_1,Unnamed: 2_level_1
Affects,0,3
Benign,123,15
Benign;Likely benign,36,3
Benign;Likely benign;Likely pathogenic;Uncertain significance,0,1
Benign;Likely benign;Pathogenic,0,3


In [37]:
# VarQ rows that carry no classification from either source.
no_clinvar_label = combined_varq["Type_of_variant_ClinVar"].isnull()
no_humsavar_label = combined_varq["Type_of_variant_HumsaVar"].isnull()
unlabeled = combined_varq[no_clinvar_label & no_humsavar_label]

In [38]:
unlabeled.shape # Cantidad de MUTANTS no clasificados

(5863, 17)

In [39]:
combined_varq.MUTANT.duplicated().sum()

24

In [40]:
combined_varq[~combined_varq["Type_of_variant_HumsaVar"].isnull()]["Type_of_variant_HumsaVar"].value_counts()

Disease         4675
Polymorphism    1983
Unclassified     613
Name: Type_of_variant_HumsaVar, dtype: int64

## Ver si los unlabeled aparecen en la tabla variants

### Tabla de variantes que encontré en el sitio ClinVar

In [41]:
# Load the ClinVar variant summary dump (tab-separated).
# FIXME: absolute path into one user's home directory -- the notebook cannot
# be re-run on another machine until the file is moved under ../data and this
# path is updated.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; the original run emitted a warning at this cell,
# presumably the mixed-types DtypeWarning that this setting avoids.
variants = pd.read_csv(
    "/home/marlan/Downloads/variant_summary.txt.gz",
    sep="\t",
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [42]:
variants.shape

(668042, 30)

In [43]:
# Show full cell contents (no truncation) so the long HGVS names are readable.
# `None` is the supported way to disable the limit; the former sentinel -1 is
# deprecated since pandas 1.0 and rejected by current releases.
pd.set_option("display.max_colwidth", None)
variants.head(5)

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories
0,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA (p.Arg27_Ala362delinsLeuLeuTer),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,4820847,GGAT,TGCTGTAAACTGTAACTGTAAA,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1
1,15041,indel,NM_014855.2(AP5Z1):c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA (p.Arg27_Ala362delinsLeuLeuTer),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704705,...,4781216,GGAT,TGCTGTAAACTGTAACTGTAAA,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0001,1
2,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGCT (p.Leu473Glyfs),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,4827379,GGACCTGCCCTGCT,-,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1
3,15042,deletion,NM_014855.2(AP5Z1):c.1413_1426delGGACCTGCCCTGCT (p.Leu473Glyfs),9907,AP5Z1,HGNC:22197,Pathogenic,1,"Jun 29, 2010",397704709,...,4787748,GGACCTGCCCTGCT,-,7p22.1,no assertion criteria provided,1,,N,OMIM Allelic Variant:613653.0002,1
4,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,85342440,G,A,15q25,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (protein):Q92610#VAR_064583",1


In [67]:
((variants["RS# (dbSNP)"] == -1) & (variants["Type"] == "single nucleotide variant")).sum()

7568

In [65]:
variants.shape

(668042, 30)

In [77]:
variants.ClinicalSignificance.value_counts().head()

Uncertain significance    261222
Pathogenic                116513
Likely benign             112252
Benign                    57708 
Likely pathogenic         32748 
Name: ClinicalSignificance, dtype: int64

In [78]:
variants.columns

Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele',
       'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines',
       'TestedInGTR', 'OtherIDs', 'SubmitterCategories'],
      dtype='object')

In [79]:
# Build MUTANT identiers for the ClinVar variant summary so they can be
# compared against the unlabeled VarQ rows.
#
# From `Name`, take the first <Xxx><digits><Yyy> pattern (e.g. "Gly1046Arg"):
# reference residue, position, variant residue.
amino = variants.Name.str.extract(
    r'(?P<C1>[A-Z]{1}[a-z]{2})(?P<C2>\d+)(?P<C3>[A-Z]{1}[a-z]{2})',
    expand=True,
)
# From `OtherIDs`, take the accession between "(protein):" and "#".
protein = variants.OtherIDs.str.extract(
    r'\(protein\):(?P<Protein>[A-Z,\d]+)#',
    expand=True,
)

# Translate the residues extracted from *this* table.  The previous version
# read `df` (the HumsaVar frame, whose residues were already translated), so
# the identifiers combined ClinVar accessions with HumsaVar positions and NaN
# residues and could never match anything.
amino["C1"] = amino.C1.map(AMINO_CODE)
amino["C3"] = amino.C3.map(AMINO_CODE)

# <accession>-<position>-<reference>-<variant>; NaN wherever any part is
# missing (no protein accession, no pattern match, or an unmapped residue).
mutant = pd.Series(
    data=(protein.Protein + "-" + amino.C2 + "-" + amino.C1 + "-" + amino.C3).tolist(),
    name="MUTANT",
)

In [80]:
unlabeled.MUTANT.isin(mutant).sum()

0

### No, no hay ni un unlabeled que podamos clasificar usando variants.

## Genero Ground Truth para ClinVar y para HumsaVar

In [52]:
# Ground-truth candidates: the rows that carry a label from each source.
has_humsavar_label = combined_varq["Type_of_variant_HumsaVar"].notnull()
has_clinvar_label = combined_varq["Type_of_variant_ClinVar"].notnull()
humsavar_GT = combined_varq[has_humsavar_label]
clinvar_GT = combined_varq[has_clinvar_label]

In [59]:
# Export columns: every VarQ column except VarQ's own TYPE label (returned
# sorted by Index.difference), followed by the HumsaVar label and the dbSNP id.
varq_feature_cols = varq.columns.difference(["TYPE"]).tolist()
cols = varq_feature_cols + ["Type_of_variant_HumsaVar", "dbSNP"]

In [55]:
varq.columns

Index(['MUTANT', 'SASA', 'SASA_PERCENTAGE', 'BFACTOR', 'SWITCHBILITY',
       'AGGREGABILITY', 'CONSERVATION', '3DID', 'PDB', 'ACTIVE_SITE',
       'VARIATION_ENERGY', 'TYPE'],
      dtype='object')

In [56]:
# Project the HumsaVar-labelled rows onto the export columns and expose the
# HumsaVar classification under the name "TYPE".
# Derived from `combined_varq` in a single chain so that the cell can be
# re-run safely: the previous version overwrote `humsavar_GT` and renamed in
# place on a slice, so a second run failed on the missing
# "Type_of_variant_HumsaVar" column.
humsavar_GT = (
    combined_varq
    .loc[combined_varq["Type_of_variant_HumsaVar"].notnull(), cols]
    .rename(columns={"Type_of_variant_HumsaVar": "TYPE"})
)

In [57]:
# Write the ClinVar ground truth (tab-separated, gzip, MUTANT as index).
# `cols` carries the HumsaVar label, so exporting `clinvar_GT[cols]` produced
# a ClinVar file without any ClinVar classification.  The ClinVar label is
# selected here instead and, mirroring the HumsaVar export, stored as "TYPE".
clinvar_cols = varq.columns.difference(["TYPE"]).tolist() + ["Type_of_variant_ClinVar", "dbSNP"]
(
    clinvar_GT[clinvar_cols]
    .rename(columns={"Type_of_variant_ClinVar": "TYPE"})
    .set_index("MUTANT")
    .to_csv("../data/processed/clinvar_gt.tab.gz", sep="\t", compression="gzip", index=True)
)

In [58]:
# Write the HumsaVar ground truth (tab-separated, gzip) with MUTANT as the
# index column of the file.
humsavar_gt_indexed = humsavar_GT.set_index("MUTANT")
humsavar_gt_indexed.to_csv(
    "../data/processed/humsavar_gt.tab.gz",
    sep="\t",
    compression="gzip",
    index=True,
)

In [50]:
humsavar_GT.set_index("MUTANT").shape

(7271, 11)

In [51]:
clinvar_GT[cols].set_index("MUTANT").shape

(7590, 11)