In [1]:
import pandas as pd

### Levanto la tabla de la página Humsavar sin el prólogo y la coda

In [2]:
# Load the pre-cleaned Humsavar table from a gzip-compressed, comma-separated file.
hum = pd.read_csv(
    "data/humsavar_clean.csv.gz",
    sep=",",
)

In [3]:
# Replace spaces in the column names with underscores so the columns can be
# reached with attribute access (e.g. hum.Type_of_variant).
hum.columns = [column_name.replace(" ", "_") for column_name in hum.columns]

In [4]:
# Dimensions (rows, columns) of the loaded Humsavar table.
hum.shape

(76730, 7)

In [5]:
# Number of rows per Type_of_variant category.
hum.Type_of_variant.value_counts()

Polymorphism    39655
Disease         29726
Unclassified     7349
Name: Type_of_variant, dtype: int64

In [6]:
# Column names after the space-to-underscore rename.
hum.columns

Index([u'Main_gene_name', u'Swiss_Prot_AC', u'FTId', u'AA_Change',
       u'Type_of_variant', u'dbSNP', u'Disease_Name'],
      dtype='object')

### Los duplicados de la tabla Humsavar varían en su dbSNP y Disease Name únicamente

In [7]:
# Show every row whose (gene, accession, amino-acid change) key occurs more than
# once; keep=False flags all members of each duplicate group, not just the repeats.
variant_key = ["Main_gene_name", "Swiss_Prot_AC", "AA_Change"]
hum[hum.duplicated(subset=variant_key, keep=False)]

Unnamed: 0,Main_gene_name,Swiss_Prot_AC,FTId,AA_Change,Type_of_variant,dbSNP,Disease_Name
228,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
229,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Stargardt disease 1 (STGD1) [MIM:248200]
234,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
235,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Stargardt disease 1 (STGD1) [MIM:248200]
240,ABCA4,P78363,VAR_008411,p.Ala407Val,Disease,rs61751264,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
241,ABCA4,P78363,VAR_008411,p.Ala407Val,Disease,rs61751264,Stargardt disease 1 (STGD1) [MIM:248200]
243,ABCA4,P78363,VAR_008413,p.Glu471Lys,Disease,rs1800548,"Macular degeneration, age-related, 2 (ARMD2) [..."
244,ABCA4,P78363,VAR_008413,p.Glu471Lys,Disease,rs1800548,Stargardt disease 1 (STGD1) [MIM:248200]
246,ABCA4,P78363,VAR_008415,p.Leu541Pro,Disease,rs61751392,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
247,ABCA4,P78363,VAR_008415,p.Leu541Pro,Disease,rs61751392,Fundus flavimaculatus (FFM) [MIM:248200]


In [8]:
# Three-letter -> one-letter amino-acid codes (mapping compiled by Santi).
# Contains 21 entries, including "Sec" -> "U".
AMINO_CODE = dict([
    ("Ala", "A"), ("Arg", "R"), ("Asn", "N"),
    ("Asp", "D"), ("Cys", "C"), ("Gln", "Q"),
    ("Glu", "E"), ("Gly", "G"), ("His", "H"),
    ("Ile", "I"), ("Leu", "L"), ("Lys", "K"),
    ("Met", "M"), ("Phe", "F"), ("Pro", "P"),
    ("Ser", "S"), ("Thr", "T"), ("Trp", "W"),
    ("Tyr", "Y"), ("Val", "V"), ("Sec", "U"),
])

Formateo la tabla para coincidir con formato Uniprot-Pos-Amino1-Amino2

In [9]:
# Drop the first two characters of AA_Change (the "p." prefix in the rows shown
# above) and split around the run of digits; the capturing group keeps the
# position, e.g. "p.Gly65Glu" -> ["Gly", "65", "Glu"].
aa_parts = hum["AA_Change"].str[2:].str.split(r"([0-9]+)")
df = pd.DataFrame(data=aa_parts.tolist(), columns=["C1", "C2", "C3"])

# Translate both residues to one-letter codes; codes absent from AMINO_CODE
# become NaN.
for residue_col in ("C1", "C3"):
    df[residue_col] = df[residue_col].map(AMINO_CODE)

# Key layout: <Swiss_Prot_AC>-<position>-<first residue>-<second residue>.
mutant_keys = hum["Swiss_Prot_AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]
mutant = pd.Series(data=mutant_keys.tolist(), name="MUTANT")

In [10]:
# Append the MUTANT key column-wise to the Humsavar table and keep only the
# columns needed downstream. `axis` is passed by keyword: passing it
# positionally is deprecated and rejected by recent pandas releases.
hum_final = pd.concat([hum, mutant], axis=1)[
    ["Main_gene_name", "Type_of_variant", "MUTANT", "AA_Change"]
]

In [11]:
# Dimensions of hum_final before removing duplicate rows.
hum_final.shape

(76730, 4)

In [12]:
# Number of rows that are exact repeats of an earlier row (all four columns equal).
hum_final.duplicated().sum()

930

In [13]:
# Count rows with a missing Type_of_variant label.
hum_final.Type_of_variant.isnull().sum()

0

In [14]:
# Keep the first occurrence of each fully identical row; rebind the name
# instead of mutating the frame in place.
hum_final = hum_final.drop_duplicates()

In [15]:
# Dimensions of hum_final after dropping duplicate rows.
hum_final.shape

(75800, 4)

### Levanto tabla clinvar de Santi

In [16]:
# Load the ClinVar table: tab-separated, no header row, so the two column
# names are supplied explicitly.
clinvar = pd.read_csv(
    "data/mut-clinvar.tab.gz",
    sep="\t",
    header=None,
    names=["MUTANT", "Type_of_variant"],
)

In [17]:
# Dimensions of the ClinVar table as loaded.
clinvar.shape

(7631, 2)

In [18]:
# Preview the first rows of the ClinVar table.
clinvar.head()

Unnamed: 0,MUTANT,Type_of_variant
0,Q14896-432-A-T,Uncertain significance
1,Q86YC2-862-K-N,Uncertain significance
2,Q07889-663-I-V,Likely benign
3,Q14896-429-A-E,Uncertain significance
4,P12883-204-R-C,Pathogenic


In [19]:
# Number of rows per ClinVar Type_of_variant value.
clinvar.Type_of_variant.value_counts()

Uncertain significance                                           2722
Pathogenic                                                       2435
not provided                                                     1029
Likely pathogenic                                                 627
Likely benign                                                     154
Likely pathogenic;Pathogenic                                      148
Benign                                                            138
Pathogenic;Uncertain significance                                  67
Likely pathogenic;Uncertain significance                           64
Likely benign;Uncertain significance                               57
Benign;Likely benign                                               39
risk factor                                                        38
drug response                                                      19
Likely pathogenic;Pathogenic;Uncertain significance                13
Benign;Uncertain sig

In [20]:
# Remove rows that repeat an earlier (MUTANT, Type_of_variant) pair; rebind the
# name instead of mutating the frame in place.
clinvar = clinvar.drop_duplicates()

In [21]:
# Dimensions of the ClinVar table after dropping duplicate rows.
clinvar.shape

(7580, 2)

In [22]:
# Count rows with a missing Type_of_variant label.
clinvar.Type_of_variant.isnull().sum()

0

### Combino la tabla humsavar y clinvar para ver clasificaciones

In [23]:
# Outer join of ClinVar and Humsavar on the shared MUTANT key, so variants
# present in only one source are kept. The Type_of_variant column exists in
# both frames and is disambiguated with the _ClinVar / _HumsaVar suffixes.
combined = pd.merge(
    clinvar,
    hum_final,
    on="MUTANT",
    how="outer",
    suffixes=["_ClinVar", "_HumsaVar"],
)

In [24]:
# Preview the first rows of the merged ClinVar/Humsavar table.
combined.head()

Unnamed: 0,MUTANT,Type_of_variant_ClinVar,Main_gene_name,Type_of_variant_HumsaVar,AA_Change
0,Q14896-432-A-T,Uncertain significance,,,
1,Q86YC2-862-K-N,Uncertain significance,,,
2,Q07889-663-I-V,Likely benign,,,
3,Q14896-429-A-E,Uncertain significance,,,
4,P12883-204-R-C,Pathogenic,,,


#### Interseccion

In [25]:
# Intersection: rows that carry a label from both ClinVar and HumsaVar.
has_clinvar_label = combined.Type_of_variant_ClinVar.notnull()
has_humsavar_label = combined.Type_of_variant_HumsaVar.notnull()
combined[has_clinvar_label & has_humsavar_label].shape

(2831, 5)

In [29]:
# Among ClinVar rows labelled "not provided", count how many have no Humsavar
# information (null gene name / null Humsavar label).
clinvar_not_provided = combined.Type_of_variant_ClinVar == "not provided"
combined.loc[
    clinvar_not_provided,
    ["Main_gene_name", "Type_of_variant_HumsaVar"],
].isnull().sum()

Main_gene_name              684
Type_of_variant_HumsaVar    684
dtype: int64

### Merge con tabla VarQ

In [30]:
# Load the tab-separated VarQ properties table.
varq = pd.read_csv(
    "data/properties-varq.tab.gz",
    sep="\t",
)

In [31]:
# Dimensions of the VarQ properties table.
varq.shape

(17869, 12)

In [32]:
# Left join: keep every VarQ row and attach the ClinVar/Humsavar labels that
# share its MUTANT key (rows without a match get nulls in the label columns).
combined_varq = pd.merge(
    varq,
    combined,
    on="MUTANT",
    how="left",
)

In [33]:
# Column names of the VarQ table after attaching the label columns.
combined_varq.columns

Index([u'MUTANT', u'SASA', u'SASA_PERCENTAGE', u'BFACTOR', u'SWITCHBILITY',
       u'AGGREGABILITY', u'CONSERVATION', u'3DID', u'PDB', u'ACTIVE_SITE',
       u'VARIATION_ENERGY', u'TYPE', u'Type_of_variant_ClinVar',
       u'Main_gene_name', u'Type_of_variant_HumsaVar', u'AA_Change'],
      dtype='object')

In [34]:
# Dimensions after the left join; compare the row count with varq.shape.
combined_varq.shape

(17893, 16)

In [35]:
# Contingency table: HumsaVar label (rows) against the VarQ TYPE column
# (columns), restricted to rows that have a HumsaVar label.
humsavar_labeled = combined_varq[combined_varq.Type_of_variant_HumsaVar.notnull()]
(
    humsavar_labeled
    .groupby(["Type_of_variant_HumsaVar", "TYPE"])
    .size()
    .unstack(fill_value=0)
)

TYPE,Benign,Pathogenic
Type_of_variant_HumsaVar,Unnamed: 1_level_1,Unnamed: 2_level_1
Disease,0,4675
Polymorphism,1917,66
Unclassified,553,60


In [36]:
# Contingency table: ClinVar label (rows) against the VarQ TYPE column
# (columns), restricted to rows that have a ClinVar label.
clinvar_labeled = combined_varq[combined_varq.Type_of_variant_ClinVar.notnull()]
(
    clinvar_labeled
    .groupby(["Type_of_variant_ClinVar", "TYPE"])
    .size()
    .unstack(fill_value=0)
)

TYPE,Benign,Pathogenic
Type_of_variant_ClinVar,Unnamed: 1_level_1,Unnamed: 2_level_1
Affects,0,3
Benign,123,15
Benign;Likely benign,36,3
Benign;Likely benign;Likely pathogenic;Uncertain significance,0,1
Benign;Likely benign;Pathogenic,0,3
Benign;Likely benign;Pathogenic;Uncertain significance,0,1
Benign;Likely benign;Uncertain significance,6,0
Benign;Likely benign;risk factor,0,1
Benign;Pathogenic,0,6
Benign;Pathogenic;Uncertain significance,0,2


In [37]:
# VarQ rows that received no label from either ClinVar or HumsaVar.
missing_clinvar = combined_varq.Type_of_variant_ClinVar.isnull()
missing_humsavar = combined_varq.Type_of_variant_HumsaVar.isnull()
unlabeled = combined_varq[missing_clinvar & missing_humsavar]

In [38]:
unlabeled.shape # Number of MUTANTS not classified

(5863, 16)

In [39]:
# Number of rows whose MUTANT key already appeared in an earlier row.
combined_varq.MUTANT.duplicated().sum()

24

In [44]:
# Rows per HumsaVar label, ignoring rows that have no HumsaVar label.
combined_varq["Type_of_variant_HumsaVar"].dropna().value_counts()

Disease         4675
Polymorphism    1983
Unclassified     613
Name: Type_of_variant_HumsaVar, dtype: int64

## Ver si los unlabeled aparecen en la tabla variants

### Tabla de variantes que encontré en el sitio ClinVar

In [3]:
# Load the ClinVar variant summary (tab-separated).
# NOTE(review): this is an absolute, machine-specific path; the file must be
# placed there (or the path adjusted) for the notebook to run elsewhere.
# low_memory=False makes pandas infer each column's dtype from the whole file;
# the stray output under the original cell looks like the tail of pandas'
# mixed-dtype warning -- TODO confirm.
variants = pd.read_csv(
    "/home/marlan/Downloads/variant_summary.txt.gz",
    sep="\t",
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Dimensions of the ClinVar variant summary table.
variants.shape

(668042, 30)

In [163]:
# Change the global display.max_colwidth option, then preview the first rows.
# NOTE(review): -1 presumably means "do not truncate cell contents" in the
# pandas version used here; newer pandas releases expect None for that --
# confirm against the installed version. The option stays set for later cells.
pd.set_option('display.max_colwidth', -1)
variants.head(5)

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories
4,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,85342440,G,A,15q25,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (protein):Q92610#VAR_064583",1
5,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,84799209,G,A,15q25.3,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (protein):Q92610#VAR_064583",1
6,15044,single nucleotide variant,NM_017547.3(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606829,...,126145284,C,T,11q24,no assertion criteria provided,1,,N,OMIM Allelic Variant:613622.0001,1
7,15044,single nucleotide variant,NM_017547.3(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606829,...,126275389,C,T,11q24.2,no assertion criteria provided,1,,N,OMIM Allelic Variant:613622.0001,1
8,15045,single nucleotide variant,NM_017547.3(FOXRED1):c.1289A>G (p.Asn430Ser),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606830,...,126147412,A,G,11q24,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613622.0002,UniProtKB (protein):Q96CU9#VAR_064571",1


In [5]:
# Number of rows per ClinicalSignificance value.
variants.ClinicalSignificance.value_counts()

Uncertain significance                                                             261222
Pathogenic                                                                         116513
Likely benign                                                                      112252
Benign                                                                              57708
Likely pathogenic                                                                   32748
Conflicting interpretations of pathogenicity                                        25643
not provided                                                                        23265
Benign/Likely benign                                                                22342
Pathogenic/Likely pathogenic                                                         6616
other                                                                                4673
-                                                                                    1058
risk facto

In [36]:
# Column names of the ClinVar variant summary table.
variants.columns

Index([u'#AlleleID', u'Type', u'Name', u'GeneID', u'GeneSymbol', u'HGNC_ID',
       u'ClinicalSignificance', u'ClinSigSimple', u'LastEvaluated',
       u'RS# (dbSNP)', u'nsv/esv (dbVar)', u'RCVaccession', u'PhenotypeIDS',
       u'PhenotypeList', u'Origin', u'OriginSimple', u'Assembly',
       u'ChromosomeAccession', u'Chromosome', u'Start', u'Stop',
       u'ReferenceAllele', u'AlternateAllele', u'Cytogenetic', u'ReviewStatus',
       u'NumberSubmitters', u'Guidelines', u'TestedInGTR', u'OtherIDs',
       u'SubmitterCategories'],
      dtype='object')

In [182]:
# Pull the amino-acid change out of the variant name, e.g.
# "... (p.Gly1046Arg)" -> C1="Gly", C2="1046", C3="Arg". Rows whose name has no
# such pattern get NaN in all three columns.
amino = variants.Name.str.extract(
    r'(?P<C1>[A-Z]{1}[a-z]{2})(?P<C2>\d+)(?P<C3>[A-Z]{1}[a-z]{2})',
    expand=True,
)
# Pull the accession that follows "(protein):" in OtherIDs, e.g.
# "UniProtKB (protein):Q92610#VAR_064583" -> "Q92610".
protein = variants.OtherIDs.str.extract(
    r'\(protein\):(?P<Protein>[A-Z,\d]+)#',
    expand=True,
)

# Translate the residues extracted from `variants`. The previous version mapped
# and concatenated `df` (the Humsavar frame) here, so the resulting keys mixed
# Humsavar residues/positions with ClinVar accessions.
amino["C1"] = amino.C1.map(AMINO_CODE)
amino["C3"] = amino.C3.map(AMINO_CODE)

# Same key layout as the Humsavar keys: <accession>-<position>-<res1>-<res2>.
# Any missing component (no accession, no match, unmapped residue) yields NaN.
mutant = pd.Series(
    data=(protein.Protein + "-" + amino.C2 + "-" + amino.C1 + "-" + amino.C3).tolist(),
    name="MUTANT",
)

In [192]:
# Number of unlabeled rows whose MUTANT key is present in `mutant`.
unlabeled.MUTANT.isin(mutant).sum()

0

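### No, no hay ni un unlabeled que podamos clasificar usando variants.

**Nota:** el 0 de arriba se calculó con una serie `mutant` armada a partir de `df` (las columnas de Humsavar) en lugar de los aminoácidos extraídos de `variants`, así que no permite concluir nada sobre variants; hay que recalcularlo.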

In [194]:
# Ground-truth subsets: VarQ rows that have a HumsaVar label and VarQ rows that
# have a ClinVar label. The label columns are named with underscores in
# combined_varq (see combined_varq.columns); the space-separated names used
# before do not exist in the frame and raise KeyError.
humsavar_GT = combined_varq[combined_varq["Type_of_variant_HumsaVar"].notnull()]
clinvar_GT = combined_varq[combined_varq["Type_of_variant_ClinVar"].notnull()]

In [216]:
# Export columns: every VarQ column except TYPE, plus the ClinVar label.
# The label column is spelled with underscores, matching combined_varq.columns;
# the space-separated spelling is not a column of the frame.
cols = varq.columns.difference(["TYPE"]).tolist() + ["Type_of_variant_ClinVar"]

In [209]:
# Write the ClinVar ground-truth table, indexed by MUTANT, as a gzip-compressed
# tab-separated file in the working directory.
clinvar_export = clinvar_GT[cols].set_index("MUTANT")
clinvar_export.to_csv(
    "clinvar_gt.tab.gz",
    sep="\t",
    compression="gzip",
    index=True,
)

In [213]:
# Write the HumsaVar ground-truth table, indexed by MUTANT, as a gzip-compressed
# tab-separated file. The column list is built here with the HumsaVar label:
# the shared `cols` list ends with the ClinVar label, which would put the
# wrong label column into this file.
humsavar_cols = varq.columns.difference(["TYPE"]).tolist() + ["Type_of_variant_HumsaVar"]
humsavar_GT[humsavar_cols].set_index("MUTANT").to_csv(
    "humsavar_gt.tab.gz",
    sep="\t",
    compression="gzip",
    index=True,
)

In [215]:
# Dimensions of the HumsaVar ground-truth subset when restricted to `cols` and
# indexed by MUTANT.
humsavar_GT[cols].set_index("MUTANT").shape

(7271, 11)

In [217]:
# Dimensions of the ClinVar ground-truth subset when restricted to `cols` and
# indexed by MUTANT.
clinvar_GT[cols].set_index("MUTANT").shape

(7590, 11)