In [2]:
import pandas as pd

### Levanto la tabla de la página Humsavar sin el prólogo y la coda

In [3]:
# Load the cleaned HumsaVar table (comma-separated, gzip-compressed).
hum = pd.read_csv("data/humsavar_clean.csv.gz")

In [57]:
# Preview the column names with spaces stripped. The result is not assigned back,
# so `hum` keeps its original spaced column names (later cells index by them).
hum.columns.str.replace(" ", "")

Index([u'Maingenename', u'SwissProtAC', u'FTId', u'AAChange', u'Typeofvariant',
       u'dbSNP', u'DiseaseName'],
      dtype='object')

In [5]:
# (rows, columns) of the HumsaVar table as loaded.
hum.shape

(76730, 7)

### Los duplicados de la tabla HumsaVar varían en su dbSNP y Disease Name únicamente

In [6]:
# Show every row that shares gene, accession and amino-acid change with another row
# (keep=False marks all members of each duplicate group, not just the repeats).
hum[
    hum.duplicated(
        subset=["Main gene name", "Swiss Prot AC", "AA Change"],
        keep=False,
    )
]

Unnamed: 0,Main gene name,Swiss Prot AC,FTId,AA Change,Type of variant,dbSNP,Disease Name
228,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
229,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Stargardt disease 1 (STGD1) [MIM:248200]
234,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
235,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Stargardt disease 1 (STGD1) [MIM:248200]
240,ABCA4,P78363,VAR_008411,p.Ala407Val,Disease,rs61751264,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
241,ABCA4,P78363,VAR_008411,p.Ala407Val,Disease,rs61751264,Stargardt disease 1 (STGD1) [MIM:248200]
243,ABCA4,P78363,VAR_008413,p.Glu471Lys,Disease,rs1800548,"Macular degeneration, age-related, 2 (ARMD2) [..."
244,ABCA4,P78363,VAR_008413,p.Glu471Lys,Disease,rs1800548,Stargardt disease 1 (STGD1) [MIM:248200]
246,ABCA4,P78363,VAR_008415,p.Leu541Pro,Disease,rs61751392,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
247,ABCA4,P78363,VAR_008415,p.Leu541Pro,Disease,rs61751392,Fundus flavimaculatus (FFM) [MIM:248200]


In [7]:
# Three-letter -> one-letter amino-acid codes (mapping put together by Santi).
# Covers the 20 standard residues plus selenocysteine ("Sec").
AMINO_CODE = {
    "Ala": "A",
    "Arg": "R",
    "Asn": "N",
    "Asp": "D",
    "Cys": "C",
    "Gln": "Q",
    "Glu": "E",
    "Gly": "G",
    "His": "H",
    "Ile": "I",
    "Leu": "L",
    "Lys": "K",
    "Met": "M",
    "Phe": "F",
    "Pro": "P",
    "Ser": "S",
    "Thr": "T",
    "Trp": "W",
    "Tyr": "Y",
    "Val": "V",
    "Sec": "U",
}

Formateo la tabla para coincidir con formato Uniprot-Pos-Amino1-Amino2

In [16]:
# Drop the first two characters of "AA Change" (the "p." prefix in the rows shown above)
# and split around the digit run; the capture group keeps the position, giving
# [reference residue, position, alternate residue].
df = pd.DataFrame(
    data=hum["AA Change"].str[2:].str.split(r"([0-9]+)").tolist(),
    columns=["C1", "C2", "C3"],
)
# Translate both residues from three-letter to one-letter codes.
# Codes missing from AMINO_CODE become NaN.
df = df.assign(
    C1=df["C1"].map(AMINO_CODE),
    C3=df["C3"].map(AMINO_CODE),
)
# Build the "<accession>-<position>-<ref>-<alt>" key used to join with the other tables.
mutant = pd.Series(
    data=(hum["Swiss Prot AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]).tolist(),
    name="MUTANT",
)

In [17]:
# Attach the MUTANT key to HumsaVar column-wise and keep only the columns used downstream.
# `axis` is passed by keyword: pandas.concat accepts only `objs` positionally in pandas >= 2.0.
hum_final = pd.concat([hum, mutant], axis=1)[["Main gene name", "Type of variant", "MUTANT", "AA Change"]]

In [18]:
# (rows, columns) after attaching MUTANT and selecting four columns.
hum_final.shape

(76730, 4)

In [19]:
# Count rows that exactly repeat an earlier row across all selected columns.
hum_final.duplicated().sum()

930

In [69]:
# Count missing values in the "Type of variant" column.
hum_final[["Type of variant"]].isnull().sum()

Type of variant    0
dtype: int64

In [23]:
# Drop exact duplicate rows, rebinding the name rather than mutating the frame in place.
hum_final = hum_final.drop_duplicates()

In [24]:
# (rows, columns) after dropping duplicate rows.
hum_final.shape

(75800, 4)

### Levanto tabla clinvar de Santi

In [25]:
# Load Santi's ClinVar table: a header-less, tab-separated file with two fields,
# named here to match the HumsaVar frame (join key + classification).
clinvar = pd.read_csv(
    "data/mut-clinvar.tab.gz",
    sep="\t",
    header=None,
    names=["MUTANT", "Type of variant"],
)

In [26]:
# (rows, columns) of the ClinVar table as loaded.
clinvar.shape

(7631, 2)

In [27]:
# Peek at the first rows to check the MUTANT key format.
clinvar.head()

Unnamed: 0,MUTANT,Type of variant
0,Q14896-432-A-T,Uncertain significance
1,Q86YC2-862-K-N,Uncertain significance
2,Q07889-663-I-V,Likely benign
3,Q14896-429-A-E,Uncertain significance
4,P12883-204-R-C,Pathogenic


In [30]:
# Drop exact duplicate rows, rebinding the name rather than mutating the frame in place.
clinvar = clinvar.drop_duplicates()

In [31]:
# (rows, columns) after dropping duplicate rows.
clinvar.shape

(7580, 2)

In [70]:
# Count missing values in the "Type of variant" column.
clinvar["Type of variant"].isnull().sum()

0

### Combino la tabla humsavar y clinvar para ver clasificaciones

In [32]:
# Outer-join ClinVar and HumsaVar on the shared MUTANT key so variants present in
# only one source are kept. The suffixes disambiguate the "Type of variant" column
# that both tables carry.
combined = pd.merge(
    clinvar,
    hum_final,
    on="MUTANT",
    how="outer",
    suffixes=(" ClinVar", " HumsaVar"),
)

In [33]:
# Peek at the first rows of the merged table.
combined.head()

Unnamed: 0,MUTANT,Type of variant ClinVar,Main gene name,Type of variant HumsaVar,AA Change
0,Q14896-432-A-T,Uncertain significance,,,
1,Q86YC2-862-K-N,Uncertain significance,,,
2,Q07889-663-I-V,Likely benign,,,
3,Q14896-429-A-E,Uncertain significance,,,
4,P12883-204-R-C,Pathogenic,,,


In [34]:
# Among variants ClinVar labels "not provided", count how many have no HumsaVar
# gene name / classification.
combined.loc[
    combined["Type of variant ClinVar"] == "not provided",
    ["Main gene name", "Type of variant HumsaVar"],
].isnull().sum()

Main gene name              684
Type of variant HumsaVar    684
dtype: int64

### Tabla de variantes que encontré en el sitio ClinVar

In [35]:
# Load the ClinVar variant summary (tab-separated, gzip-compressed).
# low_memory=False parses each column in a single pass so its dtype is inferred
# consistently; the warning fragment left in this cell's output looks like the
# mixed-dtype warning that chunked parsing emits (TODO confirm).
# NOTE(review): absolute, user-specific path; move the file under data/ (or make the
# location configurable) so the notebook can be rerun on another machine.
variants = pd.read_csv("/home/marlan/Downloads/variant_summary.txt.gz", sep="\t", low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [36]:
# List the columns available in the variant summary table.
variants.columns

Index([u'#AlleleID', u'Type', u'Name', u'GeneID', u'GeneSymbol', u'HGNC_ID',
       u'ClinicalSignificance', u'ClinSigSimple', u'LastEvaluated',
       u'RS# (dbSNP)', u'nsv/esv (dbVar)', u'RCVaccession', u'PhenotypeIDS',
       u'PhenotypeList', u'Origin', u'OriginSimple', u'Assembly',
       u'ChromosomeAccession', u'Chromosome', u'Start', u'Stop',
       u'ReferenceAllele', u'AlternateAllele', u'Cytogenetic', u'ReviewStatus',
       u'NumberSubmitters', u'Guidelines', u'TestedInGTR', u'OtherIDs',
       u'SubmitterCategories'],
      dtype='object')

In [91]:
def remove(l):
    """Return True when ``l`` has exactly two elements, False otherwise.

    Used as a ``filter`` predicate over the token lists produced by splitting
    variant names. Values that have no length at all (e.g. ``None`` or a float
    NaN) yield False instead of raising.
    """
    try:
        return len(l) == 2
    except TypeError:
        # len() raises TypeError for unsized objects; anything else is a real
        # error and should surface rather than be silently swallowed.
        return False
        


[['NM_014630.2(ZNF592):c.3136G>A', '(p.Gly1046Arg)'],
 ['NM_014630.2(ZNF592):c.3136G>A', '(p.Gly1046Arg)'],
 ['NM_017547.3(FOXRED1):c.694C>T', '(p.Gln232Ter)'],
 ['NM_017547.3(FOXRED1):c.694C>T', '(p.Gln232Ter)'],
 ['NM_017547.3(FOXRED1):c.1289A>G', '(p.Asn430Ser)'],
 ['NM_017547.3(FOXRED1):c.1289A>G', '(p.Asn430Ser)'],
 ['NM_025152.2(NUBPL):c.166G>A', '(p.Gly56Arg)'],
 ['NM_025152.2(NUBPL):c.166G>A', '(p.Gly56Arg)'],
 ['NM_000410.3(HFE):c.157G>A', '(p.Val53Met)'],
 ['NM_000410.3(HFE):c.845G>A', '(p.Cys282Tyr)'],
 ['NM_000410.3(HFE):c.845G>A', '(p.Cys282Tyr)'],
 ['NM_000410.3(HFE):c.187C>G', '(p.His63Asp)'],
 ['NM_000410.3(HFE):c.187C>G', '(p.His63Asp)'],
 ['NM_000410.3(HFE):c.193A>T', '(p.Ser65Cys)'],
 ['NM_000410.3(HFE):c.193A>T', '(p.Ser65Cys)'],
 ['NM_000410.3(HFE):c.314T>C', '(p.Ile105Thr)'],
 ['NM_000410.3(HFE):c.314T>C', '(p.Ile105Thr)'],
 ['NM_000410.3(HFE):c.277G>C', '(p.Gly93Arg)'],
 ['NM_000410.3(HFE):c.277G>C', '(p.Gly93Arg)'],
 ['NM_000410.3(HFE):c.157G>A', '(p.Val53Met)']

In [93]:
# Split each variant name on spaces and keep only the names that yield exactly two
# tokens (in the sample output above: the transcript-level change and the
# parenthesised protein change).
data = variants["Name"].str.split(" ").tolist()
df = pd.DataFrame(data=list(filter(remove, data)), columns=["C1", "C2"])
df["C2"]

0         (p.Gly1046Arg)
1         (p.Gly1046Arg)
2          (p.Gln232Ter)
3          (p.Gln232Ter)
4          (p.Asn430Ser)
5          (p.Asn430Ser)
6           (p.Gly56Arg)
7           (p.Gly56Arg)
8           (p.Val53Met)
9          (p.Cys282Tyr)
10         (p.Cys282Tyr)
11          (p.His63Asp)
12          (p.His63Asp)
13          (p.Ser65Cys)
14          (p.Ser65Cys)
15         (p.Ile105Thr)
16         (p.Ile105Thr)
17          (p.Gly93Arg)
18          (p.Gly93Arg)
19          (p.Val53Met)
20          (p.Val59Met)
21          (p.Val59Met)
22         (p.Gln127His)
23         (p.Gln127His)
24         (p.Arg330Met)
25         (p.Arg330Met)
26         (p.Gln283Pro)
27         (p.Gln283Pro)
28         (p.Glu626Gly)
29         (p.Glu626Gly)
               ...      
373404     (p.Val194Ala)
373405     (p.Val194Ala)
373406     (p.Leu712Phe)
373407     (p.Leu712Phe)
373408     (p.Leu726Pro)
373409     (p.Leu726Pro)
373410     (p.Ala549Val)
373411     (p.Ala549Val)
373412     (p.Ile406Thr)


In [40]:
# Tally rows per variant Type.
variants.Type.value_counts()

single nucleotide variant    533427
deletion                      51099
copy number gain              27951
copy number loss              24496
duplication                   19483
insertion                      5556
indel                          4673
undetermined variant            444
NT expansion                    408
Translocation                   201
inversion                       125
protein only                     99
complex                          51
short repeat                     25
fusion                            4
Name: Type, dtype: int64

In [41]:
# Keep only single nucleotide variants.
variants = variants.loc[variants["Type"] == "single nucleotide variant"]

In [71]:
# Tally rows per ClinicalSignificance label.
variants.ClinicalSignificance.value_counts()

Uncertain significance                                                             226504
Likely benign                                                                       99855
Pathogenic                                                                          62303
Benign                                                                              43962
Conflicting interpretations of pathogenicity                                        24331
Likely pathogenic                                                                   23469
Benign/Likely benign                                                                21205
not provided                                                                        17936
Pathogenic/Likely pathogenic                                                         5435
other                                                                                4551
risk factor                                                                           839
-         

### Merge con tabla VarQ

In [43]:
# Load the VarQ property table (tab-separated, gzip-compressed).
varq = pd.read_csv(
    "data/properties-varq.tab.gz",
    sep="\t",
)

In [44]:
# (rows, columns) of the VarQ table as loaded.
varq.shape

(17869, 12)

In [45]:
# Left-join the ClinVar/HumsaVar labels onto every VarQ row via the MUTANT key.
# NOTE(review): the recorded shapes show the row count growing from 17869 to 17893,
# so some MUTANT keys match more than one row of `combined`.
combined_varq = pd.merge(
    varq,
    combined,
    on="MUTANT",
    how="left",
)

In [46]:
# List the columns of the merged VarQ + labels table.
combined_varq.columns

Index([u'MUTANT', u'SASA', u'SASA_PERCENTAGE', u'BFACTOR', u'SWITCHBILITY',
       u'AGGREGABILITY', u'CONSERVATION', u'3DID', u'PDB', u'ACTIVE_SITE',
       u'VARIATION_ENERGY', u'TYPE', u'Type of variant ClinVar',
       u'Main gene name', u'Type of variant HumsaVar', u'AA Change'],
      dtype='object')

In [47]:
# (rows, columns) after the left merge.
combined_varq.shape

(17893, 16)

In [72]:
# VarQ rows that have no classification from either ClinVar or HumsaVar.
unlabeled = combined_varq[
    combined_varq[["Type of variant ClinVar", "Type of variant HumsaVar"]]
    .isnull()
    .all(axis=1)
]

In [50]:
# Count MUTANT keys that repeat an earlier row after the merge.
combined_varq.MUTANT.duplicated().sum()

24

In [66]:
# Distribution of ClinVar classifications among the VarQ rows that have one.
combined_varq["Type of variant ClinVar"].dropna().value_counts()

Uncertain significance                                           2715
Pathogenic                                                       2404
not provided                                                     1029
Likely pathogenic                                                 625
Likely benign                                                     154
Likely pathogenic;Pathogenic                                      149
Benign                                                            138
Pathogenic;Uncertain significance                                  67
Likely pathogenic;Uncertain significance                           64
Likely benign;Uncertain significance                               57
Benign;Likely benign                                               39
risk factor                                                        37
drug response                                                      19
Likely pathogenic;Pathogenic;Uncertain significance                13
Benign;Uncertain sig

In [65]:
# Distribution of HumsaVar classifications among the VarQ rows that have one.
combined_varq["Type of variant HumsaVar"].dropna().value_counts()

Disease         4675
Polymorphism    1983
Unclassified     613
Name: Type of variant HumsaVar, dtype: int64

## Ver si los unlabeled aparecen en la tabla variants

In [None]:
# NOTE(review): unfinished cell. `isin` is referenced but never called, so this indexes
# `unlabeled` with a bound method instead of a boolean mask. It still needs the
# collection of MUTANT keys to test membership against (presumably derived from
# `variants`; TODO confirm).
unlabeled[unlabeled.MUTANT.isin]