In [1]:
import pandas as pd

### Levanto la tabla de la página Humsavar sin el prólogo y la coda

In [2]:
# Load the cleaned Humsavar dump; comma is read_csv's default separator.
hum = pd.read_csv("../data/interim/humsavar_clean_201711.csv.gz")

In [3]:
# Make column names attribute-friendly by swapping spaces for underscores.
hum.columns = [name.replace(" ", "_") for name in hum.columns]

In [4]:
# (rows, columns) of the loaded Humsavar table.
hum.shape

(76730, 7)

In [5]:
# Number of rows per Humsavar variant class.
hum.Type_of_variant.value_counts()

Polymorphism    39655
Disease         29726
Unclassified     7349
Name: Type_of_variant, dtype: int64

In [6]:
# Column names after the space-to-underscore renaming.
hum.columns

Index(['Main_gene_name', 'Swiss_Prot_AC', 'FTId', 'AA_Change',
       'Type_of_variant', 'dbSNP', 'Disease_Name'],
      dtype='object')

### Los duplicados de la tabla Humsavar varían solo en su Disease Name

In [7]:
# Preview rows that share gene, accession and amino-acid change.
# keep=False marks every member of a duplicate group, not just the repeats.
dup_keys = ["Main_gene_name", "Swiss_Prot_AC", "AA_Change"]
hum[hum.duplicated(subset=dup_keys, keep=False)].head()

Unnamed: 0,Main_gene_name,Swiss_Prot_AC,FTId,AA_Change,Type_of_variant,dbSNP,Disease_Name
228,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
229,ABCA4,P78363,VAR_008401,p.Gly65Glu,Disease,rs62654395,Stargardt disease 1 (STGD1) [MIM:248200]
234,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Cone-rod dystrophy 3 (CORD3) [MIM:604116]
235,ABCA4,P78363,VAR_008406,p.Arg212Cys,Disease,rs61750200,Stargardt disease 1 (STGD1) [MIM:248200]
240,ABCA4,P78363,VAR_008411,p.Ala407Val,Disease,rs61751264,Cone-rod dystrophy 3 (CORD3) [MIM:604116]


In [8]:
# Three-letter -> one-letter amino-acid codes (mapping provided by Santi).
# Includes selenocysteine ("Sec" -> "U") in addition to the 20 standard residues.
AMINO_CODE = {
    "Ala": "A", "Arg": "R", "Asn": "N", "Asp": "D", "Cys": "C",
    "Gln": "Q", "Glu": "E", "Gly": "G", "His": "H", "Ile": "I",
    "Leu": "L", "Lys": "K", "Met": "M", "Phe": "F", "Pro": "P",
    "Ser": "S", "Thr": "T", "Trp": "W", "Tyr": "Y", "Val": "V",
    "Sec": "U",
}

Formateo la tabla para que coincida con el formato Uniprot-Pos-Amino1-Amino2

In [9]:
# First rows of the Humsavar table, showing the raw AA_Change notation (e.g. p.His52Arg).
hum.head()

Unnamed: 0,Main_gene_name,Swiss_Prot_AC,FTId,AA_Change,Type_of_variant,dbSNP,Disease_Name
0,A1BG,P04217,VAR_018369,p.His52Arg,Polymorphism,rs893184,
1,A1BG,P04217,VAR_018370,p.His395Arg,Polymorphism,rs2241788,
2,A1CF,Q9NQ94,VAR_052201,p.Val555Met,Polymorphism,rs9073,
3,A1CF,Q9NQ94,VAR_059821,p.Ala558Ser,Polymorphism,rs11817448,
4,A2ML1,A8K2U0,VAR_055463,p.Gly207Arg,Polymorphism,rs11047499,


In [10]:
# Build the MUTANT key "<UniProt AC>-<position>-<ref aa>-<alt aa>" from AA_Change.
#
# str.extract is used instead of splitting on the digit run and rebuilding a
# frame from a list of lists: it keeps hum's index (so the result aligns with
# hum by label, not by position) and yields NaN for any value that does not
# look like "p.<Aaa><pos><Bbb>" instead of failing on a row with a different
# number of pieces.
df = hum["AA_Change"].str.extract(
    r"^p\.(?P<C1>[A-Za-z]{3})(?P<C2>[0-9]+)(?P<C3>[A-Za-z]{3})$", expand=True
)
# Three-letter codes missing from AMINO_CODE map to NaN, which propagates
# into a NaN MUTANT for that row.
df["C1"] = df["C1"].map(AMINO_CODE)
df["C3"] = df["C3"].map(AMINO_CODE)
mutant = (hum["Swiss_Prot_AC"] + "-" + df["C2"] + "-" + df["C1"] + "-" + df["C3"]).rename("MUTANT")

In [11]:
# Attach the MUTANT key column-wise and keep only the columns used downstream.
# `axis` is passed by keyword: the positional form is deprecated and is not
# accepted by recent pandas releases.
hum_final = pd.concat([hum, mutant], axis=1)[["Main_gene_name", "Type_of_variant", "MUTANT", "AA_Change", "dbSNP"]]

In [34]:
# Check that the MUTANT key has the expected <accession>-<position>-<ref>-<alt> form.
hum_final.head()

Unnamed: 0,Main_gene_name,Type_of_variant,MUTANT,AA_Change,dbSNP
0,A1BG,Polymorphism,P04217-52-H-R,p.His52Arg,rs893184
1,A1BG,Polymorphism,P04217-395-H-R,p.His395Arg,rs2241788
2,A1CF,Polymorphism,Q9NQ94-555-V-M,p.Val555Met,rs9073
3,A1CF,Polymorphism,Q9NQ94-558-A-S,p.Ala558Ser,rs11817448
4,A2ML1,Polymorphism,A8K2U0-207-G-R,p.Gly207Arg,rs11047499


In [13]:
# (rows, columns) of hum_final at this point in the notebook.
hum_final.shape

(76730, 5)

In [12]:
# Sanity check on fully duplicated rows: within each MUTANT group there
# should be a single dbSNP id. Any group with more than one distinct id is
# printed, followed by "ERROR!". No output means the check passed.
duplicated_rows = hum_final[hum_final.duplicated(keep=False)]
for _, rs_ids in duplicated_rows.groupby("MUTANT")["dbSNP"]:
    rs_counts = rs_ids.value_counts()
    if rs_counts.shape[0] > 1:
        print(rs_counts)
        print("ERROR!")

In [13]:
# Count of rows with a missing Humsavar label.
hum_final.Type_of_variant.isnull().sum()

0

In [14]:
# Drop rows that are identical across all five columns (first occurrence kept).
hum_final = hum_final.drop_duplicates()

In [15]:
# (rows, columns) after removing exact duplicate rows.
hum_final.shape

(75802, 5)

### Levanto tabla clinvar de Santi

In [16]:
# Load the ClinVar mutant table: tab-separated, no header row, so the two
# column names are supplied here.
clinvar = pd.read_csv(
    "../data/interim/mut-clinvar.tab.gz",
    sep="\t",
    header=None,
    names=["MUTANT", "Type_of_variant"],
)

In [17]:
# (rows, columns) of the ClinVar table as loaded.
clinvar.shape

(7631, 2)

In [18]:
# First rows: MUTANT key plus the ClinVar clinical-significance label.
clinvar.head()

Unnamed: 0,MUTANT,Type_of_variant
0,Q14896-432-A-T,Uncertain significance
1,Q86YC2-862-K-N,Uncertain significance
2,Q07889-663-I-V,Likely benign
3,Q14896-429-A-E,Uncertain significance
4,P12883-204-R-C,Pathogenic


In [19]:
# Five most frequent ClinVar labels.
clinvar.Type_of_variant.value_counts().head()

Uncertain significance    2722
Pathogenic                2435
not provided              1029
Likely pathogenic          627
Likely benign              154
Name: Type_of_variant, dtype: int64

In [20]:
# Remove exact duplicate (MUTANT, label) rows, keeping the first occurrence.
clinvar = clinvar.drop_duplicates()

In [21]:
# (rows, columns) after removing exact duplicate rows.
clinvar.shape

(7580, 2)

In [22]:
# Count of ClinVar rows with a missing label.
clinvar.Type_of_variant.isnull().sum()

0

### Combino la tabla humsavar y clinvar para ver clasificaciones

In [23]:
# Outer join on the MUTANT key so variants present in only one source are
# kept; the shared Type_of_variant column is disambiguated by suffix.
combined = clinvar.merge(
    hum_final,
    on="MUTANT",
    how="outer",
    suffixes=["_ClinVar", "_HumsaVar"],
)

In [24]:
# First rows of the outer join; NaN in the Humsavar columns means the mutant is ClinVar-only.
combined.head()

Unnamed: 0,MUTANT,Type_of_variant_ClinVar,Main_gene_name,Type_of_variant_HumsaVar,AA_Change,dbSNP
0,Q14896-432-A-T,Uncertain significance,,,,
1,Q86YC2-862-K-N,Uncertain significance,,,,
2,Q07889-663-I-V,Likely benign,,,,
3,Q14896-429-A-E,Uncertain significance,,,,
4,P12883-204-R-C,Pathogenic,,,,


#### Intersección

In [25]:
# Rows that carry a label from both ClinVar and Humsavar.
has_clinvar = combined["Type_of_variant_ClinVar"].notnull()
has_humsavar = combined["Type_of_variant_HumsaVar"].notnull()
combined[has_clinvar & has_humsavar].shape

(2831, 6)

In [26]:
# Among ClinVar "not provided" entries, count how many have no Humsavar information.
not_provided = combined[combined["Type_of_variant_ClinVar"] == "not provided"]
not_provided[["Main_gene_name", "Type_of_variant_HumsaVar"]].isnull().sum()

Main_gene_name              684
Type_of_variant_HumsaVar    684
dtype: int64

### Merge con tabla VarQ

In [27]:
# Load the tab-separated VarQ property table.
varq_path = "../data/processed/properties-varq.tab.gz"
varq = pd.read_csv(varq_path, sep="\t")

In [28]:
# (rows, columns) of the VarQ table.
varq.shape

(17869, 12)

In [29]:
# Left join keeps every VarQ row. A MUTANT that occurs more than once in
# `combined` produces one output row per match, so the result can have more
# rows than `varq`.
combined_varq = varq.merge(combined, on="MUTANT", how="left")

In [30]:
# VarQ columns followed by the ClinVar/Humsavar columns brought in by the merge.
combined_varq.columns

Index(['MUTANT', 'SASA', 'SASA_PERCENTAGE', 'BFACTOR', 'SWITCHBILITY',
       'AGGREGABILITY', 'CONSERVATION', '3DID', 'PDB', 'ACTIVE_SITE',
       'VARIATION_ENERGY', 'TYPE', 'Type_of_variant_ClinVar', 'Main_gene_name',
       'Type_of_variant_HumsaVar', 'AA_Change', 'dbSNP'],
      dtype='object')

In [31]:
# (rows, columns) after the left join; compare the row count with varq.shape.
combined_varq.shape

(17893, 17)

In [32]:
# Contingency table: Humsavar label (rows) against the VarQ TYPE (columns).
humsavar_labeled = combined_varq[combined_varq["Type_of_variant_HumsaVar"].notnull()]
humsavar_labeled.groupby(["Type_of_variant_HumsaVar", "TYPE"]).size().unstack(fill_value=0)

TYPE,Benign,Pathogenic
Type_of_variant_HumsaVar,Unnamed: 1_level_1,Unnamed: 2_level_1
Disease,0,4675
Polymorphism,1917,66
Unclassified,553,60


In [33]:
# Contingency table: ClinVar label (rows) against the VarQ TYPE (columns); first five labels only.
clinvar_labeled = combined_varq[combined_varq["Type_of_variant_ClinVar"].notnull()]
clinvar_labeled.groupby(["Type_of_variant_ClinVar", "TYPE"]).size().unstack(fill_value=0).head()

TYPE,Benign,Pathogenic
Type_of_variant_ClinVar,Unnamed: 1_level_1,Unnamed: 2_level_1
Affects,0,3
Benign,123,15
Benign;Likely benign,36,3
Benign;Likely benign;Likely pathogenic;Uncertain significance,0,1
Benign;Likely benign;Pathogenic,0,3


In [34]:
# VarQ rows with no label from either ClinVar or Humsavar.
no_clinvar = combined_varq["Type_of_variant_ClinVar"].isnull()
no_humsavar = combined_varq["Type_of_variant_HumsaVar"].isnull()
unlabeled = combined_varq[no_clinvar & no_humsavar]

In [35]:
unlabeled.shape # Number of MUTANTs with no classification

(5863, 17)

In [36]:
# Number of rows whose MUTANT key repeats an earlier row of combined_varq.
combined_varq.MUTANT.duplicated().sum()

24

In [37]:
# Rows per Humsavar label; value_counts skips missing values by default.
combined_varq["Type_of_variant_HumsaVar"].value_counts()

Disease         4675
Polymorphism    1983
Unclassified     613
Name: Type_of_variant_HumsaVar, dtype: int64

## Ver si los unlabeled aparecen en la tabla variants

### Tabla de variantes que encontré en el sitio ClinVar

In [3]:
# Load the ClinVar variant_summary dump (tab-separated).
# low_memory=False makes pandas infer each column's dtype from the whole
# file rather than chunk by chunk, which avoids mixed-type columns and the
# warning that comes with them.
# NOTE(review): this is an absolute path inside one user's home directory, so
# the notebook is not reproducible elsewhere; consider moving the file next
# to the other inputs under ../data/ and updating the path.
variants = pd.read_csv(
    "/home/marlan/Downloads/variant_summary.txt.gz",
    sep="\t",
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [10]:
# Keep only single nucleotide variants.
is_snv = variants["Type"] == "single nucleotide variant"
variants = variants[is_snv]

In [45]:
# Show full cell contents (e.g. the long Name and OtherIDs strings) instead of
# truncating them. None is the supported "no limit" value; the former -1
# sentinel is deprecated.
pd.set_option('display.max_colwidth', None)
variants.head(25)

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),...,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories
4,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,85342440,G,A,15q25,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (protein):Q92610#VAR_064583",1
5,15043,single nucleotide variant,NM_014630.2(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0,"Jun 29, 2015",150829393,...,84799209,G,A,15q25.3,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613624.0001,UniProtKB (protein):Q92610#VAR_064583",1
6,15044,single nucleotide variant,NM_017547.3(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606829,...,126145284,C,T,11q24,no assertion criteria provided,1,,N,OMIM Allelic Variant:613622.0001,1
7,15044,single nucleotide variant,NM_017547.3(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606829,...,126275389,C,T,11q24.2,no assertion criteria provided,1,,N,OMIM Allelic Variant:613622.0001,1
8,15045,single nucleotide variant,NM_017547.3(FOXRED1):c.1289A>G (p.Asn430Ser),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606830,...,126147412,A,G,11q24,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613622.0002,UniProtKB (protein):Q96CU9#VAR_064571",1
9,15045,single nucleotide variant,NM_017547.3(FOXRED1):c.1289A>G (p.Asn430Ser),55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606830,...,126277517,A,G,11q24.2,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613622.0002,UniProtKB (protein):Q96CU9#VAR_064571",1
10,15046,single nucleotide variant,NM_025152.2(NUBPL):c.166G>A (p.Gly56Arg),80224,NUBPL,HGNC:20278,Uncertain significance,0,"Feb 22, 2017",200401432,...,32031331,G,A,14q12,"criteria provided, single submitter",3,,N,"OMIM Allelic Variant:613621.0001,UniProtKB (protein):Q8TB37#VAR_064570",3
11,15046,single nucleotide variant,NM_025152.2(NUBPL):c.166G>A (p.Gly56Arg),80224,NUBPL,HGNC:20278,Uncertain significance,0,"Feb 22, 2017",200401432,...,31562125,G,A,14q12,"criteria provided, single submitter",3,,N,"OMIM Allelic Variant:613621.0001,UniProtKB (protein):Q8TB37#VAR_064570",3
15,15053,single nucleotide variant,NM_000410.3(HFE):c.892+48G>A,3077,HFE,HGNC:4886,Benign,0,"Nov 01, 1999",1800758,...,26093236,G,A,6p22.2,no assertion criteria provided,1,,N,OMIM Allelic Variant:613609.0004,1
16,15053,single nucleotide variant,NM_000410.3(HFE):c.892+48G>A,3077,HFE,HGNC:4886,Benign,0,"Nov 01, 1999",1800758,...,26093008,G,A,6p22.2,no assertion criteria provided,1,,N,OMIM Allelic Variant:613609.0004,1


In [12]:
# Count single nucleotide variants whose dbSNP field holds the -1 placeholder.
rs_is_minus_one = variants["RS# (dbSNP)"].eq(-1)
type_is_snv = variants["Type"].eq("single nucleotide variant")
(rs_is_minus_one & type_is_snv).sum()

7568

In [13]:
# (rows, columns) of the variants table.
variants.shape

(533427, 30)

In [14]:
# Five most frequent ClinicalSignificance values.
variants.ClinicalSignificance.value_counts().head()

Uncertain significance                          226504
Likely benign                                   99855 
Pathogenic                                      62303 
Benign                                          43962 
Conflicting interpretations of pathogenicity    24331 
Name: ClinicalSignificance, dtype: int64

In [15]:
# Column names available in the variants table.
variants.columns

Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start', 'Stop', 'ReferenceAllele', 'AlternateAllele',
       'Cytogenetic', 'ReviewStatus', 'NumberSubmitters', 'Guidelines',
       'TestedInGTR', 'OtherIDs', 'SubmitterCategories'],
      dtype='object')

In [41]:
# Rebuild the "<UniProt AC>-<position>-<ref aa>-<alt aa>" key from the ClinVar
# variants table. The patterns are raw strings so that \d and \( reach the
# regex engine as written; in a normal string literal they are invalid escape
# sequences and Python warns about them.
#
# First <Aaa><digits><Bbb> occurrence in Name, e.g. "Gly1046Arg" in "(p.Gly1046Arg)".
amino = variants.Name.str.extract(r'(?P<C1>[A-Z]{1}[a-z]{2})(?P<C2>\d+)(?P<C3>[A-Z]{1}[a-z]{2})', expand=True)
# Accession between "(protein):" and "#" in OtherIDs.
# NOTE(review): the character class also accepts a literal comma; confirm
# that is intended rather than a stray separator.
protein = variants.OtherIDs.str.extract(r'\(protein\):(?P<Protein>[A-Z,\d]+)#', expand=True)
# Codes missing from AMINO_CODE (e.g. "Ter") map to NaN and give a NaN key.
amino["C1"] = amino.C1.map(AMINO_CODE)
amino["C3"] = amino.C3.map(AMINO_CODE)
# NOTE: this rebinds `mutant`, replacing the Humsavar key series built earlier.
mutant = pd.Series(data=(protein.Protein + "-" + amino.C2 + "-" + amino.C1 + "-" + amino.C3).tolist(), name="MUTANT")

In [42]:
# Discard rows for which no complete key could be built.
mutant = mutant.dropna()

In [43]:
# Number of non-null keys built from the variants table.
mutant.shape

(34519,)

In [39]:
# Number of distinct MUTANT values in hum_final.
hum_final.MUTANT.drop_duplicates().shape

(75769,)

In [44]:
# Number of hum_final rows whose MUTANT key also appears in `mutant`.
# NOTE(review): this expression never references `unlabeled`, so on its own
# it does not answer whether unlabeled mutants appear in the variants table.
# Confirm whether `unlabeled.MUTANT.isin(mutant).sum()` was the intended check.
hum_final.MUTANT.isin(mutant).sum()

17866

### No, no hay ni un unlabeled que podamos clasificar usando variants.

## Genero Ground Truth para ClinVar y para HumsaVar

In [38]:
# Split combined_varq into the rows labelled by each source; a row labelled
# by both sources appears in both frames.
has_humsavar_label = combined_varq["Type_of_variant_HumsaVar"].notnull()
has_clinvar_label = combined_varq["Type_of_variant_ClinVar"].notnull()
humsavar_GT = combined_varq[has_humsavar_label]
clinvar_GT = combined_varq[has_clinvar_label]

In [39]:
# Every VarQ column except its own TYPE label (sorted by Index.difference),
# followed by the Humsavar label and the dbSNP id.
varq_feature_cols = list(varq.columns.difference(["TYPE"]))
cols = varq_feature_cols + ["Type_of_variant_HumsaVar", "dbSNP"]

In [40]:
# Column names of the VarQ table, for reference when selecting features.
varq.columns

Index(['MUTANT', 'SASA', 'SASA_PERCENTAGE', 'BFACTOR', 'SWITCHBILITY',
       'AGGREGABILITY', 'CONSERVATION', '3DID', 'PDB', 'ACTIVE_SITE',
       'VARIATION_ENERGY', 'TYPE'],
      dtype='object')

In [41]:
# Humsavar ground truth: the selected columns for rows that carry a Humsavar
# label, with that label exposed under the name TYPE.
# It is derived from combined_varq in one expression, so the cell can be
# re-run safely; the previous form renamed a slice in place and failed on a
# second run because the source column no longer existed.
humsavar_GT = (
    combined_varq.loc[combined_varq["Type_of_variant_HumsaVar"].notnull(), cols]
    .rename(columns={"Type_of_variant_HumsaVar": "TYPE"})
)

In [42]:
# Write the ClinVar ground truth as a gzip-compressed, tab-separated file.
# `cols` was built for Humsavar (it carries Type_of_variant_HumsaVar and
# dbSNP), so using it here would write the file without the ClinVar label.
# Select the VarQ columns plus the ClinVar label instead, and expose the
# label as TYPE, as is done for the Humsavar ground truth.
# NOTE(review): confirm that downstream readers expect the label column to
# be called TYPE.
clinvar_cols = varq.columns.difference(["TYPE"]).tolist() + ["Type_of_variant_ClinVar"]
(
    clinvar_GT[clinvar_cols]
    .rename(columns={"Type_of_variant_ClinVar": "TYPE"})
    .set_index("MUTANT")
    .to_csv("../data/processed/clinvar_gt.tab.gz", sep="\t", compression="gzip", index=True)
)

In [43]:
# Write the Humsavar ground truth as a gzip-compressed CSV indexed by MUTANT.
humsavar_by_mutant = humsavar_GT.set_index("MUTANT")
humsavar_by_mutant.to_csv(
    "../data/processed/humsavar_varq_gt.csv.gz",
    compression="gzip",
    index=True,
)

In [44]:
# (rows, columns) of the Humsavar ground truth once MUTANT is the index.
humsavar_GT.set_index("MUTANT").shape

(7271, 12)

In [51]:
# (rows, columns) of the ClinVar ground truth once MUTANT is the index.
# Uses the VarQ columns plus the ClinVar label; `cols` is the Humsavar
# selection and does not include the ClinVar label.
clinvar_GT[varq.columns.difference(["TYPE"]).tolist() + ["Type_of_variant_ClinVar"]].set_index("MUTANT").shape

(7590, 11)