# Summary

Load data from the abdellah_2009 study.

Only $\Delta T_{m}$ is reported, and it is hard to convert this to $\Delta \Delta G$.

# Imports

In [1]:
# Name of this notebook; assigned before imports.ipynb is run in the next cell.
NOTEBOOK_NAME = 'process_small_studies_data'

In [3]:
# Run the companion imports notebook.
# NOTE(review): presumably this is what provides `pd`, `sa` and `ascommon` used in later cells — confirm.
%run imports.ipynb

protein_folding_energy
2016-05-10 01:22:13.556520


# Load data

In [6]:
# List the contents of ../downloads (abdellah_2009.tsv.gz is read in the next cell).
!ls ../downloads

abdellah_2009.tsv.gz  kineticdb.html  ProTherm.dat.gz	 taipale
kineticdb2.html       pfd_all.html    rosetta_ddg.zip
kineticdb_all.html    pfd.html	      small_studies.ods


In [7]:
# Load the abdellah_2009 table and derive the columns used to map it to UniProt.
abdellah_et_al = pd.read_csv('../downloads/abdellah_2009.tsv.gz', sep='\t')
abdellah_et_al['remarks'] = 'Allali-Hassani et.al. Biochem. J. 2009'
# 'Protein' values look like '<GENE>_<MUTATION>' (e.g. 'INMT_F254C'); the gene is the first token.
abdellah_et_al['gene_name'] = abdellah_et_al['Protein'].apply(lambda x: x.split('_')[0].lower())

# Keep only rows with a dbSNP identifier; copy so the new columns do not touch the full table.
abdellah_et_al_mut = abdellah_et_al.dropna(subset=['dbSNP']).copy()
abdellah_et_al_mut['uniprot_mutation'] = abdellah_et_al_mut['Protein'].apply(lambda x: x.split('_')[-1].upper())

# The source table writes negative values with a Unicode minus sign (U+2212), so
# 'ΔTagg' is read as text. Normalise the sign and convert to float so that
# 'dTm_median' is a numeric column rather than a column of strings.
delta_tagg = abdellah_et_al_mut['ΔTagg']
if delta_tagg.dtype == object:
    delta_tagg = delta_tagg.str.replace('\u2212', '-')
abdellah_et_al_mut['dTm_median'] = pd.to_numeric(delta_tagg)

In [8]:
# Preview the first rows of the mutations that have a dbSNP identifier.
abdellah_et_al_mut.head()

Unnamed: 0,Protein,dbSNP,Highest allele frequency,Tagg,ΔTagg,Structural mapping,SNPs3D Sequence analysis,SNPs3D Structural analysis,remarks,gene_name,uniprot_mutation,dTm_median
1,INMT_F254C,rs4720015,0.922,44.8±0.1,−5.4,Buried,−1.82,−1.02,Allali-Hassani et.al. Biochem. J. 2009,inmt,F254C,−5.4
3,HRMT1L3_L440V,rs3758805,0.136,51.6±0.1,−3.2,Buried,−0.48,0.95,Allali-Hassani et.al. Biochem. J. 2009,hrmt1l3,L440V,−3.2
4,HRMT1L3_N508S,rs6483700,0.178,54.6±0.2,−0.2,Exposed,0.99,0.4,Allali-Hassani et.al. Biochem. J. 2009,hrmt1l3,N508S,−0.2
5,HRMT1L3_S470C,rs11025585,<0.003,49.0±0.2,−5.8,Exposed,−2.36,0.91,Allali-Hassani et.al. Biochem. J. 2009,hrmt1l3,S470C,−5.8
7,SRM_L149V,rs1049932,<0.003,56.4±0.2,2.6,Buried,1.3,0.76,Allali-Hassani et.al. Biochem. J. 2009,srm,L149V,2.6


In [9]:
# Fetch variant annotations and UniProt sequences for the mutations from MySQL.
import os

# The connection URL can be overridden through the ELASPIC_DB_URL environment
# variable so that credentials do not have to live in the notebook. The fallback
# keeps the original connection so the cell behaves as before when it is unset.
# NOTE(review): the fallback still embeds a username/password — consider removing it.
engine = sa.create_engine(
    os.environ.get('ELASPIC_DB_URL', 'mysql://elaspic:elaspic@192.168.6.19/'))

# Pre-build the "a', 'b', 'c" fragments that are placed between the quotes of
# `in ('...')`. Values are de-duplicated and sorted so the generated SQL is
# deterministic from run to run.
# NOTE(review): values are interpolated into the SQL text unescaped; this is only
# acceptable because they come from the local abdellah_2009 table.
dbsnp_in_list = "', '".join(sorted(set(abdellah_et_al_mut['dbSNP'].values)))
gene_name_in_list = "', '".join(sorted(set(abdellah_et_al_mut['gene_name'].values)))

# 1. Variant scores joined to UniProt sequences, looked up by dbSNP id.
sql_query = """
select *
from mutation.ensembl_76_missense_variants_all_scores
join uniprot_kb.uniprot_sequence using (uniprot_id)
where variation_name in ('{}') ;
""".format(dbsnp_in_list)
variations = pd.read_sql_query(sql_query, engine)
# Drop the database's own gene_name / uniprot_mutation columns so they do not
# clash with the columns of the same name in abdellah_et_al_mut when merging.
variations = variations.drop(['gene_name', 'uniprot_mutation'], axis=1)

# 2. UniProt sequences looked up by treating the study's gene name as an identifier_id.
sql_query = """
select *
from uniprot_kb.uniprot_identifier
join uniprot_kb.uniprot_sequence using (uniprot_id)
where identifier_id in ('{0}')
and db = 'sp' ;
""".format(gene_name_in_list)
sequences_1 = pd.read_sql_query(sql_query, engine)
# Lower-case to match the lower-cased gene_name column of abdellah_et_al_mut.
sequences_1['gene_name'] = sequences_1['identifier_id'].str.lower()

# 3. UniProt sequences looked up by the gene_name column of uniprot_sequence.
sql_query = """
select *
from uniprot_kb.uniprot_sequence
where gene_name in ('{0}')
and db = 'sp' ;
""".format(gene_name_in_list)
sequences_2 = pd.read_sql_query(sql_query, engine)
sequences_2['gene_name'] = sequences_2['gene_name'].str.lower()

In [15]:
# Map each mutation to a UniProt id by stacking three independent lookups:
# by dbSNP id, by UniProt identifier, and by UniProt gene name.
by_dbsnp = abdellah_et_al_mut.merge(variations, left_on=['dbSNP'], right_on=['variation_name'])
by_identifier = abdellah_et_al_mut.merge(sequences_1, on='gene_name')
by_gene_name = abdellah_et_al_mut.merge(sequences_2, on='gene_name')
abdellah_et_al_up_mut = pd.concat(
    [by_dbsnp, by_identifier, by_gene_name], ignore_index=True)

# Flag, per row, whether the mutation is compatible with the attached sequence
# (as judged by ascommon's mutation_matches_sequence helper).
abdellah_et_al_up_mut['mutation_in_sequence'] = [
    ascommon.sequence_tools.mutation_matches_sequence(mutation, sequence)
    for mutation, sequence in zip(
        abdellah_et_al_up_mut['uniprot_mutation'],
        abdellah_et_al_up_mut['uniprot_sequence'])
]
# UniProt ids containing '-' are flagged as splice variants.
abdellah_et_al_up_mut['is_splice_variant'] = abdellah_et_al_up_mut['uniprot_id'].str.contains('-')

# Keep matching, non-splice-variant rows, then one row per (uniprot_id, mutation).
keep_row = (
    abdellah_et_al_up_mut['mutation_in_sequence'] &
    ~abdellah_et_al_up_mut['is_splice_variant'])
abdellah_et_al_up_mut = (
    abdellah_et_al_up_mut[keep_row]
    .drop_duplicates(subset=['uniprot_id', 'uniprot_mutation'])
)



In [16]:
# Preview the first rows of the mutations after mapping to UniProt and filtering.
abdellah_et_al_up_mut.head()

Unnamed: 0,Highest allele frequency,Protein,SNPs3D Sequence analysis,SNPs3D Structural analysis,Structural mapping,Tagg,dTm_median,db,dbSNP,ensp_id,ensp_mutation,f_condel_score,f_fathmm_score,f_ma_score,f_pph2_score,f_sift_score,gene_name,id,identifier_id,identifier_type,ma_ma_prediction,ma_ma_score,organism_name,p_provean_prediction,p_provean_score,p_sift_prediction,p_sift_score,protein_existence,protein_name,remarks,sequence_version,transcript_variation_id,tv_pph2_prediction,tv_pph2_score,tv_sift_prediction,tv_sift_score,uniprot_id,uniprot_mutation,uniprot_name,uniprot_sequence,variant,variation_feature_id,variation_name,ΔTagg,mutation_in_sequence,is_splice_variant
0,0.922,INMT_F254C,−1.82,−1.02,Buried,44.8±0.1,−5.4,sp,rs4720015,ENSP00000013222,F254C,0.393354,3.75,2.595,0.995,0.09,inmt,,,,medium,2.595,Homo sapiens,Deleterious,-4.998,Tolerated,0.061,1,Indolethylamine N-methyltransferase,Allali-Hassani et.al. Biochem. J. 2009,3,144233665,probably damaging,0.965,deleterious,0.02,O95050,F254C,INMT_HUMAN,MKGGFTGGDEYQKHFLPRDYLATYYSFDGSPSPEAEMLKFNLECLH...,"7,30755820,T,G",3840839,rs4720015,−5.4,True,False
2,0.136,HRMT1L3_L440V,−0.48,0.95,Buried,51.6±0.1,−3.2,sp,rs3758805,ENSP00000331879,L440V,0.603823,-2.97,2.51,0.676,0.1,hrmt1l3,,,,medium,2.51,Homo sapiens,Deleterious,-2.522,Damaging,0.013,1,Protein arginine N-methyltransferase 3,Allali-Hassani et.al. Biochem. J. 2009,3,20750086,probably damaging,0.968,deleterious,0.0,O60678,L440V,ANM3_HUMAN,MCSLASGATGGRGAVENEEDLPELSDSGDEAAWEDEDDADLPHGKQ...,"11,20464517,C,G",2798596,rs3758805,−3.2,True,False
4,<0.003,HRMT1L3_S470C,−2.36,0.91,Exposed,49.0±0.2,−5.8,sp,rs11025585,ENSP00000331879,S470C,0.707102,-2.22,3.435,0.999,0.0,hrmt1l3,,,,medium,3.435,Homo sapiens,Deleterious,-4.298,Damaging,0.001,1,Protein arginine N-methyltransferase 3,Allali-Hassani et.al. Biochem. J. 2009,3,20775357,probably damaging,1.0,deleterious,0.0,O60678,S470C,ANM3_HUMAN,MCSLASGATGGRGAVENEEDLPELSDSGDEAAWEDEDDADLPHGKQ...,"11,20494177,C,G",7411378,rs11025585,−5.8,True,False
6,<0.003,SRM_L149V,1.3,0.76,Buried,56.4±0.2,2.6,sp,rs1049932,ENSP00000366156,L149V,0.422192,-0.91,-0.37,0.054,1.0,srm,,,,neutral,-0.37,Homo sapiens,Neutral,0.205,Tolerated,1.0,1,Spermidine synthase,Allali-Hassani et.al. Biochem. J. 2009,1,66009244,benign,0.065,tolerated,1.0,P19623,L149V,SPEE_HUMAN,MEPGPDGPAASGPAAIREGWFRETCSLWPGQALSLQVEQLLHHRRS...,"1,11056694,G,C",804889,rs1049932,2.6,True,False
10,0.058,SIRT5_F285L,−0.1,−0.99,Buried flexible loop,53.1±0.1,1.3,sp,rs9464003,ENSP00000476228,F285L,,,,,,sirt5,,,,low,1.635,Homo sapiens,,,,,1,"NAD-dependent protein deacylase sirtuin-5, mit...",Allali-Hassani et.al. Biochem. J. 2009,2,146616898,benign,0.088,tolerated,0.17,Q9NXA8,F285L,SIR5_HUMAN,MRPLQIVPSRLISQLYCGLKPPASTRNQICLKMARPSSSMADFRKF...,"6,13600947,C,A",6138633,rs9464003,1.3,True,False
