# Summary

Link to paper: http://www.sciencedirect.com/science/article/pii/S0092867415004304

----

# Imports

In [1]:
%run imports.ipynb

2016-07-27 03:47:13.924315


In [2]:
# Per-notebook settings. The name doubles as the working-directory name and is
# exported, together with the database port, through the process environment.
# NOTE(review): DB_PORT is presumably consumed by mysqld.ipynb — confirm.
NOTEBOOK_NAME = 'taipale'

os.environ['NOTEBOOK_NAME'] = NOTEBOOK_NAME
os.environ['DB_PORT'] = '8307'

# Create the working directory last; exist_ok makes the cell safe to re-run.
os.makedirs(NOTEBOOK_NAME, exist_ok=True)

In [3]:
%run mysqld.ipynb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2016-07-27 03:47:14.039881


In [4]:
# Handle to the remote MySQL server, pointed at the `staging` schema.
staging_connection_string = os.environ['DATAPKG_CONNECTION_STRING'] + '/staging'
db_remote = datapkg.MySQL(
    connection_string=staging_connection_string,
    shared_folder=os.environ['NOTEBOOK_NAME'],
    storage_host=None,
    echo=False,
    db_engine='InnoDB',
)

# Load mutation data

In [5]:
ls ../downloads/taipale/

1-s2.0-S0092867415004304-main.pdf  mmc2.xlsx  mmc4.xlsx  mmc6.xlsx  mmc8.pdf
mmc1.xlsx                          mmc3.xlsx  mmc5.xlsx  mmc7.xlsx


In [6]:
mutation_df = pd.read_excel('../downloads/taipale/mmc1.xlsx')

In [7]:
# Preview the raw table and report how many rows it has.
display(mutation_df.head(2))
print(len(mutation_df))

Unnamed: 0,Category,Symbol,Entrez_Gene_ID,Allele_ID,Mutation_RefSeq_NT,Mutation_RefSeq_AA,HGMD_accession,HGMD_variant_class,dbSNP_ID,Disease
0,Disease mutation,A2M,2,2_18118,NM_000014:c.2915G>A,NP_000005:p.C972Y,CM920001,DM,rs1800433,Chronic obstructive pulmonary disease
1,Disease mutation,A2M,2,2_18119,NM_000014:c.2998G>A,NP_000005:p.V1000I,CM980001,DP,rs669,"Alzheimer disease, association with"


2960


### mutation_df_1

In [8]:
# Split e.g. 'NP_000005:p.C972Y' into the protein id ('NP_000005') and the
# bare amino-acid change ('C972Y').
refseq_aa = mutation_df['Mutation_RefSeq_AA']
mutation_df['refseq_id'] = refseq_aa.map(lambda value: value.partition(':')[0])
mutation_df['refseq_mutation'] = refseq_aa.map(lambda value: value.rpartition('.')[2])

In [9]:
# Row count versus the number of distinct nucleotide / amino-acid mutation strings.
display(mutation_df.head(2))
print("Number of rows:", len(mutation_df))
for summary_label, key_column in [
    ("Number of unique RefSeq nucleotide mutations:", 'Mutation_RefSeq_NT'),
    ("Number of unique RefSeq amino acid mutations:", 'Mutation_RefSeq_AA'),
]:
    print(summary_label, len(mutation_df.drop_duplicates(subset=[key_column])))

Unnamed: 0,Category,Symbol,Entrez_Gene_ID,Allele_ID,Mutation_RefSeq_NT,Mutation_RefSeq_AA,HGMD_accession,HGMD_variant_class,dbSNP_ID,Disease,refseq_id,refseq_mutation
0,Disease mutation,A2M,2,2_18118,NM_000014:c.2915G>A,NP_000005:p.C972Y,CM920001,DM,rs1800433,Chronic obstructive pulmonary disease,NP_000005,C972Y
1,Disease mutation,A2M,2,2_18119,NM_000014:c.2998G>A,NP_000005:p.V1000I,CM980001,DP,rs669,"Alzheimer disease, association with",NP_000005,V1000I


Number of rows: 2960
Number of unique RefSeq nucleotide mutations: 2960
Number of unique RefSeq amino acid mutations: 2958


In [10]:
# Show every row whose amino-acid mutation string occurs more than once.
print("Duplicated RefSeq amino acid mutations:")
is_repeat = mutation_df['Mutation_RefSeq_AA'].duplicated()
duplicates = mutation_df.loc[is_repeat, 'Mutation_RefSeq_AA']
mutation_df[mutation_df['Mutation_RefSeq_AA'].isin(duplicates)]

Duplicated RefSeq amino acid mutations:


Unnamed: 0,Category,Symbol,Entrez_Gene_ID,Allele_ID,Mutation_RefSeq_NT,Mutation_RefSeq_AA,HGMD_accession,HGMD_variant_class,dbSNP_ID,Disease,refseq_id,refseq_mutation
1404,Disease mutation,HSPB8,26353,26353_7864,NM_014365:c.423G>C,NP_055180:p.K141N,CM041377,DM,rs104894345,"Neuropathy, distal hereditary motor, type II",NP_055180,K141N
1405,Disease mutation,HSPB8,26353,26353_7865,NM_014365:c.423G>T,NP_055180:p.K141N,CM050270,DM,rs104894345,Charcot-Marie-Tooth disease 2L,NP_055180,K141N
1875,Disease mutation,NIPA1,123606,123606_15185,NM_144599:c.316G>A,NP_653200:p.G106R,CM050748,DM,rs104894490,"Spastic paraplegia, autosomal dominant",NP_653200,G106R
1876,Disease mutation,NIPA1,123606,123606_15186,NM_144599:c.316G>C,NP_653200:p.G106R,CM050749,DM,rs104894490,"Spastic paraplegia, autosomal dominant",NP_653200,G106R


In [11]:
mutation_df_1 = mutation_df.copy()

## NCBI to UniProt

Map NCBI to UniProt.

### mutation_df_2

Use the `uniparc2uniparc_canonical_mapping` table to map RefSeq to UniProt.

In [12]:
mutation_df = mutation_df_1.copy()

In [13]:
# Look up the UniParc entries cross-referenced to each distinct RefSeq protein id.
# NOTE(review): the ids are spliced into the SQL text directly; this relies on
# the spreadsheet values containing no quote characters.
distinct_refseq_ids = mutation_df['refseq_id'].drop_duplicates()
sql_query = """\
select 
x.id refseq_id, 
uniparc_id
from uniparc_human.uniparc_xref x 
where x.type = 'refseq' and x.id in ('{}')
""".format("', '".join(distinct_refseq_ids))

In [14]:
refseq2uniparc = pd.read_sql_query(sql_query, db_remote.engine).drop_duplicates()

In [15]:
# Left join: mutations whose RefSeq id has no UniParc cross-reference are kept
# and end up with a missing uniparc_id.
mutation_df = pd.merge(mutation_df, refseq2uniparc, how='left', on=['refseq_id'])

In [16]:
# Residue number: everything between the first and last character of e.g. 'C972Y'.
mutation_df['refseq_mutation_pos'] = (
    mutation_df['refseq_mutation'].str.slice(1, -1).astype(int)
)

In [17]:
# Upload the distinct (RefSeq id, UniParc id, position) triples as table
# `taipale`; the mapping query in a later cell reads them from `staging.taipale`.
staged_positions = (
    mutation_df[['refseq_id', 'uniparc_id', 'refseq_mutation_pos']]
    .dropna()
    .drop_duplicates()
)
t = db_remote.import_df(staged_positions, 'taipale')

# Index both column orders.
index_specs = [
    (['refseq_id', 'uniparc_id'], False),
    (['uniparc_id', 'refseq_id'], False),
]
db_remote.create_indexes(t.name, index_specs)

In [18]:
# For every staged (uniparc_id, refseq_id, position) row, follow the UniParc ->
# canonical-UniParc mapping to its Swiss-Prot cross-reference and fetch that
# UniProt sequence. FIND_IN_SET returns 0 when the position is not listed in
# `a2b`, so a uniprot_mutation_pos of 0 means "position could not be mapped".
# NOTE(review): `a2b` is presumably a comma-separated position map stored in
# the mapping table — confirm against the schema.
sql_query = """\
SELECT
t.uniparc_id,
t.refseq_id,
t.refseq_mutation_pos,
us.uniprot_id,
us.uniprot_sequence,
FIND_IN_SET(refseq_mutation_pos, a2b) uniprot_mutation_pos
FROM staging.taipale t
JOIN uniparc_human.uniparc2uniparc_canonical_mapping m USING (uniparc_id)
JOIN uniparc_human.uniparc_xref x2 ON (
    x2.type = 'UniProtKB/Swiss-Prot' AND x2.uniparc_id = m.uniparc_canonical_id)
JOIN uniprot_kb.uniprot_sequence us ON (us.db = 'sp' and us.uniprot_id = x2.id);\
"""
print(sql_query)

SELECT
t.uniparc_id,
t.refseq_id,
t.refseq_mutation_pos,
us.uniprot_id,
us.uniprot_sequence,
FIND_IN_SET(refseq_mutation_pos, a2b) uniprot_mutation_pos
FROM staging.taipale t
JOIN uniparc_human.uniparc2uniparc_canonical_mapping m USING (uniparc_id)
JOIN uniparc_human.uniparc_xref x2 ON (
    x2.type = 'UniProtKB/Swiss-Prot' AND x2.uniparc_id = m.uniparc_canonical_id)
JOIN uniprot_kb.uniprot_sequence us ON (us.db = 'sp' and us.uniprot_id = x2.id);


In [19]:
refseq2uniprot = pd.read_sql_query(sql_query, db_remote.engine).drop_duplicates()

In [20]:
refseq2uniprot.head(2)

Unnamed: 0,uniparc_id,refseq_id,refseq_mutation_pos,uniprot_id,uniprot_sequence,uniprot_mutation_pos
0,UPI000006E60E,NP_005661,279,B3EWF7,MHPKEGAEQHVFSPVPGAPTPPPNRCGRLVLGPRLPAAGTPGPGIR...,0
1,UPI000006E60E,NP_005661,294,B3EWF7,MHPKEGAEQHVFSPVPGAPTPPPNRCGRLVLGPRLPAAGTPGPGIR...,0


In [21]:
# Left join on the staged key; mutations without a UniProt mapping keep NaN in
# the UniProt columns.
mapping_key = ['uniparc_id', 'refseq_id', 'refseq_mutation_pos']
mutation_df = pd.merge(mutation_df, refseq2uniprot, how='left', on=mapping_key)

In [22]:
# Re-express the mutation in UniProt numbering: same wild-type and mutant
# residues, position replaced by the mapped one. A missing position stays NaN,
# which makes the concatenated result NaN as well.
wild_type_residue = mutation_df['refseq_mutation'].str[0]
mutant_residue = mutation_df['refseq_mutation'].str[-1]
uniprot_position = mutation_df['uniprot_mutation_pos'].map(
    lambda pos: str(int(pos)) if pd.notnull(pos) else np.nan
)
mutation_df['uniprot_mutation'] = wild_type_residue + uniprot_position + mutant_residue

In [23]:
# Row and distinct-mutation counts after the RefSeq -> UniProt mapping.
# Each entry: (label, de-duplication key, extra values printed after the count).
summary_rows = [
    ("Number of unique RefSeq nucleotide mutations:", ['Mutation_RefSeq_NT'], ()),
    ("Number of unique RefSeq amino acid mutations:", ['Mutation_RefSeq_AA'], ("<--",)),
    ("Number of unique RefSeq / Uniprot nucleotide mutations:", ['Mutation_RefSeq_NT', 'uniprot_id'], ()),
    ("Number of unique RefSeq / Uniprot amino acid mutations:", ['Mutation_RefSeq_AA', 'uniprot_id'], ()),
]
print("Number of rows:", len(mutation_df))
for summary_label, key_columns, trailing in summary_rows:
    print(summary_label, len(mutation_df.drop_duplicates(subset=key_columns)), *trailing)

Number of rows: 3853
Number of unique RefSeq nucleotide mutations: 2960
Number of unique RefSeq amino acid mutations: 2958 <--
Number of unique RefSeq / Uniprot nucleotide mutations: 3662
Number of unique RefSeq / Uniprot amino acid mutations: 3660


In [24]:
mutation_df_2 = mutation_df.copy()

### mutation_df_failed

In [25]:
# 2958 is the number of distinct Mutation_RefSeq_AA values printed for the raw
# table; the left joins above must not have lost any of them.
assert mutation_df['Mutation_RefSeq_AA'].drop_duplicates().shape[0] == 2958

In [26]:
# Bookkeeping frame: one row per mapped mutation, later annotated with the
# reason (if any) it dropped out of the pipeline.
failure_report_columns = ['Mutation_RefSeq_AA', 'uniparc_id', 'uniprot_id', 'uniprot_mutation']
mutation_df_failed = mutation_df[failure_report_columns].copy()

In [27]:
# The bookkeeping frame must cover exactly the mutations of the raw table.
assert set(mutation_df_failed['Mutation_RefSeq_AA']) == set(mutation_df_1['Mutation_RefSeq_AA'])

In [28]:
# Key identifying one (RefSeq mutation, UniParc entry, UniProt entry) row.
# Concatenation propagates missing values, so the key is NaN whenever
# uniparc_id or uniprot_id is missing.
refseq_part, uniparc_part, uniprot_part = (
    mutation_df_failed[key_column]
    for key_column in ('Mutation_RefSeq_AA', 'uniparc_id', 'uniprot_id')
)
mutation_df_failed['unique_id'] = refseq_part + '.' + uniparc_part + '.' + uniprot_part

In [29]:
mutation_df_failed['errors'] = ''

In [30]:
# Seed the `errors` column with the failures that are already known.
# Each assignment overwrites the previous one, so the later reasons below take
# precedence over the earlier ones for rows that match several conditions.
mutation_df_failed.loc[mutation_df_failed['uniprot_id'].isnull(), 'errors'] = (
    "Could not map UniParc to UniProt. "
)
mutation_df_failed.loc[mutation_df_failed['uniparc_id'].isnull(), 'errors'] = (
    "Could not map RefSeq to UniParc. "
)

# A change such as 'p.K141K' leaves the amino acid unchanged. That is a
# synonymous mutation, not a missense one, so label it accordingly.
aa_change = mutation_df_failed['Mutation_RefSeq_AA'].apply(
    lambda value: value.split(':')[-1].split('.')[-1]
)
is_synonymous = aa_change.apply(lambda change: change[0] == change[-1])
mutation_df_failed.loc[is_synonymous, 'errors'] = (
    "Synonymous mutation. "
)

In [31]:
mutation_df_failed[mutation_df_failed['errors'] != ''].tail()

Unnamed: 0,Mutation_RefSeq_AA,uniparc_id,uniprot_id,uniprot_mutation,unique_id,errors
3841,NP_008896:p.S220N,UPI0000161F67,,,,Could not map UniParc to UniProt.
3842,NP_067645:p.S472P,UPI000006F57B,,,,Could not map UniParc to UniProt.
3844,NP_067645:p.R501S,UPI000006F57B,,,,Could not map UniParc to UniProt.
3846,NP_067645:p.V524I,UPI000006F57B,,,,Could not map UniParc to UniProt.
3851,NP_065838:p.K135N,UPI000035E7B3,,,,Could not map UniParc to UniProt.


In [32]:
mutation_df_failed.shape

(3853, 6)

## Mutation matches sequence

Make sure that the AA in UniProt sequence matches the AA in the RefSeq mutation.

### mutation_df_3

In [33]:
mutation_df = mutation_df_2.copy()

In [34]:
(mutation_df['refseq_mutation'].str[0] == mutation_df['refseq_mutation'].str[-1]).sum()

48

In [35]:
# Drop synonymous changes (wild-type residue == mutant residue).
# The explicit .copy() makes the result an independent frame, so adding the
# `mutation_matches_sequence` column in the next cell does not raise
# SettingWithCopyWarning.
is_amino_acid_change = (
    mutation_df['refseq_mutation'].str[0] != mutation_df['refseq_mutation'].str[-1]
)
mutation_df = mutation_df[is_amino_acid_change].copy()

In [36]:
# Row-wise check of each UniProt-numbered mutation against the UniProt sequence.
# NOTE(review): presumably True only when the wild-type residue is found at the
# given position — confirm in kmtools.sequence_tools.
sequence_check_inputs = mutation_df[['uniprot_mutation', 'uniprot_sequence']]
mutation_df['mutation_matches_sequence'] = sequence_check_inputs.apply(
    lambda row: kmtools.sequence_tools.mutation_matches_sequence(
        row['uniprot_mutation'], row['uniprot_sequence']
    ),
    axis=1,
)

In [37]:
(mutation_df['mutation_matches_sequence'] != True).sum()

837

In [38]:
# Keep only rows where the check returned exactly True; the `== True`
# comparison also discards any non-boolean results.
sequence_confirmed = mutation_df['mutation_matches_sequence'] == True
mutation_df = mutation_df[sequence_confirmed]

In [39]:
# Row and distinct-mutation counts after the sequence check.
# Each entry: (label, de-duplication key, extra values printed after the count).
summary_rows = [
    ("Number of unique RefSeq nucleotide mutations:", ['Mutation_RefSeq_NT'], ()),
    ("Number of unique RefSeq amino acid mutations:", ['Mutation_RefSeq_AA'], ("<--",)),
    ("Number of unique RefSeq / Uniprot nucleotide mutations:", ['Mutation_RefSeq_NT', 'uniprot_id'], ()),
    ("Number of unique RefSeq / Uniprot amino acid mutations:", ['Mutation_RefSeq_AA', 'uniprot_id'], ()),
]
print("Number of rows:", len(mutation_df))
for summary_label, key_columns, trailing in summary_rows:
    print(summary_label, len(mutation_df.drop_duplicates(subset=key_columns)), *trailing)

Number of rows: 2968
Number of unique RefSeq nucleotide mutations: 2853
Number of unique RefSeq amino acid mutations: 2851 <--
Number of unique RefSeq / Uniprot nucleotide mutations: 2867
Number of unique RefSeq / Uniprot amino acid mutations: 2865


In [40]:
mutation_df_3 = mutation_df.copy()

In [41]:
# Keys present after stage 2 but absent after stage 3. Stage 3 removed both
# synonymous rows and rows failing the sequence check, so both kinds get the
# message appended below. Keys are NaN where uniparc_id / uniprot_id is missing.
keys_stage_2 = set(
    mutation_df_2['Mutation_RefSeq_AA'] + '.' + mutation_df_2['uniparc_id'] + '.' +
    mutation_df_2['uniprot_id']
)
keys_stage_3 = set(
    mutation_df_3['Mutation_RefSeq_AA'] + '.' + mutation_df_3['uniparc_id'] + '.' +
    mutation_df_3['uniprot_id']
)
failed_3 = keys_stage_2 - keys_stage_3

# Append (do not overwrite) so earlier reasons are preserved.
lost_at_stage_3 = mutation_df_failed['unique_id'].isin(failed_3)
mutation_df_failed.loc[lost_at_stage_3, 'errors'] = (
    mutation_df_failed.loc[lost_at_stage_3, 'errors'] +
    "RefSeq protein and UniProt protein do not have the same sequence. "
)

## Protein has domain(s)

Select only those mutations that fall inside a protein domain.

### mutation_df_4

In [42]:
mutation_df = mutation_df_3.copy()

In [43]:
# Fetch domain definitions (and model domain definitions, where a model row
# exists) for every UniProt protein that still carries a mutation.
proteins_with_mutations = set(mutation_df['uniprot_id'].values)
sql_query = """\
select uniprot_domain_id, uniprot_id, domain_def, model_domain_def
from elaspic.uniprot_domain
join elaspic.uniprot_domain_template using (uniprot_domain_id)
left join elaspic.uniprot_domain_model using (uniprot_domain_id)
where uniprot_id in ('{}');
""".format("', '".join(proteins_with_mutations))
uniprot_domain_model = pd.read_sql_query(sql_query, db_remote.engine)

In [44]:
# Inner join: mutations in proteins without any domain row are dropped, and a
# mutation is repeated once per domain of its protein.
mutation_df = pd.merge(mutation_df, uniprot_domain_model, on=['uniprot_id'])

In [45]:
# First pass: test each mutation against the *model* domain definition.
# NOTE(review): this calls `ascommon.sequence_tools`, while the sequence check
# earlier uses `kmtools.sequence_tools` — confirm both modules are available.
model_domain_inputs = mutation_df[['uniprot_mutation', 'model_domain_def']]
mutation_df['mutation_in_domain'] = model_domain_inputs.apply(
    lambda row: ascommon.sequence_tools.mutation_in_domain(
        row['uniprot_mutation'], row['model_domain_def']
    ),
    axis=1,
)

In [46]:
# Second pass: where the first pass produced no answer, fall back to the plain
# domain definition.
needs_fallback = mutation_df['mutation_in_domain'].isnull()
mutation_df.loc[needs_fallback, 'mutation_in_domain'] = (
    mutation_df.loc[needs_fallback, ['uniprot_mutation', 'domain_def']]
    .apply(
        lambda row: ascommon.sequence_tools.mutation_in_domain(
            row['uniprot_mutation'], row['domain_def']
        ),
        axis=1,
    )
)

In [47]:
# Row and distinct-mutation counts after joining the domain definitions.
# Each entry: (label, de-duplication key, extra values printed after the count).
summary_rows = [
    ("Number of unique RefSeq nucleotide mutations:", ['Mutation_RefSeq_NT'], ()),
    ("Number of unique RefSeq amino acid mutations:", ['Mutation_RefSeq_AA'], ("<--",)),
    ("Number of unique RefSeq / Uniprot nucleotide mutations:", ['Mutation_RefSeq_NT', 'uniprot_id'], ()),
    ("Number of unique RefSeq / Uniprot amino acid mutations:", ['Mutation_RefSeq_AA', 'uniprot_id'], ()),
]
print("Number of rows:", len(mutation_df))
for summary_label, key_columns, trailing in summary_rows:
    print(summary_label, len(mutation_df.drop_duplicates(subset=key_columns)), *trailing)

Number of rows: 4134
Number of unique RefSeq nucleotide mutations: 2502
Number of unique RefSeq amino acid mutations: 2501 <--
Number of unique RefSeq / Uniprot nucleotide mutations: 2512
Number of unique RefSeq / Uniprot amino acid mutations: 2511


Another **~350** mutations (2851 → 2501 unique amino acid mutations) are lost here because they fall in proteins that do not have *any* structural domains.

In [48]:
mutation_df_4 = mutation_df.copy()

In [49]:
# Keys present after stage 3 but absent after stage 4 (the inner join with the
# domain table).
keys_stage_3 = set(
    mutation_df_3['Mutation_RefSeq_AA'] + '.' + mutation_df_3['uniparc_id'] + '.' +
    mutation_df_3['uniprot_id']
)
keys_stage_4 = set(
    mutation_df_4['Mutation_RefSeq_AA'] + '.' + mutation_df_4['uniparc_id'] + '.' +
    mutation_df_4['uniprot_id']
)
failed_4 = keys_stage_3 - keys_stage_4

# Append (do not overwrite) so earlier reasons are preserved.
lost_at_stage_4 = mutation_df_failed['unique_id'].isin(failed_4)
mutation_df_failed.loc[lost_at_stage_4, 'errors'] = (
    mutation_df_failed.loc[lost_at_stage_4, 'errors'] +
    "Protein has no structural domains. "
)

## Mutation in domain

### mutation_df_5

In [50]:
mutation_df = mutation_df_4.copy()

In [51]:
# Keep only the (mutation, domain) rows where the mutation lies inside the domain.
inside_domain = mutation_df['mutation_in_domain']
mutation_df = mutation_df[inside_domain]

In [52]:
# Row and distinct-mutation counts after keeping in-domain mutations only.
# Each entry: (label, de-duplication key, extra values printed after the count).
summary_rows = [
    ("Number of unique RefSeq nucleotide mutations:", ['Mutation_RefSeq_NT'], ()),
    ("Number of unique RefSeq amino acid mutations:", ['Mutation_RefSeq_AA'], ("<--",)),
    ("Number of unique RefSeq / Uniprot nucleotide mutations:", ['Mutation_RefSeq_NT', 'uniprot_id'], ()),
    ("Number of unique RefSeq / Uniprot amino acid mutations:", ['Mutation_RefSeq_AA', 'uniprot_id'], ()),
]
print("Number of rows:", len(mutation_df))
for summary_label, key_columns, trailing in summary_rows:
    print(summary_label, len(mutation_df.drop_duplicates(subset=key_columns)), *trailing)

Number of rows: 2016
Number of unique RefSeq nucleotide mutations: 1944
Number of unique RefSeq amino acid mutations: 1943 <--
Number of unique RefSeq / Uniprot nucleotide mutations: 1952
Number of unique RefSeq / Uniprot amino acid mutations: 1951


Another **~550** mutations don't fall inside a domain for which we have a structural model (mostly because there is no structural domain in that region, but also because we didn't get around to making that homology model yet).

In [53]:
mutation_df_5 =  mutation_df.copy()

In [54]:
# Keys present after stage 4 but absent after stage 5 (the in-domain filter).
keys_stage_4 = set(
    mutation_df_4['Mutation_RefSeq_AA'] + '.' + mutation_df_4['uniparc_id'] + '.' +
    mutation_df_4['uniprot_id']
)
keys_stage_5 = set(
    mutation_df_5['Mutation_RefSeq_AA'] + '.' + mutation_df_5['uniparc_id'] + '.' +
    mutation_df_5['uniprot_id']
)
failed_5 = keys_stage_4 - keys_stage_5

# Append (do not overwrite) so earlier reasons are preserved.
lost_at_stage_5 = mutation_df_failed['unique_id'].isin(failed_5)
mutation_df_failed.loc[lost_at_stage_5, 'errors'] = (
    mutation_df_failed.loc[lost_at_stage_5, 'errors'] +
    "Mutation falls outside of all structural domains. "
)

In [55]:
mutation_df.shape

(2016, 23)

## ELASPIC

This is just to create a spreadsheet for Taipale.

### Load ELASPIC mutations

In [56]:
# Pull the ELASPIC results for every (uniprot_id, mutation) pair that survived
# all filters. Each pair is rendered as a SQL tuple literal.
id_mutation_tuples = (
    mutation_df_5[['uniprot_id', 'uniprot_mutation']]
    .apply(
        lambda pair: "('{}', '{}')".format(pair.iloc[0], pair.iloc[1]),
        axis=1,
    )
)
sql_query = """
select *
from elaspic.uniprot_domain_mutation
where (uniprot_id, mutation) in ({})
""".format(', '.join(id_mutation_tuples))

uniprot_domain_mutation = pd.read_sql_query(sql_query, db_remote.engine)

In [57]:
display(uniprot_domain_mutation.head(2))
uniprot_domain_mutation.shape

Unnamed: 0,uniprot_id,uniprot_domain_id,mutation,mutation_errors,model_filename_wt,model_filename_mut,chain_modeller,mutation_modeller,stability_energy_wt,stability_energy_mut,physchem_wt,physchem_wt_ownchain,physchem_mut,physchem_mut_ownchain,matrix_score,secondary_structure_wt,solvent_accessibility_wt,secondary_structure_mut,solvent_accessibility_mut,provean_score,ddg,mut_date_modified
0,O00142,60063967,H121N,,O00142_H121N/WT_RepairPDB_O00142_1j90A_1.pdb,O00142_H121N/MUT_RepairPDB_O00142_1j90A_1.pdb,A,H74N,"94.3439,-128.609,-56.6024,-235.416,-6.26075,32...","95.8511,-128.56,-55.735,-234.802,-6.26087,324....",0,451,0,451,0.0,H,2.69926,H,3.42948,-6.265,0.680524,2015-01-28 21:20:09
1,O00142,60063967,L215P,,O00142_L215P/WT_RepairPDB_O00142_1j90A_1.pdb,O00142_L215P/MUT_RepairPDB_O00142_1j90A_1.pdb,A,L168P,"94.6253,-128.711,-56.6448,-235.487,-6.22968,32...","98.9871,-126.901,-55.9311,-234.63,-6.16603,322...",0,42,0,42,-3.0,H,8.54224,H,17.9768,-6.515,4.399,2015-05-31 21:44:02


(1951, 22)

In [58]:
mutation_df_failed.head(2)

Unnamed: 0,Mutation_RefSeq_AA,uniparc_id,uniprot_id,uniprot_mutation,unique_id,errors
0,NP_000005:p.C972Y,UPI0000001C94,,,,Could not map UniParc to UniProt. RefSeq prote...
1,NP_000005:p.C972Y,UPI0000155718,P01023,C972Y,NP_000005:p.C972Y.UPI0000155718.P01023,


In [59]:
# 'uniprot_id.mutation' keys of rows with no recorded error so far ...
rows_without_error = mutation_df_failed[mutation_df_failed['errors'] == '']
nonfailed = set(
    rows_without_error['uniprot_id'] + '.' + rows_without_error['uniprot_mutation']
)
# ... and the keys for which ELASPIC returned a result.
elaspic_mutations = set(
    uniprot_domain_mutation['uniprot_id'] + '.' + uniprot_domain_mutation['mutation']
)

In [60]:
# Mutations that survived every filter and have no recorded error, yet are
# missing from the ELASPIC results.
final_stage_keys = set(mutation_df_5['uniprot_id'] + '.' + mutation_df_5['uniprot_mutation'])
failed_6 = (final_stage_keys - elaspic_mutations) & nonfailed

In [61]:
# Number of mutations that passed every filter but have no ELASPIC result.
print2("Unexplaind failures:", len(failed_6))
# NOTE(review): "37" presumably records the count from an earlier run; the
# output saved with this cell reports 2.

Unexplaind failures:                                        2


In [62]:
failed_6

{'P02746.G42D', 'P35247.T180A'}

### Calculate missing

### Iterate on the above


    ...

### mutation_df_failed

In [None]:
# Label the mutations in `failed_6`. Unlike the earlier stages, this assignment
# replaces the `errors` value instead of appending to it.
failed_row_keys = mutation_df_failed['uniprot_id'] + '.' + mutation_df_failed['uniprot_mutation']
has_no_elaspic_result = failed_row_keys.isin(failed_6)
mutation_df_failed.loc[has_no_elaspic_result, 'errors'] = (
    "Error making homology model (probably low sequence identity). "
)

In [None]:
# The bookkeeping frame must still cover exactly the mutations of the raw table.
assert set(mutation_df_failed['Mutation_RefSeq_AA']) == set(mutation_df_1['Mutation_RefSeq_AA'])

# Load chaperone interaction data

### mmc2_s2a

In [None]:
mmc2_s2a = pd.read_excel('../downloads/taipale/mmc2.xlsx', 'Table S2A')

In [None]:
# Preview Table S2A, then print its row count and its number of distinct
# nucleotide mutations.
display(mmc2_s2a.head(2))
print(len(mmc2_s2a))
print(len(mmc2_s2a.drop_duplicates(subset=['Mutation_RefSeq_NT'])))

# Every mutation in Table S2A must also be present in the main mutation table.
assert set(mmc2_s2a['Mutation_RefSeq_NT']) <= set(mutation_df_1['Mutation_RefSeq_NT'])

In [None]:
# Remove the descriptive columns in place; `Mutation_RefSeq_NT` is kept and is
# the column indexed when this table is saved.
# NOTE: in-place drop — re-running this cell raises because the columns are gone.
s2a_redundant_columns = pd.Index(
    ['Category', 'Symbol', 'Entrez_Gene_ID', 'Allele_ID', 'Mutation_RefSeq_AA']
)
mmc2_s2a.drop(s2a_redundant_columns, axis=1, inplace=True)

# Load FoldX predictions

### mmc2_s2b

In [None]:
mmc2_s2b = pd.read_excel('../downloads/taipale/mmc2.xlsx', 'Table S2B')

In [None]:
# Preview Table S2B.
display(mmc2_s2b.head(2))

# Every mutation in Table S2B must also be present in the main mutation table.
assert set(mmc2_s2b['Mutation_RefSeq_NT']) <= set(mutation_df_1['Mutation_RefSeq_NT'])

In [None]:
# Remove the descriptive columns in place; `Mutation_RefSeq_NT` is kept and is
# the column indexed when this table is saved.
# NOTE: in-place drop — re-running this cell raises because the columns are gone.
s2b_redundant_columns = pd.Index(
    ['Category', 'Symbol', 'Entrez_Gene_ID', 'Allele_ID', 'Mutation_RefSeq_AA']
)
mmc2_s2b.drop(s2b_redundant_columns, axis=1, inplace=True)

# Save

### mutation_df_5

In [None]:
mutation_df = mutation_df_5.copy()

In [None]:
# Both filter flags must be set for every row that is about to be saved.
for required_flag in ('mutation_matches_sequence', 'mutation_in_domain'):
    assert mutation_df[required_flag].all()

In [None]:
mutation_df[['uniprot_id', 'uniprot_mutation']].head()

In [None]:
mutation_df.columns = datapkg.format_columns(mutation_df.columns)

In [None]:
display(mutation_df.head(2))
print(mutation_df.shape[0])

In [None]:
# Columns written to the database, grouped by origin.
# Columns taken over from the supplementary table (names already reformatted).
source_columns = [
    'category', 'symbol', 'entrez_gene_id', 'allele_id',
    'mutation_refseq_nt', 'mutation_refseq_aa', 'hgmd_accession',
    'hgmd_variant_class', 'db_snp_id', 'disease',
]
# RefSeq info (processed)
refseq_columns = ['refseq_id', 'refseq_mutation']
# UniProt info
uniprot_columns = [
    'uniparc_id', 'uniprot_id', 'uniprot_mutation', 'uniprot_domain_id', 'model_domain_def',
]
columns_to_keep = source_columns + refseq_columns + uniprot_columns

In [None]:
mutation_df[columns_to_keep].head(2)

In [None]:
# Total rows, distinct nucleotide mutations, then distinct (id, mutation) pairs
# for RefSeq and UniProt ids.
print(len(mutation_df))
print(len(mutation_df.drop_duplicates(subset=['mutation_refseq_nt'])))
for heading, id_column in [('RefSeq...', 'refseq_id'), ('Uniprot...', 'uniprot_id')]:
    print(heading)
    for mutation_column in ('refseq_mutation', 'mutation_refseq_nt'):
        print(len(mutation_df.drop_duplicates(subset=[id_column, mutation_column])))

In [None]:
# Store the selected columns of the final mutation table as `taipale`.
taipale_export = mutation_df[columns_to_keep]
t = db.import_df(taipale_export, 'taipale')

In [None]:
db.add_idx_column(t.name)

In [None]:
# Index the saved `taipale` table on its lookup keys. Each entry pairs the
# indexed columns with a boolean flag.
# NOTE(review): the flag presumably marks the index as unique — only the
# (mutation_refseq_nt, uniparc_id, uniprot_id) index passes True. Confirm
# against datapkg. The entries also mix [tuple, flag] and (list, flag) shapes.
db.create_indexes(
    t.name, 
    [
        [('refseq_id', 'refseq_mutation'), False],
        [('uniprot_id', 'uniprot_mutation'), False],
        [('uniprot_id', 'mutation_refseq_nt'), False],
        [('uniprot_domain_id', 'uniprot_mutation'), False],
        (['mutation_refseq_nt', 'uniparc_id', 'uniprot_id'], True),
        (['mutation_refseq_aa'], False),
    ]
)

### mmc2_s2a

In [None]:
df = mmc2_s2a.copy()

In [None]:
df.columns = datapkg.format_columns(df.columns)

In [None]:
df.head()

In [None]:
# Mutant-minus-wild-type differences for both score types.
df['elisa_score_diff'] = df['mut_elisa_score'].sub(df['wt_elisa_score'])
df['interaction_score_diff'] = df['mut_interaction_score'].sub(df['wt_interaction_score'])

In [None]:
df.head()

In [None]:
# Store the Table S2A frame (with the score differences) as `taipale_chaperone`.
chaperone_table_name = 'taipale_chaperone'
t = db.import_df(df, chaperone_table_name)

In [None]:
db.add_idx_column(t.name)

In [None]:
# Index on the nucleotide mutation string.
nt_index = (['mutation_refseq_nt'], False)
db.create_indexes(t.name, [nt_index])

### mmc2_s2b

In [None]:
df = mmc2_s2b.copy()

In [None]:
df.columns = datapkg.format_columns(df.columns)

In [None]:
df.head()

In [None]:
df.head()

In [None]:
# Store the Table S2B frame as `taipale_foldx`.
foldx_table_name = 'taipale_foldx'
t = db.import_df(df, foldx_table_name)

In [None]:
db.add_idx_column(t.name)

In [None]:
# Index on the nucleotide mutation string.
nt_index = (['mutation_refseq_nt'], False)
db.create_indexes(t.name, [nt_index])

### mutation_df_failed

In [None]:
Counter(mutation_df_failed['errors'])

In [None]:
# Only rows with a recorded failure reason are exported.
has_error = mutation_df_failed['errors'] != ''
df = mutation_df_failed[has_error].copy()

In [None]:
df.columns = datapkg.format_columns(df.columns)

In [None]:
df.head()

In [None]:
# Store the failure report as `taipale_failed`.
failed_table_name = 'taipale_failed'
t = db.import_df(df, failed_table_name)

In [None]:
db.add_idx_column(t.name)

In [None]:
# Index on the amino-acid mutation string.
aa_index = (['mutation_refseq_aa'], False)
db.create_indexes(t.name, [aa_index])