# Load data from ProTherm

In [1]:
# Assign `__file__` by hand to the notebook's base name.
# NOTE(review): presumably consumed by the shared setup executed in the next cell
# (e.g. to derive paths or log names) — confirm against common_imports.ipynb.
__file__ = 'process_protherm_data'

In [2]:
%run common_imports.ipynb

2016-04-10 18:06:51.187473


In [20]:
#%% Manually correct some errors
def _fill_missing_uniprot_annotation(df, protein_name, uniprot_name, uniprot_id):
    """Fill in the UniProt name / id of `protein_name` wherever they are null.

    `df` is modified in place. Only null cells are written, so annotations
    that are already present are never overwritten.
    """
    is_protein = df['protein_name'] == protein_name
    df.loc[is_protein & pd.isnull(df['uniprot_name']), 'uniprot_name'] = uniprot_name
    df.loc[is_protein & pd.isnull(df['uniprot_id']), 'uniprot_id'] = uniprot_id


# One record per protein: (protein_name, uniprot_name, uniprot_id)
manual_uniprot_annotations = [
    ('Acylphosphatase', 'ACYP1_HUMAN', 'P07311'),
    ('Alkaline phosphatase', 'PPB_YEAST', 'P11491'),
    ('Arginine kinase', 'KARG_DROME', 'P48610'),
    ('Eglin C', 'ICIC_HIRME', 'P01051'),
]
for annotation in manual_uniprot_annotations:
    _fill_missing_uniprot_annotation(protherm_df, *annotation)

# Acylphosphatase: copy the reported mutation into `uniprot_mutation`.
# The 'mutation' column is selected explicitly; the previous code assigned the
# whole row subset (a DataFrame with every column) to this single column.
# NOTE(review): assumes the ProTherm numbering of this protein matches the
# UniProt numbering — confirm.
is_acylphosphatase_mutant = (
    pd.notnull(protherm_df['mutation']) &
    (protherm_df['protein_name'] == 'Acylphosphatase')
)
mutations = protherm_df.loc[is_acylphosphatase_mutant, 'mutation']
protherm_df.loc[is_acylphosphatase_mutant, 'uniprot_mutation'] = mutations


In [21]:
#%% Format error fields

# Prefer ddG_H2O when it is available; otherwise fall back to ddG
protherm_df['ddg_best'] = [
    ddg_h2o if pd.notnull(ddg_h2o) else ddg
    for ddg_h2o, ddg in zip(protherm_df['ddG_H2O'], protherm_df['ddG'])
]

# Reverse the sign because protherm defines ddG and dTm as (mutant) - (wildtype)
# http://www.abren.net/protherm/protherm_knownproblems.php
# NOTE: re-running this cell flips the sign back again.
change_columns = ['dTm', 'ddG', 'ddG_H2O', 'ddg_best']
protherm_df[change_columns] = -protherm_df[change_columns]


def format_errors(error_field):
    """Condense a raw error field into a '; '-prefixed list of unique messages.

    The field is split on ':'. Whenever a piece ends with 'Error', the piece
    that follows it is taken as the error message. A trailing 'Line' marker and
    the ';' separator in front of it are removed, every
    'Cannot convert entry ... to float' message is collapsed into one generic
    message, and duplicates are dropped (first occurrence wins).

    Parameters
    ----------
    error_field : str, None or NaN
        Raw content of the `errors` column for one row.

    Returns
    -------
    str
        '' for an empty or missing field; otherwise '; ' followed by the unique
        messages joined with '; '. A non-empty field that contains no
        '...Error' piece therefore yields '; ', which is still different
        from ''.
    """
    # Missing values (None / NaN) carry no error information.
    if not error_field or pd.isnull(error_field):
        return ''
    error_messages = []
    error_subfields = [e.strip().strip(';') for e in error_field.split(':')]
    for label, error_message in zip(error_subfields, error_subfields[1:]):
        if not label.endswith('Error'):
            continue
        # Remove only a literal 'Line' suffix. `str.rstrip('Line')` would strip
        # any trailing run of the characters L/i/n/e and mangle messages that
        # merely end in one of those letters (e.g. '...residue').
        if error_message.endswith('Line'):
            error_message = error_message[:-len('Line')]
        # Drop the '; ' separator that preceded the 'Line' marker.
        error_message = error_message.strip().strip(';').strip()
        if re.search('Cannot convert entry .* to float', error_message):
            error_message = 'Cannot convert entry to float'
        if error_message not in error_messages:
            error_messages.append(error_message)
    return '; ' + '; '.join(error_messages)


def classify_erros(error_messages):
    """Map a formatted error-message string to ``[error_code, error_category]``.

    The rules are checked in order and the first one whose marker text occurs
    in `error_messages` wins. An empty value means the row mapped successfully;
    anything that matches no rule is reported as a generic mapping error.
    """
    if not error_messages:
        return [0, 'Mapped successfully']

    # (marker substrings, code, category) in decreasing priority
    rules = (
        (('Wild-type protein',), 1, 'Wild-type'),
        (('No ddG and dTm score provided',), 1, 'No ddG and dTm scores'),
        (('Keeping only single mutation variants',
          'Only considering single amino acid substitutions'), 2, 'Multiple mutations'),
    )
    for markers, code, category in rules:
        if any(marker in error_messages for marker in markers):
            return [code, category]
    return [10, 'Mapping error']


def get_idx_where_better_exists(protherm_df):
    """Return index labels of remarked rows that have a remark-free equivalent.

    Rows are matched on the (uniprot_id, uniprot_mutation) pair. A row with a
    non-null `remarks` value is reported when

    * it has a `ddg_best` value and some row without remarks has a `ddg_best`
      value for the same pair, or
    * it has no `ddg_best` but a `dTm` value and some row without remarks has
      a `dTm` value for the same pair.

    Nothing is removed here; the caller decides what to do with the labels.

    Returns
    -------
    list
        Index labels, without duplicates, in no particular order.
    """
    has_remarks = pd.notnull(protherm_df['remarks'])
    has_ddg = pd.notnull(protherm_df['ddg_best'])
    has_dtm = pd.notnull(protherm_df['dTm'])

    def mutation_keys(rows):
        # One (uniprot_id, uniprot_mutation) tuple per row
        return rows[['uniprot_id', 'uniprot_mutation']].apply(tuple, axis=1)

    # Pairs for which a measurement without remarks exists
    clean_ddg_keys = set(mutation_keys(protherm_df[~has_remarks & has_ddg]))
    clean_dtm_keys = set(mutation_keys(protherm_df[~has_remarks & has_dtm]))

    row_keys = mutation_keys(protherm_df)
    superseded_ddg = protherm_df[
        has_remarks & has_ddg & row_keys.isin(clean_ddg_keys)
    ].index
    superseded_dtm = protherm_df[
        has_remarks & ~has_ddg & has_dtm & row_keys.isin(clean_dtm_keys)
    ].index

    return list(set(superseded_ddg) | set(superseded_dtm))


# Start from the errors recorded in the `errors` column ...
protherm_df['error_messages'] = protherm_df['errors'].apply(format_errors)

# ... then append one flag for every sanity check a row fails.
# Each entry pairs a per-row "fails the check" mask with the flag to append.
failure_flags = [
    (protherm_df['mutation'] == 'wild',
     '; Wild-type protein'),
    (pd.isnull(protherm_df['ddg_best']) & pd.isnull(protherm_df['dTm']),
     '; No ddG and dTm score provided'),
    (pd.isnull(protherm_df['uniprot_id']),
     '; Uniprot id is missing'),
    (pd.isnull(protherm_df['uniprot_mutation']),
     '; Uniprot mutation is missing'),
]
for row_fails, flag in failure_flags:
    flagged_messages = protherm_df['error_messages'] + flag
    protherm_df['error_messages'] = (
        protherm_df['error_messages'].mask(row_fails, flagged_messages)
    )

# Flag remarked rows for which a measurement without remarks exists
idxs_to_drop = get_idx_where_better_exists(protherm_df)
protherm_df.loc[idxs_to_drop, 'error_messages'] += '; A better version of this mutation exists'

# Check some of the `idxs_to_drop` mutations
#protherm_df.loc[16389, ['uniprot_id', 'uniprot_mutation', 'ddg_best', 'dTm', 'remarks', 'error_messages']]
#protherm_df[
#    (protherm_df['uniprot_id'] == 'P00720') &
#    (protherm_df['uniprot_mutation'] == 'V111A')
#][['uniprot_id', 'uniprot_mutation', 'ddg_best', 'dTm', 'remarks', 'error_messages']]

# Split the [code, category] pairs into two columns
error_codes, error_categories = zip(*protherm_df['error_messages'].apply(classify_erros))
protherm_df['error_code'] = error_codes
protherm_df['error_category'] = error_categories



In [None]:
#%% The same protein-mutation pair may occur multiple times
# Keep only the non-wild-type rows that accumulated no error message
protherm_df_good = protherm_df[
    (protherm_df['mutation'] != 'wild') &
    (protherm_df['error_messages'] == '')
]

# Average the thermodynamic parameters over all occurrences of the same protein-mutation
data_columns = ['Tm', 'dG', 'dG_H2O', 'dHvH', 'dTm', 'ddG', 'ddG_H2O', 'ddg_best']
protherm_df_good_gp = protherm_df_good.groupby(['uniprot_id', 'uniprot_mutation'])


def _aggregate_data_columns(func, suffix):
    """Aggregate `data_columns` per group with `func`, appending `suffix` to each column name."""
    return (
        protherm_df_good_gp[data_columns]
        .agg(func)
        .rename(columns=lambda c: c + suffix if c in data_columns else c)
    )


# Every frame below is indexed by (uniprot_id, uniprot_mutation), so the merges
# simply line the per-group results up side by side.
protherm_df_good_unique = (
    protherm_df_good_gp
    # Keep all original values of each group as a tuple per column
    .agg(tuple)
    .merge(_aggregate_data_columns(np.nanmean, '_mean'), left_index=True, right_index=True)
    .merge(_aggregate_data_columns(np.nanstd, '_std'), left_index=True, right_index=True)
    .merge(_aggregate_data_columns(np.nanmedian, '_median'), left_index=True, right_index=True)
    .merge(
        # Number of rows per group. `.agg({'count': len})` relied on dict-based
        # renaming, which pandas >= 1.0 rejects with a SpecificationError.
        protherm_df_good_gp['mutation'].size().to_frame('count'),
        left_index=True, right_index=True)
    .merge(
        # NOTE(review): if `.agg(tuple)` above already produced a 'remarks'
        # column, this merge yields 'remarks_x' / 'remarks_y' — confirm which
        # one downstream code expects before simplifying.
        protherm_df_good_gp
        .agg({'remarks': lambda x: tuple(x)}),
        left_index=True, right_index=True)
    .reset_index()
)

# Use ddG_H2O if it is available, else ddG
def get_first_not_null(row):
    """Return the first element of `row` that is not null, or NaN when there is none."""
    return next((value for value in row if pd.notnull(value)), np.nan)

# For each statistic take the ddG_H2O value when present, otherwise the ddG value.
# Each entry is (target column, [preferred source, fallback source]).
ddg_fallback_columns = [
    ('ddg_all_mean', ['ddG_H2O_mean', 'ddG_mean']),
    ('ddg_all_median', ['ddG_H2O_median', 'ddG_median']),
    ('ddg_all_std', ['ddG_H2O_std', 'ddG_std']),
]
for target_column, source_columns in ddg_fallback_columns:
    protherm_df_good_unique[target_column] = (
        protherm_df_good_unique[source_columns].apply(get_first_not_null, axis=1)
    )

# Every (uniprot_id, uniprot_mutation) pair must now occur exactly once
assert not protherm_df_good_unique.duplicated(subset=['uniprot_id', 'uniprot_mutation']).any()




In [None]:
# NOTE(review): looks like a leftover sanity check — it only prints `True` when
# this code runs as the top-level module; confirm it can be removed.
if __name__ == '__main__':
    print(True)

In [None]:
#%% Add sequence information to each mutated protein
import os

# The connection URL embeds credentials, so allow it to be supplied through the
# UNIPROT_KB_DB_URL environment variable instead of living in the notebook.
# NOTE(review): the fallback keeps the previously hard-coded URL so existing
# runs behave the same; remove it once the variable is set everywhere.
uniprot_kb_url = os.environ.get(
    'UNIPROT_KB_DB_URL', 'mysql://elaspic:elaspic@192.168.6.19/uniprot_kb')
engine = sa.create_engine(uniprot_kb_url)

# Fetch the sequence of every protein that has at least one usable mutation.
# The ids are interpolated into the query text; they come from `protherm_df`,
# not from user input.
sql_query = """
select *
from uniprot_kb.uniprot_sequence
where uniprot_id in ('{}') ;
""".format("', '".join(protherm_df_good_unique['uniprot_id'].drop_duplicates()))
uniprot_sequences = pd.read_sql_query(sql_query, engine)

# NOTE(review): this is an inner merge, so mutations whose protein has no row in
# `uniprot_sequences` are dropped silently — confirm that is intended.
protherm_df_good_unique_wseq = protherm_df_good_unique.merge(uniprot_sequences, on=['uniprot_id'])

# Every mutation must be consistent with the sequence it was mapped to
protherm_df_good_unique_wseq['sequence_match'] = [
    parsers.mutation_in_sequence(uniprot_mutation, uniprot_sequence)
    for uniprot_mutation, uniprot_sequence in zip(
        protherm_df_good_unique_wseq['uniprot_mutation'],
        protherm_df_good_unique_wseq['uniprot_sequence'])
]
assert all(protherm_df_good_unique_wseq['sequence_match'])



In [None]:
#%% Save every stage of the processed data
# Each frame is pickled below `constants.protherm_data_path`; the path template
# receives `version_suffix` as its only format argument.
pickle_targets = [
    (protherm_df, 'parsed_data{}/protherm_df.pickle'),
    (protherm_df_good, 'parsed_data{}/protherm_df_good.pickle'),
    (protherm_df_good_unique, 'parsed_data{}/protherm_df_good_unique.pickle'),
    (protherm_df_good_unique_wseq, 'parsed_data{}/protherm_df_good_unique_wseq.pickle'),
]
for frame, relative_path_template in pickle_targets:
    frame.to_pickle(
        constants.protherm_data_path + relative_path_template.format(version_suffix))


