# Summary

----


# Imports

In [1]:
NOTEBOOK_NAME = 'protherm'

In [2]:
%run common_imports.ipynb

2016-04-26 18:36:38.482122


# Extract

In [3]:
ls ../downloads/

abdellah_et_al.tsv.gz  kineticdb.html  ProTherm.dat.gz
kineticdb2.html        pfd_all.html    rosetta_ddg.zip
kineticdb_all.html     pfd.html        small_studies.ods


In [4]:
!mkdir $NOTEBOOK_NAME

mkdir: cannot create directory ‘protherm’: File exists


In [5]:
!gzip -dkf ../downloads/ProTherm.deat.gz $NOTEBOOK_NAME/

gzip: ../downloads/ProTherm.deat.gz: No such file or directory
gzip: protherm/ is a directory -- ignored


# Load ProTherm

In [6]:
# Parse protherm into entry chunks
protherm_filename_full = '../downloads/ProTherm.dat.gz'
with gzip.open(protherm_filename_full, 'rt', encoding='cp437') as ifh:
    file_data_chunks = [x.strip() for x in ifh.read().split('//')]

In [7]:
# Convert entry chunks into rows and combine
COLUMNS = []

def get_unique_columns(columns):
    seen = set()
    for c in columns:
        i, c_orig = 0, i
        while c in seen:
            i += 1
            c = c_orig + '_{}'.format(i)
        seen.add(c)
        yield c

def convert_chunks_to_rows(file_data_chunks):
    global COLUMNS
    for i in range(len(file_data_chunks)):
        # Don't return an empty dict for an empty chunk
        if not file_data_chunks[i].strip(' \n'):
            continue
        column = None
        column_old = None
        columns = []
        row = defaultdict(list)
        for line in file_data_chunks[i].split('\n'):
            if line.startswith('***'):
                continue
            column_old = column if column else column_old
            column = line.split(' ')[0]
            column_version = 1
            while column in columns:
                column_version += 1
                column = line.split(' ')[0] + '_{}'.format(column_version)
            value = ' '.join(line.split(' ')[1:]).strip(' \n')
            if not column:
                if not column_old or not value:
                    error_message = (
                        'i: {}, line: {}, column: {}, column_old: {}, value: {}'.format(
                            i, line, column, column_old, value))
                    raise Exception(error_message)
                row[column_old][-1] += ' ' + value
            else:
                row[column].append(value)
                columns.append(column)
        if len(columns) > len(COLUMNS):
            COLUMNS = columns
        yield {key: ','.join(values) for (key, values) in row.items()}

rows = list(convert_chunks_to_rows(file_data_chunks))
df = pd.DataFrame(rows, columns=COLUMNS)

In [8]:
COLUMNS

['NO.',
 'PROTEIN',
 'SOURCE',
 'LENGTH',
 'MOL-WEIGHT',
 'PIR_ID',
 'SWISSPROT_ID',
 'E.C.NUMBER',
 'PMD.NO',
 'PDB_wild',
 'PDB_mutant',
 'MUTATION',
 'MUTATED_CHAIN',
 'NO_MOLECULE',
 'SEC.STR.',
 'ASA',
 'T',
 'pH',
 'BUFFER_NAME',
 'BUFFER_CONC',
 'ION_NAME_1',
 'ION_CONC_1',
 'ADDITIVES',
 'PROTEIN_CONC',
 'MEASURE',
 'METHOD',
 'dG_H2O',
 'ddG_H2O',
 'dG',
 'ddG',
 'Tm',
 'dTm',
 'dHvH',
 'dHcal',
 'm',
 'Cm',
 'dCp',
 'STATE',
 'REVERSIBILITY',
 'ACTIVITY',
 'ACTIVITY_Km',
 'ACTIVITY_Kcat',
 'ACTIVITY_Kd',
 'REVERSIBILITY_2',
 'ACTIVITY_2',
 'ACTIVITY_Km_2',
 'ACTIVITY_Kcat_2',
 'ACTIVITY_Kd_2',
 'KEY_WORDS',
 'REFERENCE',
 'AUTHOR',
 'REMARKS',
 'RELATED_ENTRIES']

In [9]:
renames = {'E.C.NUMBER': 'ec_number'}
df = df.rename(columns=lambda x: x.replace('.', '_').replace('-', '_').strip('_').lower())

In [10]:
df.head()

Unnamed: 0,no,protein,source,length,mol_weight,pir_id,swissprot_id,e_c_number,pmd_no,pdb_wild,pdb_mutant,mutation,mutated_chain,no_molecule,sec_str,asa,t,ph,buffer_name,buffer_conc,ion_name_1,ion_conc_1,additives,protein_conc,measure,method,dg_h2o,ddg_h2o,dg,ddg,tm,dtm,dhvh,dhcal,m,cm,dcp,state,reversibility,activity,activity_km,activity_kcat,activity_kd,reversibility_2,activity_2,activity_km_2,activity_kcat_2,activity_kd_2,key_words,reference,author,remarks,related_entries
0,1,Phospholipase A2,Bovine,130,14536.12,PSBOA,PA21_BOVIN (P00593),EC 3.4.23.4,A930651,1BP2,,wild,-,1,,,30.0,8.0,borate,10 mM,,,"EDTA (0.1 mM),",5 mM,CD,GdnHCl,9.5,,,,,,,,1.47,6.9,,,Unknown,,1.4,675,,,,,,,catalytic triad; PLA2; conformational stabilit...,"J AM CHEM SOC 115, 8523-8526 (1993) PMID:",Li Y. & Tsai M.-D.,"additive : EDTA(0.1 mM),",234
1,2,Phospholipase A2,Bovine,130,14513.08,PSBOA,PA21_BOVIN (P00593),EC 3.4.23.4,A930651,1BP2,,H 48 N,-,1,Helix,17.1,30.0,8.0,borate,10 mM,,,"EDTA (0.1 mM),",5 mM,CD,GdnHCl,6.5,-3.0,,,,,,,1.2,5.4,,,Unknown,,2.6,0.04,,,,,,,catalytic triad; PLA2; conformational stabilit...,"J AM CHEM SOC 115, 8523-8526 (1993) PMID:",Li Y. & Tsai M.-D.,"additive : EDTA(0.1 mM),",134
2,3,Phospholipase A2,Bovine,130,14527.11,PSBOA,PA21_BOVIN (P00593),EC 3.4.23.4,A930651,1BP2,,H 48 Q,-,1,Helix,17.1,30.0,8.0,borate,10 mM,,,"EDTA (0.1 mM),",5 mM,CD,GdnHCl,8.9,-0.6,,,,,,,1.34,6.6,,,Unknown,,,undetectable (<0.001),,,,,,,catalytic triad; PLA2; conformational stabilit...,"J AM CHEM SOC 115, 8523-8526 (1993) PMID:",Li Y. & Tsai M.-D.,"additive : EDTA(0.1 mM),",124
3,4,Phospholipase A2,Bovine,130,14470.06,PSBOA,PA21_BOVIN (P00593),EC 3.4.23.4,A930651,1BP2,,H 48 A,-,1,Helix,17.1,30.0,8.0,borate,10 mM,,,"EDTA (0.1 mM),",5 mM,CD,GdnHCl,6.4,-3.1,,,,,,,1.02,6.3,,,Unknown,,,undetectable (<0.001),,,,,,,catalytic triad; PLA2; conformational stabilit...,"J AM CHEM SOC 115, 8523-8526 (1993) PMID:",Li Y. & Tsai M.-D.,"additive : EDTA(0.1 mM),",123
4,5,Ribonuclease HI,Escherichia coli,155,17597.0,NRECH,RNH_ECOLI (P0A7Y4),EC 3.1.4.8,A920875,2RN2,,wild,-,1,,,,3.0,glycine-HCl,10 mM,,,,,CD,Thermal,,,,,49.8,0.0,98.1,,,,,,yes,100.0,,,,,,,,,structural stability; mutagenesis; free energy...,"J BIOL CHEM 267, 22014-22017 (1992) PMID: 1...","Kimura S., Kanaya S. & Nakamura H.",,"6,7,8,9,10,11,12,13,14,15,16,2143,2144,2145,21..."


In [11]:
print(df|.shape)

SyntaxError: invalid syntax (<ipython-input-11-b031f41e0108>, line 1)

In [None]:
# Replace empty strings with NaNs and remove all-empty rows
df[df == ''] = np.nan
df = df.dropna(subset=['NO.'])
print(df.shape)

In [None]:
df = df[~((df['pdb_wild'] == '1OTR') & (df['mutation'].str.startswith('A 33')))]
print(df.shape)

In [None]:
display(df.head(2))

In [None]:
def parse_swissprot_id(swissprot_id):
    """
    """
    uniprot_name_conversion = {
        'MK10_HUMAN': 'BRCA1_HUMAN',  # BRCA1
    }
    uniprot_id_conversion = {
        'P69542': 'P69543',
        'P53779': 'P38398',  # BRCA1
    }
    #
    if pd.isnull(swissprot_id):
        return np.nan, np.nan
    row = list(filter(lambda x: x, map(lambda x: x.strip(' ()'), swissprot_id.split())))
    if len(row) == 0:
        uniprot_name, uniprot_id = None, None
    elif len(row) == 1:
        uniprot_name, uniprot_id = row[0], None
    elif len(row) == 2:
        uniprot_name, uniprot_id = row
    else:
        raise Exception('Unknown row: {}, swissprot_id: {}'.format(row, swissprot_id))
    uniprot_name = uniprot_name_conversion.get(uniprot_name, uniprot_name)
    uniprot_id = uniprot_id_conversion.get(uniprot_id, uniprot_id)
    return uniprot_name, uniprot_id

df['uniprot_name_protherm'], df['uniprot_id_protherm'] = zip(*df['swissprot_id'].apply(parse_swissprot_id))

In [None]:
df['protein_name'] = df['protein']

In [None]:
pdb_id_conversion = {
    'érf5v': '3f5v',
    '1bgl': '4v40',
}
df['pdb_id'] = df['PDB_wild'].apply(lambda x: pdb_id_conversion.get(x, x)).str.lower()

In [None]:
df['pdb_chain'] = df['mutated_Chain']

In [None]:
df['pdb_mutation'] = df['mutation'].str.upper()

In [None]:
import protherm
reload(protherm)
protherm.SIFTS_CACHE_DIR = '/tmp/strokach'
os.makedirs(protherm.SIFTS_CACHE_DIR, exist_ok=True)

df['uniprot_id'], df['uniprot_mutation'], df['pdb_mutation_sifts'] = zip(*
    df[['pdb_id', 'pdb_mutation', 'uniprot_id_protherm']].apply(protherm.get_uniprot_id_mutation_protherm, axis=1)
)

### Thermodynamic features

In [None]:
thermodynamic_parameters = [
    'dG_H2O', 'dG', 'Tm', 'ddG_H2O', 'ddG', 'dTm', 'dHvH'
]


import re
RANGE_RE = re.compile('(\d+)-(\d+)')


def process_params(value):
    # Skip bad values
    # value = ''.join(row[1:])
    if pd.isnull(value):
        return np.nan
    
    if any(v in value for v in ['<', '>', 'Unknown', 'n.d.', 'NO_MOLECULE', 'dimer']):
        print("Could not convert value '{}' to float because it contains a blacklisted character.".format(value))
        return np.nan
    
    # Clean value
    value = value.replace(',', '').replace('/K', '').rstrip('.').lower()
    
    # Convert to float
    conversion = {
        '': 1,
        'kcal/mol': 1,
        'kcal/mole': 1,
        'cal/mol': 0.001,
        'cal/mole': 0.001,
        'kal/mol': 0.001,
        'kal/mole': 0.001,
        'kj/mol': 0.239001,
        'kj/mole': 0.239001,
    }
    new_value = None
    for suffix, cf in conversion.items():
        try:
            new_value = cf * float(value.strip(suffix))
            break
        except ValueError:
            pass

    match = RANGE_RE.findall(value)
    if len(match) == 1:
        value_min, value_max = [float(x) for x in match[0]]
        if (value_max - value_min) < 4:
            new_value = (value_min + value_max) / 2
            print("Converted '{}' to '{}'...".format(value, new_value))

    # Report errors
    if new_value is None:
        print("Could not convert value '{}' to float! Skipping...".format(value))
        
    return new_value

for column in thermodynamic_parameters:
    print('\n' + column)
    df[column.lower() + '_clean'] = df[column].apply(process_params)

In [None]:
df['ddg_exp'] = df[['ddg_h2o_clean', 'ddg_clean']].apply(lambda x: x[0] if pd.notnull(x[0]) else x[1], axis=1)

In [None]:
column_names = [
    'errors', 'protherm_no', 'pdb_id', 'protein_name', 'uniprot_name', 'uniprot_id',
    'mutated_pdb_chain', 'mutation', 'mutation_uniprot'
] + thermodynamic_parameters

missing_columns = [c for c in column_names if c not in df.columns]
print(missing_columns)

# Statistics

In [None]:
display(df.head())
print(df.shape[0])

In [None]:
# Totally failed
df[['uniprot_id', 'uniprot_mutation']].isnull().any(axis=1).sum()

In [None]:
# Amino acid mismatch
df['uniprot_mutation'].apply(lambda x: '?' in x if pd.notnull(x) else np.nan).sum()

In [None]:
Counter(df['pdb_chain'])

In [None]:
df.head()

In [None]:
df[df['pdb_mutation'] != df['pdb_mutation_sifts']][['pdb_mutation', 'pdb_mutation_sifts']].head(10)

In [None]:
x, y = zip(*df[['ddg_h2o_clean', 'ddg_exp']].dropna().values)
plt.scatter(x, y)

In [None]:
x, y = zip(*df[['ddg_clean', 'ddg_exp']].dropna().values)
plt.scatter(x, y)

# Save

In [None]:
import csv2sql
db = csv2sql.DataFrameToMySQL(
    os.environ['BIODB_CONNECTION_STR'] + '/protein_folding_energy', 
    NOTEBOOK_NAME, 
    os.environ['STG_SERVER_IP'], 
    echo=False
)

In [None]:
df.head()

In [None]:
db.import_table(
    df, 
    'protherm', [
        [('pdb_id', 'pdb_chain', 'pdb_mutation'), True],
        [('uniprot_id', 'uniprot_mutation'), False],
    ],
)