In [500]:
import pandas as pd
import numpy as np
import sqlite3

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

from scipy.stats import pearsonr, spearmanr

In [501]:
conn = sqlite3.connect('../data.db')

# CONTINUOUS

In [502]:
target = 'mllib_features_continuous_ZCoordinates'
target_len = 1
target_cols = [f'val_{i}' for i in range(target_len)]
max_folds = 5

In [503]:
experiments = [x[0] for x in conn.cursor().execute(f'select distinct experiment from {target} where '
                          '(experiment glob "lstm_*" or '
                          'experiment glob "cnn_*" or '
                          'experiment glob "xgb_*" or '
                          'experiment glob "rf_*") and '
                          'experiment not like "%VALID%" and experiment not like "%TEST%"'
                          'order by experiment'
                         ).fetchall()]

In [504]:
print(experiments)

['cnn_domain_zcoords', 'lstm_domain_zcoords', 'rf_zcoords_cont', 'xgb_zcoords_cont']


In [505]:
obss_valid = []
obss_test = []
for fold in range(max_folds):
    obss_valid.append(pd.read_sql_query(f'select * from {target} join datasets using (id) where fold={fold} and kind="valid" and experiment="observed" order by id,resi', conn))
    obss_test.append(pd.read_sql_query(f'select * from {target} join datasets using (id) where fold={fold} and kind="test" and experiment="observed" order by id,resi', conn))

In [506]:
obss_valid[0].head()

Unnamed: 0,experiment,id,resi,val_0,name,fold,kind
0,observed,1nek_C,0,-25.0,depth,0,valid
1,observed,1nek_C,1,-25.0,depth,0,valid
2,observed,1nek_C,2,-25.0,depth,0,valid
3,observed,1nek_C,3,-25.0,depth,0,valid
4,observed,1nek_C,4,-25.0,depth,0,valid


In [507]:
preds_valid = []
preds_test = []
for fold in range(max_folds):
    preds_valid.append([])
    preds_test.append([])
    for experiment in experiments:
        preds_valid[-1].append(pd.read_sql_query(f'select * from {target} where experiment="{experiment}_VALID_{fold}" order by id,resi', conn))
        preds_test[-1].append(pd.read_sql_query(f'select * from {target} join datasets using (id) where experiment="{experiment}" and kind="test" and fold={fold} order by id,resi', conn))

In [508]:
dfs_valid = []
dfs_test = []
for fold in range(max_folds):
    dfs_valid.append(pd.concat(preds_valid[fold] + [obss_valid[fold]], axis=1)[target_cols])
    dfs_valid[-1].columns = experiments + ['observed']
    dfs_test.append(pd.concat(preds_test[fold] + [obss_test[fold]], axis=1)[target_cols])
    dfs_test[-1].columns = experiments + ['observed']

In [509]:
dfs_valid[0].head()

Unnamed: 0,cnn_domain_zcoords,lstm_domain_zcoords,rf_zcoords_cont,xgb_zcoords_cont,observed
0,-20.143442,-19.319791,-12.578974,-13.059041,-25.0
1,-21.910083,-21.595019,-12.664031,-18.813404,-25.0
2,-23.870161,-24.109092,-11.721556,-13.982627,-25.0
3,-24.186015,-25.0,-11.748129,-10.240783,-25.0
4,-21.088664,-25.0,-12.560366,-20.070454,-25.0


In [510]:
lrs = []
for fold in range(max_folds):
    lrs.append(LinearRegression().fit(dfs_valid[fold][experiments], dfs_valid[fold]['observed']))

In [511]:
obs_valid = np.concatenate([df['observed'] for df in dfs_valid])

In [512]:
print(obs_valid)

[-25. -25. -25. ...  25.  25.  25.]


In [513]:
pred_valid = np.concatenate([lrs[fold].predict(dfs_valid[fold][experiments]) for fold in range(max_folds)])

In [514]:
print(pred_valid)

[-22.41299257 -24.83797613 -27.28364207 ...  12.38142982  14.48248321
  14.57924854]


In [515]:
mse_valid = mean_squared_error(obs_valid, pred_valid)
mae_valid = mean_absolute_error(obs_valid, pred_valid)
pr_valid = pearsonr(obs_valid, pred_valid)
sp_valid = spearmanr(obs_valid, pred_valid)

print(list(map(lambda x: round(x, 3), [pr_valid[0], sp_valid[0], mae_valid, mse_valid])))

[0.891, 0.857, 6.768, 101.449]


In [516]:
obs_test = np.concatenate([df['observed'] for df in dfs_test])

In [517]:
print(obs_test)

[-25. -25. -25. ...  25.  25.  25.]


In [518]:
pred_test = np.concatenate([lrs[fold].predict(dfs_test[fold][experiments]) for fold in range(max_folds)])

In [519]:
print(pred_test)

[-4.31559664 -3.43432918 -2.59324601 ...  4.78688396  2.66807502
  1.33622439]


In [520]:
mse_test = mean_squared_error(obs_test, pred_test)
mae_test = mean_absolute_error(obs_test, pred_test)
pr_test = pearsonr(obs_test, pred_test)
sp_test = spearmanr(obs_test, pred_test)

print(list(map(lambda x: round(x, 3), [pr_test[0], sp_test[0], mae_test, mse_test])))

[0.792, 0.766, 9.589, 198.082]


In [521]:
for fold in range(max_folds):
    print(f'Fold {fold}')
    for experiment, coef in zip(experiments, lrs[fold].coef_):
        print(f'experiment: {experiment}, coef: {coef:.2f}, intercept: {lrs[fold].intercept_:.2f}')

Fold 0
experiment: cnn_domain_zcoords, coef: 0.04, intercept: 0.88
experiment: lstm_domain_zcoords, coef: 1.03, intercept: 0.88
experiment: rf_zcoords_cont, coef: 0.22, intercept: 0.88
experiment: xgb_zcoords_cont, coef: 0.00, intercept: 0.88
Fold 1
experiment: cnn_domain_zcoords, coef: 0.12, intercept: 1.05
experiment: lstm_domain_zcoords, coef: 1.04, intercept: 1.05
experiment: rf_zcoords_cont, coef: 0.24, intercept: 1.05
experiment: xgb_zcoords_cont, coef: 0.07, intercept: 1.05
Fold 2
experiment: cnn_domain_zcoords, coef: 0.02, intercept: 1.17
experiment: lstm_domain_zcoords, coef: 0.98, intercept: 1.17
experiment: rf_zcoords_cont, coef: 0.07, intercept: 1.17
experiment: xgb_zcoords_cont, coef: -0.12, intercept: 1.17
Fold 3
experiment: cnn_domain_zcoords, coef: 0.08, intercept: -0.04
experiment: lstm_domain_zcoords, coef: 1.04, intercept: -0.04
experiment: rf_zcoords_cont, coef: 0.08, intercept: -0.04
experiment: xgb_zcoords_cont, coef: 0.12, intercept: -0.04
Fold 4
experiment: cnn_

# BINARY

In [260]:
target = 'mllib_features_binary_Bfactors'
target_len = 1
target_cols = [f'val_{i}' for i in range(target_len)]
max_folds = 5

In [261]:
experiments = [x[0] for x in conn.cursor().execute(f'select distinct experiment from {target} where '
                          '(experiment like "lstm%" or '
                          'experiment like "cnn%" or '
                          'experiment like "xgb%" or '
                          'experiment like "rf%") and '
                          'experiment not like "%VALID%" and experiment not like "%TEST%"'
                          'order by experiment'
                         ).fetchall()]

In [262]:
print(experiments)

['cnn_domain_bfactors', 'lstm_domain_bfactors', 'rf_bfactors_bin', 'xgb_bfactors_bin']


In [263]:
obss_valid = []
obss_test = []
for fold in range(max_folds):
    obss_valid.append(pd.read_sql_query(f'select * from {target} join datasets using (id) where fold={fold} and kind="valid" and experiment="observed" order by id,resi', conn))
    obss_test.append(pd.read_sql_query(f'select * from {target} join datasets using (id) where fold={fold} and kind="test" and experiment="observed" order by id,resi', conn))

In [264]:
obss_valid[0].head()

Unnamed: 0,experiment,id,resi,val_0,name,fold,kind
0,observed,1nek_C,0,1.0,depth,0,valid
1,observed,1nek_C,1,1.0,depth,0,valid
2,observed,1nek_C,2,1.0,depth,0,valid
3,observed,1nek_C,3,1.0,depth,0,valid
4,observed,1nek_C,4,1.0,depth,0,valid


In [265]:
preds_valid = []
preds_test = []
for fold in range(max_folds):
    preds_valid.append([])
    preds_test.append([])
    for experiment in experiments:
        preds_valid[-1].append(pd.read_sql_query(f'select * from {target} where experiment="{experiment}_VALID_{fold}" order by id,resi', conn))
        preds_test[-1].append(pd.read_sql_query(f'select * from {target} join datasets using (id) where experiment="{experiment}" and kind="test" and fold={fold} order by id,resi', conn))

In [266]:
dfs_valid = []
dfs_test = []
for fold in range(max_folds):
    dfs_valid.append(pd.concat(preds_valid[fold] + [obss_valid[fold]], axis=1)[target_cols])
    dfs_valid[-1].columns = experiments + ['observed']
    dfs_test.append(pd.concat(preds_test[fold] + [obss_test[fold]], axis=1)[target_cols])
    dfs_test[-1].columns = experiments + ['observed']

In [267]:
dfs_valid[0].head()

Unnamed: 0,cnn_domain_bfactors,lstm_domain_bfactors,rf_bfactors_bin,xgb_bfactors_bin,observed
0,0.697402,0.797277,0.665,0.734198,1.0
1,0.706198,0.79489,0.578,0.713786,1.0
2,0.729441,0.835546,0.563,0.607813,1.0
3,0.690561,0.850896,0.517,0.563066,1.0
4,0.672404,0.839514,0.507,0.63869,1.0


In [270]:
lrs = []
for fold in range(max_folds):
    lrs.append(LogisticRegression().fit(dfs_valid[fold][experiments], dfs_valid[fold]['observed']))

In [271]:
obs_valid = np.concatenate([df['observed'] for df in dfs_valid])

In [272]:
print(obs_valid)

[1. 1. 1. ... 1. 1. 1.]


In [284]:
pred_valid = np.concatenate([lrs[fold].predict_proba(dfs_valid[fold][experiments])[:, 1] for fold in range(max_folds)])

In [285]:
print(pred_valid)

[0.80424121 0.80350017 0.8175293  ... 0.75780174 0.79064733 0.80580833]


In [300]:
measures = [('acc', lambda o, p: accuracy_score(o, np.round(p))), 
            ('p', lambda o, p: precision_score(o, np.round(p))), 
            ('r', lambda o, p: recall_score(o, np.round(p))), 
            ('f1', lambda o, p: f1_score(o, np.round(p))), 
            ('auc', roc_auc_score), 
            ('mcc', lambda o, p: matthews_corrcoef(o, np.round(p)))
           ]

for name, measure in measures:
    print(name, round(measure(obs_valid, pred_valid), 3))

acc 0.684
p 0.635
r 0.486
f1 0.55
auc 0.735
mcc 0.321


In [301]:
obs_test = np.concatenate([df['observed'] for df in dfs_test])

In [302]:
print(obs_test)

[1. 1. 1. ... 0. 1. 1.]


In [303]:
pred_test = np.concatenate([lrs[fold].predict_proba(dfs_test[fold][experiments])[:, 1] for fold in range(max_folds)])

In [304]:
print(pred_test)

[0.70620855 0.73938313 0.70434053 ... 0.59602524 0.76995504 0.77514956]


In [305]:
measures = [('acc', lambda o, p: accuracy_score(o, np.round(p))), 
            ('p', lambda o, p: precision_score(o, np.round(p))), 
            ('r', lambda o, p: recall_score(o, np.round(p))), 
            ('f1', lambda o, p: f1_score(o, np.round(p))), 
            ('auc', roc_auc_score), 
            ('mcc', lambda o, p: matthews_corrcoef(o, np.round(p)))
           ]

for name, measure in measures:
    print(name, round(measure(obs_test, pred_test), 3))

acc 0.686
p 0.638
r 0.475
f1 0.544
auc 0.73
mcc 0.321


In [310]:
#for fold in range(max_folds):
#    print(f'Fold {fold}')
#    for experiment, coef in zip(experiments, lrs[fold].coef_):
#        print(f'experiment: {experiment}#, coef: {coef:.2f}, intercept: {lrs[fold].intercept_:.2f}')

# CATEGORICAL

In [375]:
target = 'mllib_features_categorical_SecStruc'
target_len = 3
target_cols = [f'val_{i}' for i in range(target_len)]
max_folds = 5

In [376]:
experiments = [x[0] for x in conn.cursor().execute(f'select distinct experiment from {target} where '
                          '(experiment like "lstm%" or '
                          'experiment like "cnn%" or '
                          'experiment like "xgb%" or '
                          'experiment like "rf%") and '
                          'experiment not like "%VALID%" and experiment not like "%TEST%"'
                          'order by experiment'
                         ).fetchall()]

In [377]:
print(experiments)

['cnn_sec', 'lstm_sec', 'rf_sec', 'xgb_sec']


In [378]:
feature_cols = []
for experiment in experiments:
    for target_col in target_cols:
        feature_cols.append(f'{experiment}_{target_col}')
        
feature_cols

['cnn_sec_val_0',
 'cnn_sec_val_1',
 'cnn_sec_val_2',
 'lstm_sec_val_0',
 'lstm_sec_val_1',
 'lstm_sec_val_2',
 'rf_sec_val_0',
 'rf_sec_val_1',
 'rf_sec_val_2',
 'xgb_sec_val_0',
 'xgb_sec_val_1',
 'xgb_sec_val_2']

In [379]:
obss_valid = []
obss_test = []
for fold in range(max_folds):
    obss_valid.append(pd.read_sql_query(f'select id, {",".join(target_cols)} from {target} join datasets using (id) where fold={fold} and kind="valid" and experiment="observed" order by id,resi', conn))
    obss_test.append(pd.read_sql_query(f'select id, {",".join(target_cols)} from {target} join datasets using (id) where fold={fold} and kind="test" and experiment="observed" order by id,resi', conn))

In [380]:
obss_valid[0].head()

Unnamed: 0,id,val_0,val_1,val_2
0,1nek_C,0.0,0.0,1.0
1,1nek_C,0.0,0.0,1.0
2,1nek_C,0.0,0.0,1.0
3,1nek_C,0.0,0.0,1.0
4,1nek_C,0.0,0.0,1.0


In [381]:
preds_valid = []
preds_test = []
for fold in range(max_folds):
    preds_valid.append([])
    preds_test.append([])
    for experiment in experiments:
        val = pd.read_sql_query(f'select {",".join(target_cols)} from {target} where experiment="{experiment}_VALID_{fold}" order by id,resi', conn)
        val.columns = [f'{experiment}_{col}' for col in val.columns]
        preds_valid[-1].append(val)
        tst = pd.read_sql_query(f'select {",".join(target_cols)} from {target} join datasets using (id) where experiment="{experiment}" and kind="test" and fold={fold} order by id,resi', conn)
        tst.columns = [f'{experiment}_{col}' for col in tst.columns]
        preds_test[-1].append(tst)

In [382]:
dfs_valid = []
dfs_test = []
for fold in range(max_folds):
    dfs_valid.append(pd.concat([obss_valid[fold]] + preds_valid[fold], axis=1))
    #dfs_valid[-1].columns = experiments + ['observed']
    dfs_test.append(pd.concat([obss_test[fold]] + preds_test[fold], axis=1))
    #dfs_test[-1].columns = experiments + ['observed']

In [383]:
dfs_valid[0].head()

Unnamed: 0,id,val_0,val_1,val_2,cnn_sec_val_0,cnn_sec_val_1,cnn_sec_val_2,lstm_sec_val_0,lstm_sec_val_1,lstm_sec_val_2,rf_sec_val_0,rf_sec_val_1,rf_sec_val_2,xgb_sec_val_0,xgb_sec_val_1,xgb_sec_val_2
0,1nek_C,0.0,0.0,1.0,0.200287,0.024883,0.77483,0.762536,0.006304,0.23116,0.441,0.141,0.418,0.520793,0.075567,0.40364
1,1nek_C,0.0,0.0,1.0,0.375613,0.020773,0.603614,0.819807,0.003242,0.176951,0.473,0.079,0.448,0.579424,0.080823,0.339753
2,1nek_C,0.0,0.0,1.0,0.759209,0.007068,0.233723,0.690405,0.005289,0.304305,0.445,0.09,0.465,0.366286,0.02246,0.611254
3,1nek_C,0.0,0.0,1.0,0.764356,0.011984,0.22366,0.47407,0.004591,0.521339,0.492,0.068,0.44,0.37742,0.04191,0.58067
4,1nek_C,0.0,0.0,1.0,0.774572,0.018595,0.206832,0.403448,0.004784,0.591768,0.482,0.067,0.451,0.335889,0.034939,0.629173


In [463]:
lrs = []
for fold in range(max_folds):
    # solver= str, {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default: ‘liblinear’.
    #multi_class = 'ovr' 'multinomial'
    lrs.append(LogisticRegression().fit(dfs_valid[fold][feature_cols], np.argmax(dfs_valid[fold][target_cols].values, axis=1)))
    print(lrs[-1].n_iter_)

[8]
[7]
[7]
[7]
[7]


In [464]:
obs_valid = np.concatenate([df[target_cols] for df in dfs_valid])

In [465]:
print(obs_valid)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [466]:
pred_valid = np.concatenate([lrs[fold].predict_proba(dfs_valid[fold][feature_cols]) for fold in range(max_folds)])

In [467]:
print(pred_valid)

[[0.34105271 0.01041344 0.64853385]
 [0.53801763 0.00772666 0.45425571]
 [0.71604401 0.00970109 0.2742549 ]
 ...
 [0.93587763 0.01217407 0.05194829]
 [0.89262182 0.01671146 0.09066671]
 [0.70572231 0.03113135 0.26314634]]


In [468]:
for i in range(target_len):
    print('auc', round(roc_auc_score(obs_valid[:, i], pred_valid[:, i]), 3))

auc 0.951
auc 0.955
auc 0.919


In [469]:
obs_test = np.concatenate([df[target_cols] for df in dfs_test])

In [470]:
print(obs_test)

[[0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 0. 1.]]


In [471]:
pred_test = np.concatenate([lrs[fold].predict_proba(dfs_test[fold][feature_cols]) for fold in range(max_folds)])

In [472]:
print(pred_test)

[[0.882346   0.00437122 0.11328278]
 [0.95678715 0.00346388 0.03974897]
 [0.96006899 0.00308199 0.03684902]
 ...
 [0.85823407 0.01851061 0.12325532]
 [0.54157473 0.0319469  0.42647837]
 [0.29400877 0.07513022 0.63086102]]


In [473]:
for i in range(target_len):
    print('auc', round(roc_auc_score(obs_test[:, i], pred_test[:, i]), 3))

auc 0.939
auc 0.947
auc 0.909


In [474]:
#for fold in range(max_folds):
#    print(f'Fold {fold}')
#    for experiment, coef in zip(experiments, lrs[fold].coef_):
#        print(f'experiment: {experiment}#, coef: {coef:.2f}, intercept: {lrs[fold].intercept_:.2f}')

In [475]:
auc 0.939
auc 0.947
auc 0.909

SyntaxError: invalid syntax (<ipython-input-475-585db11f694e>, line 1)

In [499]:
dfs_test[0].id.value_counts()['2z5x_A']

513