In [293]:
import pandas as pd
import numpy as np
import sqlite3

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, matthews_corrcoef

from scipy.stats import pearsonr, spearmanr

In [239]:
# Open the SQLite database queried by every loading cell below
# (per-target value tables plus the `datasets` table they are joined with).
conn = sqlite3.connect(database='../data.db')

# CONTINUOUS

In [240]:
# Table queried for both the observed and the predicted values of this target.
target = 'mllib_features_continuous_ZCoordinates'
# Number of folds iterated over in every per-fold loop below.
max_folds = 5
# Number of value columns used per row (val_0 .. val_{target_len - 1}).
target_len = 1
target_cols = [f'val_{col}' for col in range(target_len)]

In [241]:
# Base-model experiments to stack for this target: every lstm*/cnn*/xgb*/rf*
# experiment stored in the `target` table, excluding the "*VALID*" / "*TEST*"
# variants, in alphabetical order.
# SQL string literals are single-quoted: double quotes denote identifiers in SQL and
# are only accepted as strings through a SQLite compatibility quirk. Every fragment
# also ends with a space so adjacent clauses are never glued together.
experiments_query = (
    f"select distinct experiment from {target} where "
    "(experiment like 'lstm%' or "
    "experiment like 'cnn%' or "
    "experiment like 'xgb%' or "
    "experiment like 'rf%') and "
    "experiment not like '%VALID%' and experiment not like '%TEST%' "
    "order by experiment"
)
experiments = [row[0] for row in conn.execute(experiments_query).fetchall()]

In [242]:
# Show which base-model experiments were found for this target.
print(experiments)

['cnn_domain_zcoords', 'lstm_domain_zcoords', 'rf_zcoords_cont', 'xgb_zcoords_cont']


In [243]:
# Observed rows per fold, one frame for the "valid" partition and one for the
# "test" partition. Rows are ordered by (id, resi), the same ordering used when the
# prediction frames are loaded, because the frames are later combined by position.
# Values are passed as bound parameters / single-quoted literals instead of
# double-quoted text, which SQL would normally read as an identifier.
observed_query = (
    f"select * from {target} join datasets using (id) "
    "where fold=? and kind=? and experiment='observed' order by id,resi"
)
obss_valid = []
obss_test = []
for fold in range(max_folds):
    obss_valid.append(pd.read_sql_query(observed_query, conn, params=(fold, 'valid')))
    obss_test.append(pd.read_sql_query(observed_query, conn, params=(fold, 'test')))

In [244]:
# Peek at the first rows of the fold-0 observed validation frame.
obss_valid[0].head()

Unnamed: 0,experiment,id,resi,val_0,name,fold,kind
0,observed,1nek_C,0,-25.0,depth,0,valid
1,observed,1nek_C,1,-25.0,depth,0,valid
2,observed,1nek_C,2,-25.0,depth,0,valid
3,observed,1nek_C,3,-25.0,depth,0,valid
4,observed,1nek_C,4,-25.0,depth,0,valid


In [245]:
# Base-model predictions, indexed as preds_*[fold][experiment position].
# Validation predictions are stored under the experiment name "<experiment>_VALID_<fold>";
# test predictions are the rows of the plain experiment whose `datasets` entry is in
# the fold's "test" partition. Both are ordered by (id, resi).
# Values are bound as parameters rather than spliced into double-quoted SQL text.
valid_pred_query = f"select * from {target} where experiment=? order by id,resi"
test_pred_query = (
    f"select * from {target} join datasets using (id) "
    "where experiment=? and kind='test' and fold=? order by id,resi"
)
preds_valid = []
preds_test = []
for fold in range(max_folds):
    preds_valid.append([])
    preds_test.append([])
    for experiment in experiments:
        preds_valid[-1].append(
            pd.read_sql_query(valid_pred_query, conn, params=(f'{experiment}_VALID_{fold}',)))
        preds_test[-1].append(
            pd.read_sql_query(test_pred_query, conn, params=(experiment, fold)))

In [246]:
# Per fold, place every experiment's value column next to the observed one and label
# the columns by their source. Frames are combined side by side (axis=1), so the
# rows are matched through the frames' row index, not through id/resi.
column_names = experiments + ['observed']
dfs_valid, dfs_test = [], []
for fold in range(max_folds):
    for frames, collected in (
        (preds_valid[fold] + [obss_valid[fold]], dfs_valid),
        (preds_test[fold] + [obss_test[fold]], dfs_test),
    ):
        stacked = pd.concat(frames, axis=1)[target_cols]
        stacked.columns = column_names
        collected.append(stacked)

In [247]:
# Peek at the fold-0 validation frame: one column per experiment plus 'observed'.
dfs_valid[0].head()

Unnamed: 0,cnn_domain_zcoords,lstm_domain_zcoords,rf_zcoords_cont,xgb_zcoords_cont,observed
0,-17.043632,-19.805127,-12.578974,-13.059041,-25.0
1,-22.117513,-20.962265,-12.664031,-18.813404,-25.0
2,-24.30962,-22.045821,-11.721556,-13.982627,-25.0
3,-22.558057,-22.598574,-11.748129,-10.240783,-25.0
4,-24.289432,-22.849461,-12.560366,-20.070454,-25.0


In [248]:
# One linear stacking model per fold, fitted on that fold's validation frame:
# the experiments' predictions are the features, the observed value is the target.
lrs = [
    LinearRegression().fit(fold_df[experiments], fold_df['observed'])
    for fold_df in (dfs_valid[fold] for fold in range(max_folds))
]

In [249]:
# Pool the observed validation values of all folds into one flat array.
observed_by_fold = [fold_df['observed'] for fold_df in dfs_valid]
obs_valid = np.concatenate(observed_by_fold)

In [250]:
# Quick look at the pooled observed validation values.
print(obs_valid)

[-25. -25. -25. ...  11.  14.  14.]


In [251]:
# Stacked predictions on the validation frames, pooled over folds. Each fold's model
# predicts the same frame it was fitted on, so these are in-sample predictions.
fold_predictions = []
for fold in range(max_folds):
    fold_predictions.append(lrs[fold].predict(dfs_valid[fold][experiments]))
pred_valid = np.concatenate(fold_predictions)

In [252]:
# Quick look at the pooled stacked predictions for the validation partition.
print(pred_valid)

[-19.65381556 -20.53113363 -21.43666584 ...  11.28181507  13.40923066
  13.50911748]


In [253]:
# Agreement between pooled observed and stacked values on the validation partition.
pr_valid = pearsonr(obs_valid, pred_valid)
sp_valid = spearmanr(obs_valid, pred_valid)
mae_valid = mean_absolute_error(obs_valid, pred_valid)
mse_valid = mean_squared_error(obs_valid, pred_valid)

# Printed order: Pearson r, Spearman rho, MAE, MSE (correlation p-values are dropped).
print([round(score, 3) for score in (pr_valid[0], sp_valid[0], mae_valid, mse_valid)])

[0.948, 0.942, 3.526, 34.354]


In [254]:
# Pool the observed test values of all folds into one flat array.
observed_by_fold = [fold_df['observed'] for fold_df in dfs_test]
obs_test = np.concatenate(observed_by_fold)

In [255]:
# Quick look at the pooled observed test values.
print(obs_test)

[-15.24400043 -12.18799973 -13.01900005 ...  25.          25.
  25.        ]


In [256]:
# Apply each fold's stacking model (fitted on validation data) to that fold's
# test frame and pool the predictions over folds.
fold_predictions = []
for fold in range(max_folds):
    fold_predictions.append(lrs[fold].predict(dfs_test[fold][experiments]))
pred_test = np.concatenate(fold_predictions)

In [257]:
# Quick look at the pooled stacked predictions for the test partition.
print(pred_test)

[-13.55311078 -13.10448993 -12.47658741 ...   7.14068941   5.09855448
   4.13684788]


In [258]:
# Agreement between pooled observed and stacked values on the test partition.
pr_test = pearsonr(obs_test, pred_test)
sp_test = spearmanr(obs_test, pred_test)
mae_test = mean_absolute_error(obs_test, pred_test)
mse_test = mean_squared_error(obs_test, pred_test)

# Printed order: Pearson r, Spearman rho, MAE, MSE (correlation p-values are dropped).
print([round(score, 3) for score in (pr_test[0], sp_test[0], mae_test, mse_test)])

[0.807, 0.804, 5.723, 114.677]


In [259]:
# Per-fold report of the weight the stacking model gives each experiment.
# The fold's single intercept is repeated on every line.
for fold in range(max_folds):
    print(f'Fold {fold}')
    weights = zip(experiments, lrs[fold].coef_)
    for experiment, coef in weights:
        print(f'experiment: {experiment}, coef: {coef:.2f}, intercept: {lrs[fold].intercept_:.2f}')

Fold 0
experiment: cnn_domain_zcoords, coef: -0.05, intercept: -1.23
experiment: lstm_domain_zcoords, coef: 0.96, intercept: -1.23
experiment: rf_zcoords_cont, coef: 0.02, intercept: -1.23
experiment: xgb_zcoords_cont, coef: 0.00, intercept: -1.23
Fold 1
experiment: cnn_domain_zcoords, coef: 0.01, intercept: 0.18
experiment: lstm_domain_zcoords, coef: 1.01, intercept: 0.18
experiment: rf_zcoords_cont, coef: 0.16, intercept: 0.18
experiment: xgb_zcoords_cont, coef: -0.06, intercept: 0.18
Fold 2
experiment: cnn_domain_zcoords, coef: 0.02, intercept: 1.20
experiment: lstm_domain_zcoords, coef: 0.98, intercept: 1.20
experiment: rf_zcoords_cont, coef: 0.06, intercept: 1.20
experiment: xgb_zcoords_cont, coef: -0.12, intercept: 1.20
Fold 3
experiment: cnn_domain_zcoords, coef: -0.02, intercept: -1.51
experiment: lstm_domain_zcoords, coef: 0.97, intercept: -1.51
experiment: rf_zcoords_cont, coef: -0.06, intercept: -1.51
experiment: xgb_zcoords_cont, coef: 0.06, intercept: -1.51
Fold 4
experime

# BINARY

In [260]:
# Table queried for both the observed and the predicted values of this target.
target = 'mllib_features_binary_Bfactors'
# Number of folds iterated over in every per-fold loop below.
max_folds = 5
# Number of value columns used per row (val_0 .. val_{target_len - 1}).
target_len = 1
target_cols = [f'val_{col}' for col in range(target_len)]

In [261]:
# Base-model experiments to stack for this target: every lstm*/cnn*/xgb*/rf*
# experiment stored in the `target` table, excluding the "*VALID*" / "*TEST*"
# variants, in alphabetical order.
# SQL string literals are single-quoted: double quotes denote identifiers in SQL and
# are only accepted as strings through a SQLite compatibility quirk. Every fragment
# also ends with a space so adjacent clauses are never glued together.
experiments_query = (
    f"select distinct experiment from {target} where "
    "(experiment like 'lstm%' or "
    "experiment like 'cnn%' or "
    "experiment like 'xgb%' or "
    "experiment like 'rf%') and "
    "experiment not like '%VALID%' and experiment not like '%TEST%' "
    "order by experiment"
)
experiments = [row[0] for row in conn.execute(experiments_query).fetchall()]

In [262]:
# Show which base-model experiments were found for this target.
print(experiments)

['cnn_domain_bfactors', 'lstm_domain_bfactors', 'rf_bfactors_bin', 'xgb_bfactors_bin']


In [263]:
# Observed rows per fold, one frame for the "valid" partition and one for the
# "test" partition. Rows are ordered by (id, resi), the same ordering used when the
# prediction frames are loaded, because the frames are later combined by position.
# Values are passed as bound parameters / single-quoted literals instead of
# double-quoted text, which SQL would normally read as an identifier.
observed_query = (
    f"select * from {target} join datasets using (id) "
    "where fold=? and kind=? and experiment='observed' order by id,resi"
)
obss_valid = []
obss_test = []
for fold in range(max_folds):
    obss_valid.append(pd.read_sql_query(observed_query, conn, params=(fold, 'valid')))
    obss_test.append(pd.read_sql_query(observed_query, conn, params=(fold, 'test')))

In [264]:
# Peek at the first rows of the fold-0 observed validation frame.
obss_valid[0].head()

Unnamed: 0,experiment,id,resi,val_0,name,fold,kind
0,observed,1nek_C,0,1.0,depth,0,valid
1,observed,1nek_C,1,1.0,depth,0,valid
2,observed,1nek_C,2,1.0,depth,0,valid
3,observed,1nek_C,3,1.0,depth,0,valid
4,observed,1nek_C,4,1.0,depth,0,valid


In [265]:
# Base-model predictions, indexed as preds_*[fold][experiment position].
# Validation predictions are stored under the experiment name "<experiment>_VALID_<fold>";
# test predictions are the rows of the plain experiment whose `datasets` entry is in
# the fold's "test" partition. Both are ordered by (id, resi).
# Values are bound as parameters rather than spliced into double-quoted SQL text.
valid_pred_query = f"select * from {target} where experiment=? order by id,resi"
test_pred_query = (
    f"select * from {target} join datasets using (id) "
    "where experiment=? and kind='test' and fold=? order by id,resi"
)
preds_valid = []
preds_test = []
for fold in range(max_folds):
    preds_valid.append([])
    preds_test.append([])
    for experiment in experiments:
        preds_valid[-1].append(
            pd.read_sql_query(valid_pred_query, conn, params=(f'{experiment}_VALID_{fold}',)))
        preds_test[-1].append(
            pd.read_sql_query(test_pred_query, conn, params=(experiment, fold)))

In [266]:
# Per fold, place every experiment's value column next to the observed one and label
# the columns by their source. Frames are combined side by side (axis=1), so the
# rows are matched through the frames' row index, not through id/resi.
column_names = experiments + ['observed']
dfs_valid, dfs_test = [], []
for fold in range(max_folds):
    for frames, collected in (
        (preds_valid[fold] + [obss_valid[fold]], dfs_valid),
        (preds_test[fold] + [obss_test[fold]], dfs_test),
    ):
        stacked = pd.concat(frames, axis=1)[target_cols]
        stacked.columns = column_names
        collected.append(stacked)

In [267]:
# Peek at the fold-0 validation frame: one column per experiment plus 'observed'.
dfs_valid[0].head()

Unnamed: 0,cnn_domain_bfactors,lstm_domain_bfactors,rf_bfactors_bin,xgb_bfactors_bin,observed
0,0.697402,0.797277,0.665,0.734198,1.0
1,0.706198,0.79489,0.578,0.713786,1.0
2,0.729441,0.835546,0.563,0.607813,1.0
3,0.690561,0.850896,0.517,0.563066,1.0
4,0.672404,0.839514,0.507,0.63869,1.0


In [270]:
# One logistic stacking model per fold, fitted on that fold's validation frame:
# the experiments' predicted values are the features, the observed label is the target.
lrs = [
    LogisticRegression().fit(fold_df[experiments], fold_df['observed'])
    for fold_df in (dfs_valid[fold] for fold in range(max_folds))
]

In [271]:
# Pool the observed validation labels of all folds into one flat array.
observed_by_fold = [fold_df['observed'] for fold_df in dfs_valid]
obs_valid = np.concatenate(observed_by_fold)

In [272]:
# Quick look at the pooled observed validation labels.
print(obs_valid)

[1. 1. 1. ... 1. 1. 1.]


In [284]:
# Stacked scores on the validation frames, pooled over folds. Each fold's model scores
# the same frame it was fitted on, so these are in-sample values.
fold_scores = []
for fold in range(max_folds):
    class_probabilities = lrs[fold].predict_proba(dfs_valid[fold][experiments])
    # Keep the second predict_proba column (presumably the probability of label 1.0).
    fold_scores.append(class_probabilities[:, 1])
pred_valid = np.concatenate(fold_scores)

In [285]:
# Quick look at the pooled stacked scores for the validation partition.
print(pred_valid)

[0.80424121 0.80350017 0.8175293  ... 0.75780174 0.79064733 0.80580833]


In [300]:
def _on_rounded(metric):
    """Wrap a label-based metric so it scores the predictions after np.round."""
    return lambda o, p: metric(o, np.round(p))


# (name, callable(observed, predicted)) pairs; only 'auc' receives the raw scores.
measures = [
    ('acc', _on_rounded(accuracy_score)),
    ('p', _on_rounded(precision_score)),
    ('r', _on_rounded(recall_score)),
    ('f1', _on_rounded(f1_score)),
    ('auc', roc_auc_score),
    ('mcc', _on_rounded(matthews_corrcoef)),
]

for name, measure in measures:
    print(name, round(measure(obs_valid, pred_valid), 3))

acc 0.684
p 0.635
r 0.486
f1 0.55
auc 0.735
mcc 0.321


In [301]:
# Pool the observed test labels of all folds into one flat array.
observed_by_fold = [fold_df['observed'] for fold_df in dfs_test]
obs_test = np.concatenate(observed_by_fold)

In [302]:
# Quick look at the pooled observed test labels.
print(obs_test)

[1. 1. 1. ... 0. 1. 1.]


In [303]:
# Apply each fold's stacking model (fitted on validation data) to that fold's
# test frame and pool the scores over folds.
fold_scores = []
for fold in range(max_folds):
    class_probabilities = lrs[fold].predict_proba(dfs_test[fold][experiments])
    # Keep the second predict_proba column (presumably the probability of label 1.0).
    fold_scores.append(class_probabilities[:, 1])
pred_test = np.concatenate(fold_scores)

In [304]:
# Quick look at the pooled stacked scores for the test partition.
print(pred_test)

[0.70620855 0.73938313 0.70434053 ... 0.59602524 0.76995504 0.77514956]


In [305]:
# (name, callable(observed, predicted)) pairs. Metrics flagged True are computed on the
# predictions after np.round; 'auc' is computed on the raw scores.
measures = []
for label, scorer, round_first in (
    ('acc', accuracy_score, True),
    ('p', precision_score, True),
    ('r', recall_score, True),
    ('f1', f1_score, True),
    ('auc', roc_auc_score, False),
    ('mcc', matthews_corrcoef, True),
):
    if round_first:
        # Bind `scorer` as a default so each lambda keeps its own metric.
        measures.append((label, lambda o, p, scorer=scorer: scorer(o, np.round(p))))
    else:
        measures.append((label, scorer))

for name, measure in measures:
    print(name, round(measure(obs_test, pred_test), 3))

acc 0.686
p 0.638
r 0.475
f1 0.544
auc 0.73
mcc 0.321


In [310]:
# Disabled per-fold coefficient report; it has the same shape as the report in the
# CONTINUOUS section.
# NOTE(review): presumably commented out because LogisticRegression exposes coef_ and
# intercept_ as arrays, which the scalar `:.2f` formatting would reject — confirm
# before re-enabling (the stray '#' inside the f-string would also need removing).
#for fold in range(max_folds):
#    print(f'Fold {fold}')
#    for experiment, coef in zip(experiments, lrs[fold].coef_):
#        print(f'experiment: {experiment}#, coef: {coef:.2f}, intercept: {lrs[fold].intercept_:.2f}')

# CATEGORICAL

In [329]:
# Table queried for both the observed and the predicted values of this target.
target = 'mllib_features_categorical_Topology'
# Number of folds iterated over in every per-fold loop below.
max_folds = 5
# Number of value columns used per row (val_0 .. val_{target_len - 1}).
target_len = 4
target_cols = [f'val_{col}' for col in range(target_len)]

In [315]:
# Base-model experiments to stack for this target: every lstm*/cnn*/xgb*/rf*
# experiment stored in the `target` table, excluding the "*VALID*" / "*TEST*"
# variants, in alphabetical order.
# SQL string literals are single-quoted: double quotes denote identifiers in SQL and
# are only accepted as strings through a SQLite compatibility quirk. Every fragment
# also ends with a space so adjacent clauses are never glued together.
experiments_query = (
    f"select distinct experiment from {target} where "
    "(experiment like 'lstm%' or "
    "experiment like 'cnn%' or "
    "experiment like 'xgb%' or "
    "experiment like 'rf%') and "
    "experiment not like '%VALID%' and experiment not like '%TEST%' "
    "order by experiment"
)
experiments = [row[0] for row in conn.execute(experiments_query).fetchall()]

In [316]:
# Show which base-model experiments were found for this target.
print(experiments)

['cnn_domain_zcoords', 'lstm_domain_zcoords', 'rf_topo', 'xgb_topo']


In [317]:
# Observed rows per fold, one frame for the "valid" partition and one for the
# "test" partition. Rows are ordered by (id, resi), the same ordering used when the
# prediction frames are loaded, because the frames are later combined by position.
# Values are passed as bound parameters / single-quoted literals instead of
# double-quoted text, which SQL would normally read as an identifier.
observed_query = (
    f"select * from {target} join datasets using (id) "
    "where fold=? and kind=? and experiment='observed' order by id,resi"
)
obss_valid = []
obss_test = []
for fold in range(max_folds):
    obss_valid.append(pd.read_sql_query(observed_query, conn, params=(fold, 'valid')))
    obss_test.append(pd.read_sql_query(observed_query, conn, params=(fold, 'test')))

In [318]:
# Peek at the first rows of the fold-0 observed validation frame
# (val_0 .. val_3 hold the observed class indicator).
obss_valid[0].head()

Unnamed: 0,experiment,id,resi,val_0,val_1,val_2,val_3,name,fold,kind
0,observed,1nek_C,0,1.0,0.0,0.0,0.0,depth,0,valid
1,observed,1nek_C,1,1.0,0.0,0.0,0.0,depth,0,valid
2,observed,1nek_C,2,1.0,0.0,0.0,0.0,depth,0,valid
3,observed,1nek_C,3,1.0,0.0,0.0,0.0,depth,0,valid
4,observed,1nek_C,4,1.0,0.0,0.0,0.0,depth,0,valid


In [322]:
# Base-model predictions, indexed as preds_*[fold][experiment position].
# Validation predictions are stored under the experiment name "<experiment>_VALID_<fold>";
# test predictions are the rows of the plain experiment whose `datasets` entry is in
# the fold's "test" partition. Both are ordered by (id, resi).
# Values are bound as parameters rather than spliced into double-quoted SQL text.
valid_pred_query = f"select * from {target} where experiment=? order by id,resi"
test_pred_query = (
    f"select * from {target} join datasets using (id) "
    "where experiment=? and kind='test' and fold=? order by id,resi"
)
preds_valid = []
preds_test = []
for fold in range(max_folds):
    preds_valid.append([])
    preds_test.append([])
    for experiment in experiments:
        preds_valid[-1].append(
            pd.read_sql_query(valid_pred_query, conn, params=(f'{experiment}_VALID_{fold}',)))
        preds_test[-1].append(
            pd.read_sql_query(test_pred_query, conn, params=(experiment, fold)))

In [334]:
# Per fold, put the value columns of every experiment's predictions and of the
# observations side by side (axis=1) and keep only the val_* columns.
# The columns are deliberately left un-renamed: each source contributes target_len
# columns, so one name per source (experiments + ['observed']) would not match the
# column count. The resulting labels are therefore duplicated val_i names.
dfs_valid, dfs_test = [], []
for fold in range(max_folds):
    valid_frames = preds_valid[fold] + [obss_valid[fold]]
    test_frames = preds_test[fold] + [obss_test[fold]]
    dfs_valid.append(pd.concat(valid_frames, axis=1)[target_cols])
    dfs_test.append(pd.concat(test_frames, axis=1)[target_cols])

In [335]:
# Peek at the fold-0 validation frame. Columns repeat each val_i label once per source
# (the experiments' predictions followed by the observed value).
dfs_valid[0].head()

Unnamed: 0,val_0,val_0.1,val_0.2,val_0.3,val_0.4,val_1,val_1.1,val_1.2,val_1.3,val_1.4,val_2,val_2.1,val_2.2,val_2.3,val_2.4,val_3,val_3.1,val_3.2,val_3.3,val_3.4
0,0.906432,0.901406,0.738,0.954485,1.0,0.009265,0.016593,0.04,0.000868,0.0,0.080684,0.061896,0.218,0.043141,0.0,0.003619,0.020104,0.004,0.001506,0.0
1,0.944222,0.918366,0.727,0.95229,1.0,0.00431,0.007558,0.036,0.000667,0.0,0.049833,0.062944,0.227,0.046025,0.0,0.001635,0.011133,0.01,0.001018,0.0
2,0.946508,0.92871,0.731,0.96264,1.0,0.001961,0.003742,0.053,0.001082,0.0,0.050773,0.061128,0.202,0.035319,0.0,0.000757,0.006419,0.014,0.000959,0.0
3,0.961855,0.926088,0.772,0.970711,1.0,0.000691,0.002452,0.039,0.000588,0.0,0.03707,0.066654,0.178,0.028244,0.0,0.000384,0.004806,0.011,0.000457,0.0
4,0.974665,0.922259,0.749,0.968932,1.0,0.000328,0.001816,0.046,0.000811,0.0,0.024852,0.071929,0.189,0.029429,0.0,0.000155,0.003996,0.016,0.000828,0.0


In [270]:
# One logistic stacking model per fold for the categorical target.
# The frames in dfs_valid carry no 'observed' or per-experiment labels in this section:
# their columns are the duplicated val_i names, laid out per class as
# [experiment 0, ..., experiment k-1, observed]. Features and labels are therefore
# taken by column position rather than by name.
n_experiments = len(experiments)
sources_per_class = n_experiments + 1  # every experiment plus the observed column
lrs = []
for fold in range(max_folds):
    stacked = dfs_valid[fold].values
    if stacked.shape[1] != target_len * sources_per_class:
        raise ValueError(
            f'fold {fold}: expected {target_len * sources_per_class} columns, got {stacked.shape[1]}')
    # The last column of each per-class group is the observed indicator.
    is_observed = (np.arange(stacked.shape[1]) % sources_per_class) == n_experiments
    features = stacked[:, ~is_observed]
    # Collapse the observed indicator columns into a single class index per row.
    labels = stacked[:, is_observed].argmax(axis=1)
    lrs.append(LogisticRegression().fit(features, labels))

In [271]:
# NOTE(review): the frames in dfs_valid are built in this section without relabelling
# their columns (they keep the val_0 .. val_3 names), so there is no 'observed' column
# to select. This cell appears to be copied from the BINARY section — adapt it to the
# 4-class target before relying on it.
obs_valid = np.concatenate([df['observed'] for df in dfs_valid])

In [272]:
# NOTE(review): the saved output of this cell is identical to the BINARY section's,
# so it is probably stale rather than produced from the categorical target.
print(obs_valid)

[1. 1. 1. ... 1. 1. 1.]


In [284]:
# NOTE(review): this selects columns named after `experiments`, which the frames in
# dfs_valid do not have in this section (columns are val_0 .. val_3). It also keeps
# only the second predict_proba column, while the target has target_len = 4 value
# columns. Appears copied from the BINARY section — needs a multi-class version.
pred_valid = np.concatenate([lrs[fold].predict_proba(dfs_valid[fold][experiments])[:, 1] for fold in range(max_folds)])

In [285]:
# NOTE(review): the saved output of this cell is identical to the BINARY section's,
# so it is probably stale rather than produced from the categorical target.
print(pred_valid)

[0.80424121 0.80350017 0.8175293  ... 0.75780174 0.79064733 0.80580833]


In [300]:
# (name, callable(observed, predicted)) pairs; every metric except 'auc' is computed on
# np.round(predicted).
# NOTE(review): identical to the BINARY section's metric table, and the saved output
# matches the BINARY numbers. Rounding a single probability column suits a two-class
# target; confirm what should be reported for the 4-column categorical target.
measures = [('acc', lambda o, p: accuracy_score(o, np.round(p))), 
            ('p', lambda o, p: precision_score(o, np.round(p))), 
            ('r', lambda o, p: recall_score(o, np.round(p))), 
            ('f1', lambda o, p: f1_score(o, np.round(p))), 
            ('auc', roc_auc_score), 
            ('mcc', lambda o, p: matthews_corrcoef(o, np.round(p)))
           ]

for name, measure in measures:
    print(name, round(measure(obs_valid, pred_valid), 3))

acc 0.684
p 0.635
r 0.486
f1 0.55
auc 0.735
mcc 0.321


In [301]:
# NOTE(review): the frames in dfs_test are built in this section without relabelling
# their columns (they keep the val_0 .. val_3 names), so there is no 'observed' column
# to select. This cell appears to be copied from the BINARY section — adapt it to the
# 4-class target before relying on it.
obs_test = np.concatenate([df['observed'] for df in dfs_test])

In [302]:
# NOTE(review): the saved output of this cell is identical to the BINARY section's,
# so it is probably stale rather than produced from the categorical target.
print(obs_test)

[1. 1. 1. ... 0. 1. 1.]


In [303]:
# NOTE(review): this selects columns named after `experiments`, which the frames in
# dfs_test do not have in this section (columns are val_0 .. val_3). It also keeps
# only the second predict_proba column, while the target has target_len = 4 value
# columns. Appears copied from the BINARY section — needs a multi-class version.
pred_test = np.concatenate([lrs[fold].predict_proba(dfs_test[fold][experiments])[:, 1] for fold in range(max_folds)])

In [304]:
# NOTE(review): the saved output of this cell is identical to the BINARY section's,
# so it is probably stale rather than produced from the categorical target.
print(pred_test)

[0.70620855 0.73938313 0.70434053 ... 0.59602524 0.76995504 0.77514956]


In [305]:
# (name, callable(observed, predicted)) pairs; every metric except 'auc' is computed on
# np.round(predicted).
# NOTE(review): identical to the BINARY section's metric table, and the saved output
# matches the BINARY numbers. Rounding a single probability column suits a two-class
# target; confirm what should be reported for the 4-column categorical target.
measures = [('acc', lambda o, p: accuracy_score(o, np.round(p))), 
            ('p', lambda o, p: precision_score(o, np.round(p))), 
            ('r', lambda o, p: recall_score(o, np.round(p))), 
            ('f1', lambda o, p: f1_score(o, np.round(p))), 
            ('auc', roc_auc_score), 
            ('mcc', lambda o, p: matthews_corrcoef(o, np.round(p)))
           ]

for name, measure in measures:
    print(name, round(measure(obs_test, pred_test), 3))

acc 0.686
p 0.638
r 0.475
f1 0.544
auc 0.73
mcc 0.321


In [310]:
# Disabled per-fold coefficient report; it has the same shape as the report in the
# CONTINUOUS section.
# NOTE(review): presumably commented out because LogisticRegression exposes coef_ and
# intercept_ as arrays, which the scalar `:.2f` formatting would reject — confirm
# before re-enabling (the stray '#' inside the f-string would also need removing).
#for fold in range(max_folds):
#    print(f'Fold {fold}')
#    for experiment, coef in zip(experiments, lrs[fold].coef_):
#        print(f'experiment: {experiment}#, coef: {coef:.2f}, intercept: {lrs[fold].intercept_:.2f}')