In [None]:
from src.train.split_data import get_data
from src.train.dataset import TANDEM_Dataset, get_test_data
import torch
from src.train.train import train_model
from torch.utils.tensorboard import SummaryWriter
import torch
from prody import LOGGER
import os
import numpy as np

config = {
    'model': {
        'input_shape': 33,
        'n_hidden': 5,
        # 'hidden_shape': 33,
        'output_shape': 2,
        'dropout': 0.2,
    },
    'train':{
        'n_epochs': 300,
        'patience': 50,
        'batch_size': 256,
        'lr': 1e-5,
        'l1': 0,
        'l2': 1e-4,
        
    },
    'experiment_name': 'DNN',
    'seed': 0,
    'model_folder': 'logs/models',
    'model_name': 'DNN',
}

clstr_path = './data/old/c30_clstr_May13_full_rhd.csv'
feat_path = 'data/R20000/fReplaceDYN_25Mar19.csv'

GJB2_path = 'data/GJB2/fReplaceDYNandPfam_25Mar18.csv'
RYR1_path = 'data/RYR1/fReplaceDYNandPfam_25Mar19.csv'
sel_feats=['consurf', 'wt_PSIC', 'Delta_PSIC', 'entropy', 'ACNR', 'sasa', 'BLOSUM', 'stiffness-chain', 'loop_percent', 'atomic_1', 'vector_2', 'co_rank', 'atomic_3', 'atomic_5', 'Dcom', 'vector_1', 'rank_2', 'eig_first', 'ranked_MI', 'delta_h_bond_group', 'phobic_percent', 'eig_sec', 'sheet_percent', 'gyradius', 'delta_polarity', 'side_chain_length', 'helix_percent', 'delta_side_chain_length', 'ANM_effectiveness-chain', 'rank_1', 'rmsf_overall', 'delta_charge', 'delta_phobic_percent']
config['model']['input_shape'] = len(sel_feats)
config['experiment_name'] = f'{len(sel_feats)}_features'
os.makedirs(f'logs/{config["experiment_name"]}', exist_ok=True)
LOGGER.start(f'logs/{config["experiment_name"]}/train.log')
LOGGER.info(f'Features: {sel_feats}')

folds, preprocess_feat = get_data(feat_path, clstr_path, sel_feats=sel_feats,_plot='ratio_description.png', folds='folds.pkl')
gjb2_data = get_test_data(GJB2_path, sel_feats, preprocess_feat, name='GJB2')
ryr1_data = get_test_data(RYR1_path, sel_feats, preprocess_feat, name='RYR1')
gjb2_VUS_ds = TANDEM_Dataset(gjb2_data['VUS'])
gjb2_notVUS_ds = TANDEM_Dataset(gjb2_data['notVUS'])
ryr1_VUS_ds = TANDEM_Dataset(ryr1_data['VUS'])
ryr1_notVUS_ds = TANDEM_Dataset(ryr1_data['notVUS'])

gjb2_VUS_loader = torch.utils.data.DataLoader(gjb2_VUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
gjb2_notVUS_loader = torch.utils.data.DataLoader(gjb2_notVUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
ryr1_VUS_loader = torch.utils.data.DataLoader(ryr1_VUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
ryr1_notVUS_loader = torch.utils.data.DataLoader(ryr1_notVUS_ds, batch_size=config['train']['batch_size'], shuffle=False)

for i in range(5):
    writer = SummaryWriter(log_dir=f'logs/{config["experiment_name"]}/fold_{i}')
    train_ds = TANDEM_Dataset(folds[i]['train'])
    val_ds = TANDEM_Dataset(folds[i]['val'])
    test_ds = TANDEM_Dataset(folds[i]['test'])
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=config['train']['batch_size'], shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_ds, batch_size=config['train']['batch_size'], shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=config['train']['batch_size'], shuffle=False)
    # Estimate the initial bias
    pos = np.sum(folds[i]['train'][1]==1)
    neg = np.sum(folds[i]['train'][1]==0)
    initial_bias = np.log([pos/neg])
    # convert to tensor
    initial_bias = torch.tensor(initial_bias, dtype=torch.float)
    LOGGER.info(f'Initial bias: {initial_bias}')
    train_model(config, train_loader, val_loader, 
                [test_loader, gjb2_notVUS_loader, ryr1_notVUS_loader],
                ['test', 'GJB2', 'RYR1'],
                writer, output_bias=initial_bias)
    writer.close()


In [18]:
import pandas as pd
df1 = '/mnt/nas_1/YangLab/loci/tandem/data/RYR1/RYR1-features.csv'
df2 = '/mnt/nas_1/YangLab/loci/tandem/data/RYR1/final_features.csv'

df1 = pd.read_csv(df1)
df2 = pd.read_csv(df2)
cols = df1.columns.tolist()[1:]
for i, row in df1.iterrows():
    sav = row['SAV_coords']
    sel = df2.loc[df2['SAV_coords'] == sav]
    for feat in cols:
        df1_feat = round(row[feat], 4)
        df2_feat = round(sel[feat].values[0], 4)
        if df1_feat != df2_feat:
            print(f'Feature {feat} not equal for {sav}')
            print(f'Row: {df1_feat}')
            print(f'Sel: {df2_feat}')
            # break
        

TypeError: type str doesn't define __round__ method

In [22]:
df1.to_csv('/mnt/nas_1/YangLab/loci/tandem/data/RYR1/RYR1-features.csv', index=False)

In [21]:
df1.drop(columns=['Unnamed: 0'], inplace=True)
df1

Unnamed: 0,SAV_coords,labels,GNM_Ventropy_full,GNM_rmsf_overall_full,GNM_Eigval1_full,GNM_Eigval2_full,GNM_Eigval5_1_full,GNM_SEall_full,GNM_SE20_full,GNM_V1_full,...,delta_phobic_percent,philic_percent,delta_philic_percent,charge,deltaCharge,polarity,deltaPolarity,charge_pH7,DELTA_charge_pH7,chain_length
0,P21817 35 C R,1.0,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.003481,...,0.000000,50.158794,0.000000,0.0,1.0,5.5,5.0,0.0,1.0,4299.0
1,P21817 44 R C,1.0,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.003692,...,0.000000,50.158794,0.000000,1.0,-1.0,10.5,-5.0,1.0,-1.0,4299.0
2,P21817 163 R C,1.0,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.003160,...,0.000000,50.158794,0.000000,1.0,-1.0,10.5,-5.0,0.0,0.0,4299.0
3,P21817 163 R L,1.0,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.003160,...,0.019849,50.158794,-0.019849,1.0,-1.0,10.5,-5.6,0.0,0.0,4299.0
4,P21817 248 G R,1.0,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.004452,...,-0.019849,50.158794,0.019849,0.0,1.0,9.0,1.5,0.0,1.0,4299.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,P21817 4234 V L,,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.002856,...,0.000000,50.158794,0.000000,0.0,0.0,5.9,-1.0,0.0,0.0,4299.0
116,P21817 4737 R Q,,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.001312,...,0.000000,50.158794,0.000000,1.0,-1.0,10.5,0.0,1.0,-1.0,4299.0
117,P21817 4824 L P,,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.000655,...,0.000000,50.158794,0.000000,0.0,0.0,4.9,3.1,0.0,0.0,4299.0
118,P21817 4893 R Q,,7991.661,0.729787,0.001179,0.001179,0.64386,0.898156,0.886682,0.000060,...,0.000000,50.158794,0.000000,1.0,-1.0,10.5,0.0,1.0,-1.0,4299.0


# Split data

In [None]:
# /mnt/nas_1/YangLab/loci/tandem/logs/Optimization_Tandem_NumberOfLayers/20250415-1420/n_hidden-5/evaluations.csv
from src.utils.settings import ROOT_DIR
from prody import LOGGER
import os
import pandas as pd
from src.train.split_data import split_data
from src.train.split_data import dist_protein_level,  dist_cluster_level

SAVs_path = os.path.join(ROOT_DIR, 'data/precomputed_features-ID_opt.tsv')
clstr_path = os.path.join(ROOT_DIR, 'data/c30_clstr_May13_full_rhd.csv')
savepath = os.path.join(ROOT_DIR, 'data/R20000/images')
log = os.path.join(savepath, 'description.txt')
LOGGER.start(log)
folds = split_data(SAVs_path, clstr_path, folder=savepath)
# Protein level
SAVs_path = os.path.join(ROOT_DIR, 'data/precomputed_features-ID_opt.tsv')
dist_protein_level(SAVs_path, folder='data/R20000')

# Cluster level
SAVs_path = os.path.join(ROOT_DIR, 'data/precomputed_features-ID_opt.tsv')
clstr_path = os.path.join(ROOT_DIR, 'data/c30_clstr_May13_full_rhd.csv')
dist_cluster_level(SAVs_path, clstr_path, folder='data/R20000')
LOGGER.exit()

In [None]:
print(5)

# Dataset

In [None]:
from src.train.split_data import get_data
from src.utils.settings import ROOT_DIR
import os 
from prody import LOGGER
savepath = os.path.join(ROOT_DIR, 'data/R20000/images')
sel_feats=['wtBJCE','deltaBJCE']
feat_path = os.path.join(ROOT_DIR, 'data/R20000/features_withNEWpfam_25Mar25.csv')
clstr_path = os.path.join(ROOT_DIR, 'data/c30_clstr_May13_full_rhd.csv')
folds, preprocess_feat = get_data(feat_path, clstr_path, sel_feats=sel_feats,folder=savepath)

# Correlation between features

In [None]:
# from src.train.split_data import get_data, feature_correlation
from src.utils.settings import ROOT_DIR
import os 
import pandas as pd 
# from prody import LOGGER
savepath = os.path.join(ROOT_DIR, 'data/R20000')
feat_path = os.path.join(ROOT_DIR, 'data/R20000/features_withNEWpfam_25Mar25.csv')
# feature_correlation(feat_path,folder=savepath)
df = pd.read_csv(feat_path)
df.columns

In [None]:
len(df.columns)

# Features statistics

In [None]:
# Head to feature_visualization

# Training

In [None]:
from src.train.split_data import get_data
from src.utils.settings import ROOT_DIR
import os 
from prody import LOGGER
savepath = os.path.join(ROOT_DIR, 'data/R20000/images')
sel_feats=['wtBJCE','deltaBJCE']
feat_path = os.path.join(ROOT_DIR, 'data/R20000/features_withNEWpfam_25Mar25.csv')
clstr_path = os.path.join(ROOT_DIR, 'data/c30_clstr_May13_full_rhd.csv')
folds, preprocess_feat = get_data(feat_path, clstr_path, sel_feats=sel_feats,folder=savepath)


from src.train.split_data import get_data
from src.train.dataset import TANDEM_Dataset, get_test_data
import torch
from src.train.train import train_model
from torch.utils.tensorboard import SummaryWriter
import torch
import numpy as np
from prody import LOGGER

config = {
    'model': {
        'input_shape': 33,
        'n_hidden': 5,
        # 'hidden_shape': 33,
        'output_shape': 2,
        'dropout': 0,
    },
    'train':{
        'n_epochs': 100,
        'patience': 50,
        'batch_size': 64,
        'lr': 1e-4,
        'l1': 0,
        'l2': 1e-4,
    },
    'experiment_name': 'DNN',
    'seed': 0,
    'model_folder': 'logs/models',
    'model_name': 'DNN',
}
LOGGER.start(f'logs/{config["experiment_name"]}/train.log')

clstr_path = './data/old/c30_clstr_May13_full_rhd.csv'
feat_path = 'data/R20000/fReplaceDYN_25Mar19.csv'
folds, preprocess_feat = get_data(feat_path, clstr_path, _plot='ratio_description.png', folds='folds.pkl')

GJB2_path = 'data/GJB2/fReplaceDYNandPfam_25Mar18.csv'
RYR1_path = 'data/RYR1/fReplaceDYNandPfam_25Mar19.csv'
sel_feats=['consurf', 'wt_PSIC', 'Delta_PSIC', 'entropy', 'ACNR', 'sasa', 'BLOSUM', 'stiffness-chain', 'loop_percent', 'atomic_1', 'vector_2', 'co_rank', 'atomic_3', 'atomic_5', 'Dcom', 'vector_1', 'rank_2', 'eig_first', 'ranked_MI', 'delta_h_bond_group', 'phobic_percent', 'eig_sec', 'sheet_percent', 'gyradius', 'delta_polarity', 'side_chain_length', 'helix_percent', 'delta_side_chain_length', 'ANM_effectiveness-chain', 'rank_1', 'rmsf_overall', 'delta_charge', 'delta_phobic_percent']
gjb2_data = get_test_data(GJB2_path, sel_feats, preprocess_feat, name='GJB2')
ryr1_data = get_test_data(RYR1_path, sel_feats, preprocess_feat, name='RYR1')
gjb2_VUS_ds = TANDEM_Dataset(gjb2_data['VUS'])
gjb2_notVUS_ds = TANDEM_Dataset(gjb2_data['notVUS'])
ryr1_VUS_ds = TANDEM_Dataset(ryr1_data['VUS'])
ryr1_notVUS_ds = TANDEM_Dataset(ryr1_data['notVUS'])

gjb2_VUS_loader = torch.utils.data.DataLoader(gjb2_VUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
gjb2_notVUS_loader = torch.utils.data.DataLoader(gjb2_notVUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
ryr1_VUS_loader = torch.utils.data.DataLoader(ryr1_VUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
ryr1_notVUS_loader = torch.utils.data.DataLoader(ryr1_notVUS_ds, batch_size=config['train']['batch_size'], shuffle=False)

for i in range(5):
    writer = SummaryWriter(log_dir=f'logs/{config["experiment_name"]}/fold_{i}')
    train_ds = TANDEM_Dataset(folds[i]['train'])
    val_ds = TANDEM_Dataset(folds[i]['val'])
    test_ds = TANDEM_Dataset(folds[i]['test'])
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=config['train']['batch_size'], shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_ds, batch_size=config['train']['batch_size'], shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=config['train']['batch_size'], shuffle=False)
    # Estimate the initial bias
    pos = len(folds[i]['train'][folds[i]['train']['label']==0])
    neg = len(folds[i]['train'][folds[i]['train']['label']==1])
    initial_bias = np.log([pos/neg])
    LOGGER.info(f'Initial bias: {initial_bias}')
    train_model(config, train_loader, val_loader, 
                [test_loader, gjb2_notVUS_loader, ryr1_notVUS_loader],
                ['test', 'GJB2', 'RYR1'],
                writer, output_bias=initial_bias)
    writer.close()


In [None]:
from src.train.split_data import get_data
from src.utils.settings import ROOT_DIR
from src.train.dataset import TANDEM_Dataset, get_test_data
from src.train.train import train_model
from torch.utils.tensorboard import SummaryWriter
import torch
from prody import LOGGER
import os 
import torch
config = {
    'model': {
        'input_shape': 33,
        'n_hidden': 6,
        'output_shape': 2,
        'dropout': 0.,
    },
    'train':{
        'n_epochs': 300,
        'patience': 50,
        'batch_size': 300,
        'lr': 5e-5,
        'l1': 0,
        'l2': 1e-4,
    },
    'experiment_name': 'DNN',
    'seed': 150,
    'model_folder': 'logs/models',
}
os.makedirs(f'logs/{config["experiment_name"]}', exist_ok=True)
LOGGER.start(f'logs/{config["experiment_name"]}/train.log')
# input
savepath = os.path.join(ROOT_DIR, 'data/R20000/images')
feat_path = os.path.join(ROOT_DIR, 'data/R20000/features_withNEWpfam_25Mar25.csv')
clstr_path = os.path.join(ROOT_DIR, 'data/c30_clstr_May13_full_rhd.csv')
GJB2_path = os.path.join(ROOT_DIR, 'data/GJB2/features_25Mar27.csv')
RYR1_path = os.path.join(ROOT_DIR, 'data/RYR1/features_25Mar26.csv')
# selected features
sel_feats = ['GNM_co_rank_reduced', 'ANM_stiffness_reduced', 'GNM_V1_reduced', 
'wtBJCE', 'GNM_V2_reduced', 'GNM_SEall_reduced', 'GNM_rankV2_reduced', 
'GNM_rankV1_reduced', 'GNM_SE20_reduced', 'GNM_Eigval2_reduced', 
'GNM_Eigval5_1_reduced', 'deltaBJCE', 'SASA', 'Dcom', 'loop_percent', 'AG1', 'AG5', 'AG3', 'SSbond', 
'Hbond', 'DELTA_DSS', 'DELTA_Hbond', 'sheet_percent', 'helix_percent', 'IDRs', 
'consurf', 'ACNR', 'wtPSIC', 'deltaPSIC', 'entropy', 'BLOSUM', 'ranked_MI', 'deltaPolarity']
# get data
folds, preprocess_feat = get_data(feat_path, clstr_path, sel_feats=sel_feats,folder=savepath)
gjb2_data = get_test_data(GJB2_path, sel_feats, preprocess_feat, name='GJB2')
ryr1_data = get_test_data(RYR1_path, sel_feats, preprocess_feat, name='RYR1')
gjb2_VUS_ds = TANDEM_Dataset(gjb2_data['VUS'])
gjb2_notVUS_ds = TANDEM_Dataset(gjb2_data['notVUS'])
ryr1_VUS_ds = TANDEM_Dataset(ryr1_data['VUS'])
ryr1_notVUS_ds = TANDEM_Dataset(ryr1_data['notVUS'])

gjb2_VUS_loader = torch.utils.data.DataLoader(gjb2_VUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
gjb2_notVUS_loader = torch.utils.data.DataLoader(gjb2_notVUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
ryr1_VUS_loader = torch.utils.data.DataLoader(ryr1_VUS_ds, batch_size=config['train']['batch_size'], shuffle=False)
ryr1_notVUS_loader = torch.utils.data.DataLoader(ryr1_notVUS_ds, batch_size=config['train']['batch_size'], shuffle=False)

from src.train.train import train_model
for i in range(5):
    writer = SummaryWriter(log_dir=f'logs/{config["experiment_name"]}/fold_{i}')
    train_ds = TANDEM_Dataset(folds[i]['train'])
    val_ds = TANDEM_Dataset(folds[i]['val'])
    test_ds = TANDEM_Dataset(folds[i]['test'])
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=config['train']['batch_size'], shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_ds, batch_size=config['train']['batch_size'], shuffle=False)
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=config['train']['batch_size'], shuffle=False)
    # Estimate the initial bias
    pos = len([folds[i]['train'][1]==0])
    neg = len([folds[i]['train'][1]==1])
    initial_bias = torch.tensor([pos/neg], dtype=torch.float)
    torch.manual_seed(config['seed']) # random seed
    device = "cpu"
    device = torch.device(device)
    train_model(config, train_loader, val_loader, [test_loader, gjb2_notVUS_loader, ryr1_notVUS_loader],
                ['test', 'GJB2', 'RYR1'], writer, output_bias=initial_bias)

In [None]:
from src.train.run import get_seed, use_all_gpus, get_config, build_model
from src.train.modules import build_optimizer
import tensorflow as tf
n_hidden=6 ; patience=50
n_feats = 33
input_shape = 33
cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience)
model = build_model(cfg)

optimizer = build_optimizer(cfg)
model.compile(optimizer=optimizer, loss=cfg.training.loss, 
                    metrics=[
                        tf.keras.metrics.Accuracy(name='accuracy'),
                            tf.keras.metrics.AUC(name='auc'), 
                            tf.keras.metrics.Precision(name='precision'), 
                            tf.keras.metrics.Recall(name='recall')])

In [None]:
# from src.train.split_data import get_data
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
from prody import LOGGER
import os 
import logging
import datetime

ryr1 = 'data/RYR1/fReplaceDYN_25Mar19.csv'
gjb2 = 'data/GJB2/fReplaceDYN_25Mar18.csv'
r20000 = 'data/R20000/fReplaceDYN_25Mar19.csv'


ryr1 = 'data/RYR1/oldfReplaceDYNandPfam_25Mar19.csv'
gjb2 = 'data/GJB2/oldfReplaceDYNandPfam_25Mar18.csv'
r20000 = 'data/R20000/oldfReplaceDYN_withNEWpfam_25Mar25.csv'

ryr1 = 'data/RYR1/oldfReplaceDYNandPfam_25Mar19.csv'
gjb2 = 'data/GJB2/oldfReplaceDYNandPfam_25Mar18.csv'
r20000 = 'data/R20000/oldfReplaceDYN_withNEWpfam_25Mar25.csv'


# clstr_path = os.path.join(ROOT_DIR, 'data/c30_clstr_May13_full_rhd.csv')
clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')
old_feat_names = ['consurf', 'wt_PSIC', 'Delta_PSIC', 'entropy', 'ACNR', 'sasa', 'BLOSUM', 'stiffness-chain', 'loop_percent', 'atomic_1', 'vector_2', 'co_rank', 'atomic_3', 'atomic_5', 'Dcom', 'vector_1', 'rank_2', 'eig_first', 'ranked_MI', 'delta_h_bond_group', 'phobic_percent', 'eig_sec', 'sheet_percent', 'gyradius', 'delta_polarity', 'side_chain_length', 'helix_percent', 'delta_side_chain_length', 'ANM_effectiveness-chain', 'rank_1', 'rmsf_overall', 'delta_charge', 'delta_phobic_percent']
new_feat_names = 
seed = get_seed(seed=150)
##################### 1. Set up logging and experiment name #####################
NAME_OF_EXPERIMENT = 'test'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
n_hidden=6 ; patience=50
log_dir = os.path.join('logs', NAME_OF_EXPERIMENT, f'improve-{current_time}-seed-{seed}-n_hidden-{n_hidden}')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=f'{log_dir}/log.txt', level=logging.ERROR, format='%(message)s')
n_feats = 33
logging.error("Number of features: %d", n_feats)
use_all_gpus()
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, n_feats=n_feats, new_feat_names)
GJB2_knw, GJB2_unk = get_GJB2(gjb2, feat_names, preprocess_feat)
RYR1_knw, RYR1_unk = get_RYR1(ryr1, feat_names, preprocess_feat)

input_shape = R20000[2].shape[1]
cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience)
train_model(folds, cfg, log_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)

In [None]:
import os
import pandas as pd
from tensorflow.keras.models import load_model
from modules import np_to_dataset
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
x_train_1 = folds[0]['train']['x']
x_test = folds[0]['test']['x']
y_test = folds[0]['test']['y']
test_ds = np_to_dataset(x_test, y_test, shuffle=False, batch_size=300)

df_x_train_1 = pd.DataFrame(x_train_1, columns=feat_names)
df_x_test = pd.DataFrame(x_test, columns=feat_names)

model_path = './logs/thesis-Optimization/improve-20240706-2033-seed-150/n_hidden_6' #model_fold_1.h5'
models = [load_model(os.path.join(model_path, f'model_fold_{i}.h5')) for i in range(1, 6)]

# Make prediction
y_pred = models[0].predict(x_test)

y_test_indicator = np.argmax(y_test, axis=1)
y_pred_indicator = np.argmax(y_pred, axis=1)
print(f'F1 score: {f1_score(y_test_indicator, y_pred_indicator)}')
print(f'Accuracy: {accuracy_score(y_test_indicator, y_pred_indicator)}')

## Change all features

In [None]:
# from src.train.split_data import get_data
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
import os 
import logging
import datetime
import pandas as pd
ryr1 = 'data/RYR1/features_withNEWpfam_25Mar26.csv'
gjb2 = 'data/GJB2/features_withNEWpfam_25Mar26.csv'
r20000 = 'data/R20000/features_withNEWpfam_25Mar26.csv'

clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')
new_feat_names = ['consurf', 'wtPSIC', 'deltaPSIC', 'entropy', 'ACNR', 'SASA', 'BLOSUM', 'ANM_stiffness_reduced', 'loop_percent', 'AG1', 'GNM_V2_reduced', 'GNM_co_rank_reduced', 'AG3', 'AG5', 'Dcom', 'GNM_V1_reduced', 'GNM_rankV2_reduced', 'GNM_Eigval1_reduced', 'ranked_MI', 'DELTA_Hbond', 'phobic_percent', 'GNM_Eigval2_reduced', 'sheet_percent', 'Rg', 'deltaPolarity', 'Lside', 'helix_percent', 'deltaLside', 'ANM_effectiveness_reduced', 'GNM_rankV1_reduced', 'GNM_rmsf_overall_reduced', 'deltaCharge', 'delta_phobic_percent']
seed = get_seed(seed=150)
##################### 1. Set up logging and experiment name #####################
NAME_OF_EXPERIMENT = 'test'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
n_hidden=6 ; patience=50
log_dir = os.path.join('logs', NAME_OF_EXPERIMENT, f'{current_time}-seed-{seed}-n_hidden-{n_hidden}')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=f'{log_dir}/log.txt', level=logging.ERROR, format='%(message)s')
n_feats = 33
logging.error("Number of features: %d", n_feats)
use_all_gpus()
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, n_feats=n_feats, feat_names=new_feat_names)
GJB2_knw, GJB2_unk = get_GJB2(gjb2, feat_names, preprocess_feat)
RYR1_knw, RYR1_unk = get_RYR1(ryr1, feat_names, preprocess_feat)

input_shape = R20000[2].shape[1]
cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience)
train_model(folds, cfg, log_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)

# Change ranking

In [None]:
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
from prody import LOGGER
import os 
import logging
import datetime
import pandas as pd
ryr1 = 'data/RYR1/features_withNEWpfam_25Mar26.csv'
gjb2 = 'data/GJB2/features_withNEWpfam_25Mar26.csv'
r20000 = 'data/R20000/features_withNEWpfam_25Mar26.csv'

clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')
new_feat_names = ['GNM_co_rank_reduced', 'ANM_stiffness_reduced', 'GNM_V1_reduced', 
'wtBJCE', 'GNM_V2_reduced', 'GNM_SEall_reduced', 'GNM_rankV2_reduced', 
'GNM_rankV1_reduced', 'GNM_SE20_reduced', 'GNM_Eigval2_reduced', 
'GNM_Eigval5_1_reduced', 'deltaBJCE', 'SASA', 'Dcom', 'loop_percent', 'AG1', 'AG5', 'AG3', 'SSbond', 
'Hbond', 'DELTA_DSS', 'DELTA_Hbond', 'sheet_percent', 'helix_percent', 'IDRs', 
'consurf', 'ACNR', 'wtPSIC', 'deltaPSIC', 'entropy', 'BLOSUM', 'ranked_MI', 'deltaPolarity']
seed = get_seed(seed=150)
##################### 1. Set up logging and experiment name #####################
NAME_OF_EXPERIMENT = 'test'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
n_hidden=6 ; patience=50
log_dir = os.path.join('logs', NAME_OF_EXPERIMENT, f'New_ranking_33feats-{current_time}-seed-{seed}-n_hidden-{n_hidden}')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=f'{log_dir}/log.txt', level=logging.ERROR, format='%(message)s')
n_feats = 33
logging.error("Number of features: %d", n_feats)
logging.error("Feature names: %s", new_feat_names)
use_all_gpus()
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, n_feats=n_feats, feat_names=new_feat_names)
GJB2_knw, GJB2_unk = get_GJB2(gjb2, feat_names, preprocess_feat)
RYR1_knw, RYR1_unk = get_RYR1(ryr1, feat_names, preprocess_feat)

input_shape = R20000[2].shape[1]
cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience, dropout_rate=0.2)
train_model(folds, cfg, log_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)

# SEQ-only model

In [None]:
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
# from prody import LOGGER
import os 
import logging
import datetime
import pandas as pd
ryr1 = 'data/RYR1/features_withNEWpfam_25Mar26.csv'
gjb2 = 'data/GJB2/features_withNEWpfam_25Mar26.csv'
r20000 = 'data/R20000/features_withNEWpfam_25Mar26.csv'

clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')
new_feat_names = ['GNM_co_rank_reduced', 'ANM_stiffness_reduced', 'GNM_V1_reduced', 
'wtBJCE', 'GNM_V2_reduced', 'GNM_SEall_reduced', 'GNM_rankV2_reduced', 
'GNM_rankV1_reduced', 'GNM_SE20_reduced', 'GNM_Eigval2_reduced', 
'GNM_Eigval5_1_reduced', 'deltaBJCE', 'SASA', 'Dcom', 'loop_percent', 'AG1', 'AG5', 'AG3', 'SSbond', 
'Hbond', 'DELTA_DSS', 'DELTA_Hbond', 'sheet_percent', 'helix_percent', 'IDRs', 
'consurf', 'ACNR', 'wtPSIC', 'deltaPSIC', 'entropy', 'BLOSUM', 'ranked_MI', 'deltaPolarity']
fns = ['BLOSUM', 'phobic_percent', 'delta_phobic_percent', 'charge', 'deltaCharge', 'charge_pH7', 'DELTA_charge_pH7', 
       'polarity', 'deltaPolarity', 'entropy', 'ranked_MI', 'consurf', 'wtPSIC', 'deltaPSIC', 'ACNR']
fns = fns+['SASA', 'GNM_co_rank_reduced', 'ANM_stiffness_reduced', 'Dcom', 'loop_percent', 'AG1', 'AG5', 'GNM_V1_reduced', 'AG3', 'wtBJCE', 'DELTA_Hbond', 'GNM_V2_reduced', 'sheet_percent', 'SSbond', 'DELTA_DSS', 'Hbond', 'GNM_SE20_reduced', 'deltaBJCE']
seed = get_seed(seed=150)
##################### 1. Set up logging and experiment name #####################
NAME_OF_EXPERIMENT = 'test'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
n_hidden=6 ; patience=50
log_dir = os.path.join('logs', NAME_OF_EXPERIMENT, f'RemoveP00451-{current_time}-seed-{seed}-n_hidden-{n_hidden}')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=f'{log_dir}/log.txt', level=logging.ERROR, format='%(message)s')
n_feats = 33
logging.error("Number of features: %d", n_feats)
# logging.error("Feature names: %s", fns)
use_all_gpus()
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, n_feats=n_feats, feat_names=fns)
GJB2_knw, GJB2_unk = get_GJB2(gjb2, feat_names, preprocess_feat) 
RYR1_knw, RYR1_unk = get_RYR1(ryr1, feat_names, preprocess_feat)
input_shape = R20000[2].shape[1]
cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience)#, dropout_rate=0.2)
train_model(folds, cfg, log_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)

In [None]:
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
# from prody import LOGGER
import os 
import logging
import datetime
import pandas as pd
ryr1 = 'data/RYR1/features_withNEWpfam_25Mar26.csv'
gjb2 = 'data/GJB2/features_withNEWpfam_25Mar26.csv'
r20000 = 'data/R20000/features_withNEWpfam_25Mar26.csv'
clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')

fns = ['BLOSUM', 'phobic_percent', 'delta_phobic_percent', 'charge', 'deltaCharge', 'charge_pH7', 'DELTA_charge_pH7', 
       'polarity', 'deltaPolarity', 'entropy', 'ranked_MI', 'consurf', 'wtPSIC', 'deltaPSIC', 'ACNR']
fns = fns+['SASA', 'GNM_co_rank_reduced', 'ANM_stiffness_reduced', 'Dcom', 'loop_percent', 'AG1', 'AG5', 'GNM_V1_reduced', 'AG3', 'wtBJCE', 'DELTA_Hbond', 'GNM_V2_reduced', 'sheet_percent', 'SSbond', 'DELTA_DSS', 'Hbond', 'GNM_SE20_reduced', 'deltaBJCE']
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, feat_names=fns)

In [None]:
df = pd.read_csv(r20000)
df[df['SAV_coords'].str.contains('P04637')].labels.value_counts()

In [None]:
import tensorflow as tf

class rDNN(tf.keras.Model):
    """
    This class defines the rDNN model in TensorFlow, equivalent to the PyTorch version.
    """
    def __init__(self, input_shape=33, n_hidden=6, output_shape=2, dropout=0.2, output_bias=None):
        super(rDNN, self).__init__()

        hidden_shape = 33
        hidden_last_shape = 10

        self.hidden_layers = []
        self.hidden_layers.append(tf.keras.layers.Dense(hidden_shape, input_shape=(input_shape,)))
        
        for _ in range(n_hidden - 1):
            self.hidden_layers.append(tf.keras.layers.Dense(hidden_shape))
        
        self.hidden_layers.append(tf.keras.layers.Dense(hidden_last_shape))

        # Output layer
        if output_bias is not None:
            output_bias_initializer = tf.keras.initializers.Constant(output_bias)
        else:
            output_bias_initializer = 'zeros'

        self.output_layer = tf.keras.layers.Dense(output_shape, bias_initializer=output_bias_initializer)
        
        self.gelu = tf.keras.activations.gelu
        self.dropout = tf.keras.layers.Dropout(dropout)
        self.softmax = tf.keras.layers.Softmax(axis=-1)

    def call(self, inputs, training=False):
        x = inputs
        for layer in self.hidden_layers:
            x = layer(x)
            x = self.gelu(x)
            x = self.dropout(x, training=training)
        x = self.output_layer(x)
        return self.softmax(x)

model = rDNN(input_shape=33, n_hidden=5, output_shape=2, dropout=0.2)

x = tf.random.normal((1, 33))
y = model(x)

# Rename

In [None]:
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
import os 
import logging
import datetime

r20000 = 'data/R20000/feat_rename_and_recal_otherfeatures_features_24Sep29.csv'
gjb2 = 'data/GJB2/features_25Apr01.csv'
ryr1 = 'data/RYR1/feat_rename_and_recal_otherfeatures_features_24May20.csv'

clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')
fns = [
    "consurf", "wtPSIC", "deltaPSIC", "entropy", "ACNR", "SASA", "BLOSUM", "ANM_stiffness_chain",
    "loop_percent", "AG1", "GNM_V2_full", "GNM_co_rank_full", "AG3", "AG5", "Dcom", "GNM_V1_full",
    "GNM_rankV2_full", "GNM_Eigval1_full", "ranked_MI", "DELTA_Hbond", "phobic_percent", "GNM_Eigval2_full",
    "sheet_percent", "Rg", "deltaPolarity", "Lside", "helix_percent", "deltaLside", "ANM_effectiveness_chain",
    "GNM_rankV1_full", "GNM_rmsf_overall_full", "deltaCharge", "delta_phobic_percent"]
seed = get_seed(seed=150)
##################### 1. Set up logging and experiment name #####################
NAME_OF_EXPERIMENT = 'test'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
n_hidden=6 ; patience=50
log_dir = os.path.join('logs', NAME_OF_EXPERIMENT, f'testxongxoa')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=f'{log_dir}/log.txt', level=logging.ERROR, format='%(message)s')
n_feats = 33
logging.error("Number of features: %d", n_feats)
use_all_gpus()
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, n_feats=n_feats, feat_names=fns)
GJB2_knw, GJB2_unk = get_GJB2(gjb2, feat_names, preprocess_feat) 
RYR1_knw, RYR1_unk = get_RYR1(ryr1, feat_names, preprocess_feat)

input_shape = R20000[2].shape[1]
cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience)#, dropout_rate=0.2)
train_model(folds, cfg, log_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)


In [None]:
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
import os 
import logging
import datetime

r20000 = 'data/R20000/feat_rename_and_recal_otherfeatures_features_24Sep29.csv'
gjb2 = 'data/GJB2/features_25Apr01.csv'
ryr1 = 'data/RYR1/feat_rename_and_recal_otherfeatures_features_24May20.csv'


clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')
fns = [
    "consurf", "wtPSIC", "deltaPSIC", "entropy", "ACNR", "SASA", "BLOSUM", "ANM_stiffness_chain",
    "loop_percent", "AG1", "GNM_V2_full", "GNM_co_rank_full", "AG3", "AG5", "Dcom", "GNM_V1_full",
    "GNM_rankV2_full", "GNM_Eigval1_full", "ranked_MI", "DELTA_Hbond", "phobic_percent", "GNM_Eigval2_full",
    "sheet_percent", "Rg", "deltaPolarity", "Lside", "helix_percent", "deltaLside", "ANM_effectiveness_chain",
    "GNM_rankV1_full", "GNM_rmsf_overall_full", "deltaCharge", "delta_phobic_percent"]
seed = get_seed(seed=150)
##################### 1. Set up logging and experiment name #####################
NAME_OF_EXPERIMENT = 'test'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
n_hidden=6 ; patience=50
log_dir = os.path.join('logs', NAME_OF_EXPERIMENT, f'recal_{current_time}_differentDroprates')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=f'{log_dir}/log.txt', level=logging.ERROR, format='%(message)s')
logging.error(f"r20000: {r20000}")
logging.error(f"gjb2: {gjb2}")
logging.error(f"ryr1: {ryr1}")
logging.error(f"clstr_path: {clstr_path}")
logging.error(f"fns: {fns}")
logging.error(f"seed: {seed}")
logging.error("Description: Recalculate all features and use the new features to train the model")
logging.error("Experiment with different drop rates")
logging.error("*"*50)
n_feats = 33
logging.error("Number of features: %d", n_feats)
use_all_gpus()
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, n_feats=n_feats, feat_names=fns)
GJB2_knw, GJB2_unk = get_GJB2(gjb2, feat_names, preprocess_feat) 
RYR1_knw, RYR1_unk = get_RYR1(ryr1, feat_names, preprocess_feat)
input_shape = R20000[2].shape[1]

drop_rates = [0.0, 0.2, 0.4, 0.6, 0.8]
for drop_rate in drop_rates:
    cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience, dropout_rate=drop_rate)
    fold_dir = os.path.join(log_dir, f'dropout_rate-{drop_rate}')
    os.makedirs(fold_dir, exist_ok=True)
    train_model(folds, cfg, fold_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)


In [None]:
from src.utils.settings import ROOT_DIR
from src.train.run import  get_GJB2, get_RYR1, get_data, train_model, get_seed, use_all_gpus, get_config
import os 
import logging
import datetime

r20000 = 'data/R20000/feat_rename_and_recal_otherfeatures_features_24Sep29.csv'
gjb2 = 'data/GJB2/features_25Apr01.csv'
ryr1 = 'data/RYR1/feat_rename_and_recal_otherfeatures_features_24May20.csv'


clstr_path = os.path.join(ROOT_DIR, 'data/old/c30_clstr_May13.csv')
fns = [
    "consurf", "wtPSIC", "deltaPSIC", "entropy", "ACNR", "SASA", "BLOSUM", "ANM_stiffness_chain",
    "loop_percent", "AG1", "GNM_V2_full", "GNM_co_rank_full", "AG3", "AG5", "Dcom", "GNM_V1_full",
    "GNM_rankV2_full", "GNM_Eigval1_full", "ranked_MI", "DELTA_Hbond", "phobic_percent", "GNM_Eigval2_full",
    "sheet_percent", "Rg", "deltaPolarity", "Lside", "helix_percent", "deltaLside",
    "GNM_rankV1_full", "GNM_rmsf_overall_full", "deltaCharge", 'wtBJCE', 'deltaBJCE']
seed = get_seed(seed=150)
##################### 1. Set up logging and experiment name #####################
NAME_OF_EXPERIMENT = 'test'
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M")
n_hidden=6 ; patience=50
log_dir = os.path.join('logs', NAME_OF_EXPERIMENT, f'recal_{current_time}_replace_EffanddeltaPhob_withBJCE')
os.makedirs(log_dir, exist_ok=True)
logging.basicConfig(filename=f'{log_dir}/log.txt', level=logging.ERROR, format='%(message)s')
logging.error(f"r20000: {r20000}")
logging.error(f"gjb2: {gjb2}")
logging.error(f"ryr1: {ryr1}")
logging.error(f"clstr_path: {clstr_path}")
logging.error(f"seed: {seed}")
logging.error("Description: Recalculate all features and use the new features to train the model")
logging.error("Description: Replace Effectiveness and delta_phobic_percent with wtBJCE and deltaBJCE")
# logging.error("Experiment with different drop rates")
logging.error("*"*50)
n_feats = 33
logging.error("Number of features: %d", n_feats)
use_all_gpus()
folds, R20000, preprocess_feat, feat_names = get_data(r20000, clstr_path, n_feats=n_feats, feat_names=fns)
GJB2_knw, GJB2_unk = get_GJB2(gjb2, feat_names, preprocess_feat) 
RYR1_knw, RYR1_unk = get_RYR1(ryr1, feat_names, preprocess_feat)
input_shape = R20000[2].shape[1]
cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience)
train_model(folds, cfg, log_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)

# drop_rates = [0.0, 0.2, 0.4, 0.6, 0.8]
# for drop_rate in drop_rates:
#     cfg = get_config(input_shape, n_hidden=n_hidden, patience=patience, dropout_rate=drop_rate)
#     fold_dir = os.path.join(log_dir, f'dropout_rate-{drop_rate}')
#     os.makedirs(fold_dir, exist_ok=True)
#     train_model(folds, cfg, fold_dir, GJB2_knw, GJB2_unk, RYR1_knw, RYR1_unk, seed=seed)
