<a href="https://colab.research.google.com/github/npradeep96/IDRBlocks/blob/master/IDR_model_domain_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so the notebook can reach its data files,
# then make the project folder the working directory via the %cd magic.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/Colab Notebooks/ML_for_IDRs'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/ML_for_IDRs


In [None]:
import pandas as pd
! pip install biopython
from Bio import SeqIO
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import itertools
from tqdm import tqdm



In [None]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
torch.set_float32_matmul_precision(precision='medium')

!pip install lightning
import lightning as L
from lightning.pytorch import LightningModule
from lightning.pytorch.loggers import CSVLogger

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score, r2_score
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix



In [None]:
!pip install optuna
!pip install optuna-integration
import optuna
from optuna_integration import PyTorchLightningPruningCallback



In [None]:
import matplotlib

# Global figure styling applied to every plot in this notebook: larger font,
# thick green default lines, heavier axes spines and major ticks, no LaTeX.
matplotlib.rcParams.update({
    'font.size': 15,
    'lines.linewidth': 3,
    'lines.color': 'g',
    'axes.linewidth': 2.0,
    'xtick.major.size': 6,
    'ytick.major.size': 6,
    'xtick.major.width': 2,
    'ytick.major.width': 2,
    'text.usetex': False,
})

## Make pandas dataset for IDR sequence-derived features and run a Neural Network for classification

In [None]:
# Load the table of IDR sequence-derived features from Drive and preview
# its first ten rows.
data_matrix_idr_features = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/ML_for_IDRs/data_matrix.csv'
)
data_matrix_idr_features.head(10)


Unnamed: 0,Uniprot ID,IDR Sequence List,IDR Sequence Combined,IDR Count,Total IDR Length,Fraction Positive,Fraction Negative,Fraction Expanding,FCR,NCPR,Kappa,Omega,Isoelectric Point,Uversky Hydropathy,PPII Propensity,Delta,Delta Max,SCD,Log Partition Ratios
0,Q8WUM0,['FPAAPSPRTPGTGSRRGPLAGLGPGSTPRTASRKGLPL'],FPAAPSPRTPGTGSRRGPLAGLGPGSTPRTASRKGLPL,1.0,38.0,0.157895,0.0,0.368379,0.157852,0.157852,0.242429,0.053462,14.499987,0.433333,0.426842,0.023065,0.095141,1.334377,10.371906
1,Q9NVP1,['NLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQ...,NLTLSETQNGDVSEETMGSRKVKKSKQKPMNVGLSETQNGGMSQEA...,1.0,138.0,0.181159,0.181159,0.405402,0.361924,-0.000133,0.402122,0.181223,7.0,0.340419,0.391667,0.139929,0.347977,-3.062043,9.565771
2,Q8N7H5,['DKSGSGEDESSEDEARAARDKEEIFGSDADSEDDADSDDEDRGQ...,DKSGSGEDESSEDEARAARDKEEIFGSDADSEDDADSDDEDRGQAQ...,1.0,102.0,0.078431,0.313725,0.414803,0.395195,-0.231872,0.25372,0.121241,3.609375,0.31427,0.31598,0.070581,0.278184,14.80604,9.189393
3,Q8WUA4,['SPGQEVLNQLDVKTSSEMTSAEASVEMSLPTPLPGFEDSPDQRR...,SPGQEVLNQLDVKTSSEMTSAEASVEMSLPTPLPGFEDSPDQRRLP...,2.0,255.0,0.164706,0.14902,0.479416,0.31471,0.015757,0.320499,0.070002,8.75,0.371373,0.448118,0.097659,0.30471,-3.284487,9.121233
4,Q8WVC0,['ADMEDLFGSDADSEAERKDSDSGSDSDSDQENAASGSNASGSES...,ADMEDLFGSDADSEAERKDSDSGSDSDSDQENAASGSNASGSESDQ...,3.0,456.0,0.120614,0.346491,0.509185,0.471904,-0.220657,0.226271,0.102102,3.882812,0.276827,0.36057,0.084354,0.372802,127.322525,9.007168
5,P55884,['QDAENVAVPEAAEERAEPGQQQPAAEPPPAEGLLRPAGPGAPEA...,QDAENVAVPEAAEERAEPGQQQPAAEPPPAEGLLRPAGPGAPEAAG...,1.0,157.0,0.050955,0.248408,0.446737,0.30024,-0.196211,0.111375,0.037347,3.5,0.389314,0.435223,0.024696,0.221737,21.077413,8.627768
6,Q9Y5Q8,['LFSSSAKADGGKEQLTYESGEDEEDEEEEEEEEEDFKPSDGSEN...,LFSSSAKADGGKEQLTYESGEDEEDEEEEEEEEEDFKPSDGSENEM...,1.0,54.0,0.055556,0.444444,0.518157,0.499638,-0.388703,0.604509,0.30025,3.28125,0.328189,0.34963,0.149423,0.247181,11.883401,7.99708
7,P55081,"['SVPSALMKQPPIQSTAGAVPVRNEKG', 'VSGKRPDYAPMESS...",SVPSALMKQPPIQSTAGAVPVRNEKGVSGKRPDYAPMESSDEEDEE...,2.0,178.0,0.146067,0.331461,0.528732,0.47817,-0.18431,0.343609,0.075862,3.9375,0.327778,0.392191,0.138866,0.404138,17.644652,7.71333
8,Q9UBU9,['ADEGKSYSEHDDERVNFPQRKKKGRGPFRWKYGEGNRRSGRGGS...,ADEGKSYSEHDDERVNFPQRKKKGRGPFRWKYGEGNRRSGRGGSGI...,1.0,84.0,0.22619,0.178571,0.46612,0.406597,0.049555,0.32554,0.053614,10.0625,0.298942,0.354048,0.123157,0.378316,-1.146155,7.690745
9,Q8IX01,"['SGSVAHSRDAGREGLRSDVFPGPSFRSSNPSI', 'RATTGTQT...",SGSVAHSRDAGREGLRSDVFPGPSFRSSNPSIRATTGTQTLLSSGT...,3.0,198.0,0.070707,0.20202,0.417808,0.276394,-0.12834,0.29006,0.155521,3.9375,0.390685,0.405758,0.066557,0.229458,4.407218,7.650883


In [None]:
# Summary statistics (count, mean, std, quartiles) of the loaded feature table
data_matrix_idr_features.describe()

Unnamed: 0,IDR Count,Total IDR Length,Fraction Positive,Fraction Negative,Fraction Expanding,FCR,NCPR,Kappa,Omega,Isoelectric Point,Uversky Hydropathy,PPII Propensity,Delta,Delta Max,SCD,Log Partition Ratios
count,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0,1832.0
mean,2.293668,190.222162,0.143337,0.155362,0.416545,0.301813,-0.009218,0.248728,0.170126,7.562398,0.355894,0.40823,0.059863,0.242669,3.667438,0.252371
std,2.106919,267.544894,0.082754,0.100643,0.105166,0.128833,0.131495,0.147247,0.121777,3.204692,0.053241,0.050111,0.042068,0.118429,24.864038,2.98108
min,1.0,25.0,0.0,0.0,0.052521,0.0,-0.759432,-1.0,0.005422,1.75,0.143928,0.183107,0.0,0.0,-55.427329,-6.835415
25%,1.0,48.0,0.084389,0.086957,0.35225,0.213889,-0.064195,0.170169,0.101439,4.375,0.325178,0.379016,0.030376,0.16139,-0.727035,-1.862596
50%,2.0,103.0,0.132796,0.137931,0.413697,0.290191,0.002304,0.226925,0.144121,7.0,0.360875,0.405771,0.049537,0.234057,0.05679,0.119152
75%,3.0,226.0,0.1875,0.2,0.479737,0.375887,0.058752,0.30818,0.204779,10.0625,0.391704,0.435684,0.078264,0.310657,2.041941,2.414201
max,24.0,3108.0,0.514286,0.76,0.836311,0.836311,0.463884,1.0,1.192246,14.499987,0.534156,0.6716,0.349907,0.818542,839.796075,10.371906


In [None]:
# Convert IDR counts to log2 IDR counts
data_matrix_idr_features['log2 IDR Count'] = np.log2(data_matrix_idr_features['IDR Count'])

# Convert total length of IDR to log10 length of IDR
data_matrix_idr_features['log10 Total IDR Length'] = np.log10(data_matrix_idr_features['Total IDR Length'])

# Keep only the sequences whose SCD lies strictly inside (-25, 25); extreme
# values in either direction are dropped. The masks are combined with the
# boolean `&` operator, and `.copy()` makes the result an independent frame so
# that adding columns to it later does not raise SettingWithCopyWarning.
scd = data_matrix_idr_features['SCD']
data_matrix_idr_features = data_matrix_idr_features[(scd > -25.0) & (scd < 25.0)].copy()

# Label proteins for classification by their log partition ratio: clearly
# enriched (> +threshold) or clearly depleted (< -threshold); everything in
# between is marked ambiguous so it can be filtered out afterwards.
def assign_labels(pr, threshold=1.0):
  '''
  Map a log partition ratio to a class label.

  Args:
      pr (float): log partition ratio of one protein
      threshold (float): magnitude a ratio must strictly exceed to be labelled;
          defaults to 1.0, the cutoff this notebook uses

  Returns:
      int: 1 if pr > threshold, 0 if pr < -threshold, otherwise -1
          (this includes values exactly at +/-threshold and NaN)
  '''
  if pr > threshold:
    return 1
  if pr < -threshold:
    return 0
  return -1

# Attach the class label derived from the log partition ratio. `.assign`
# returns a new frame, so nothing is written into a slice of the loaded table
# (the cause of pandas' SettingWithCopyWarning).
data_matrix_idr_features = data_matrix_idr_features.assign(
    **{'Enrichment Label': data_matrix_idr_features['Log Partition Ratios'].apply(assign_labels)}
)

# Drop the rows whose Enrichment Label is -1 (ambiguous partitioning)
data_matrix_idr_features = data_matrix_idr_features[
    data_matrix_idr_features['Enrichment Label'] != -1
].copy()

# Derive a dataset from above for training a neural network.
# Layout: identifier first, the 15 feature columns next, the label last;
# later cells slice the features out as columns[1:-1].
columns = ['Uniprot ID', 'log2 IDR Count', 'log10 Total IDR Length', 'Fraction Positive',
           'Fraction Negative', 'Fraction Expanding', 'FCR', 'NCPR', 'Kappa',
           'Omega', 'Isoelectric Point', 'Uversky Hydropathy', 'PPII Propensity',
           'Delta', 'Delta Max', 'SCD', 'Enrichment Label']

# Independent copy, so later edits cannot alias the full feature table
data_matrix_nn = data_matrix_idr_features[columns].copy()

data_matrix_nn.head(10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_matrix_idr_features['Enrichment Label'] = data_matrix_idr_features['Log Partition Ratios'].apply(assign_labels)


Unnamed: 0,Uniprot ID,log2 IDR Count,log10 Total IDR Length,Fraction Positive,Fraction Negative,Fraction Expanding,FCR,NCPR,Kappa,Omega,Isoelectric Point,Uversky Hydropathy,PPII Propensity,Delta,Delta Max,SCD,Enrichment Label
0,Q8WUM0,0.0,1.579784,0.157895,0.0,0.368379,0.157852,0.157852,0.242429,0.053462,14.499987,0.433333,0.426842,0.023065,0.095141,1.334377,1
1,Q9NVP1,0.0,2.139879,0.181159,0.181159,0.405402,0.361924,-0.000133,0.402122,0.181223,7.0,0.340419,0.391667,0.139929,0.347977,-3.062043,1
2,Q8N7H5,0.0,2.0086,0.078431,0.313725,0.414803,0.395195,-0.231872,0.25372,0.121241,3.609375,0.31427,0.31598,0.070581,0.278184,14.80604,1
3,Q8WUA4,1.0,2.40654,0.164706,0.14902,0.479416,0.31471,0.015757,0.320499,0.070002,8.75,0.371373,0.448118,0.097659,0.30471,-3.284487,1
5,P55884,0.0,2.1959,0.050955,0.248408,0.446737,0.30024,-0.196211,0.111375,0.037347,3.5,0.389314,0.435223,0.024696,0.221737,21.077413,1
6,Q9Y5Q8,0.0,1.732394,0.055556,0.444444,0.518157,0.499638,-0.388703,0.604509,0.30025,3.28125,0.328189,0.34963,0.149423,0.247181,11.883401,1
7,P55081,1.0,2.25042,0.146067,0.331461,0.528732,0.47817,-0.18431,0.343609,0.075862,3.9375,0.327778,0.392191,0.138866,0.404138,17.644652,1
8,Q9UBU9,0.0,1.924279,0.22619,0.178571,0.46612,0.406597,0.049555,0.32554,0.053614,10.0625,0.298942,0.354048,0.123157,0.378316,-1.146155,1
9,Q8IX01,1.584963,2.296665,0.070707,0.20202,0.417808,0.276394,-0.12834,0.29006,0.155521,3.9375,0.390685,0.405758,0.066557,0.229458,4.407218,1
10,O75643,0.0,1.477121,0.4,0.233333,0.671853,0.638519,0.172145,0.226899,0.255478,11.8125,0.184815,0.414667,0.122409,0.539486,0.211554,1


In [None]:
# Summary statistics of the filtered, labelled table used for the neural network
data_matrix_nn.describe()

Unnamed: 0,log2 IDR Count,log10 Total IDR Length,Fraction Positive,Fraction Negative,Fraction Expanding,FCR,NCPR,Kappa,Omega,Isoelectric Point,Uversky Hydropathy,PPII Propensity,Delta,Delta Max,SCD,Enrichment Label
count,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0
mean,0.795535,2.014308,0.142208,0.153968,0.415018,0.299294,-0.008926,0.249379,0.172773,7.515572,0.356876,0.408399,0.059484,0.241428,1.06123,0.520455
std,0.917237,0.416566,0.081166,0.096956,0.105361,0.128098,0.124422,0.149318,0.124985,3.168552,0.053399,0.049985,0.043214,0.121085,4.98051,0.499771
min,0.0,1.39794,0.0,0.0,0.07998,0.0,-0.759432,-1.0,0.00668,1.75,0.143928,0.183107,0.0,0.0,-23.75916,0.0
25%,0.0,1.662758,0.084803,0.087433,0.35098,0.213453,-0.0635,0.168574,0.10126,4.375,0.325468,0.379663,0.029184,0.15745,-0.753249,0.0
50%,1.0,1.963788,0.133333,0.136083,0.413243,0.286432,0.001815,0.228308,0.145539,7.0,0.361325,0.406561,0.049011,0.228622,0.027689,1.0
75%,1.584963,2.318584,0.186956,0.2,0.477872,0.370775,0.056232,0.309128,0.206853,10.0625,0.392221,0.435957,0.0775,0.310027,1.735813,1.0
max,4.0,3.354685,0.514286,0.76,0.836311,0.836311,0.463884,1.0,0.959474,14.499987,0.534156,0.6108,0.349907,0.818542,24.96679,1.0


In [None]:
# Standardise each feature column to zero mean and unit variance.
# columns[1:-1] skips the identifier (first entry) and the label (last entry).
# NOTE(review): the scaler is fitted on every row, i.e. before the
# train/validation/test split made in a later cell - confirm this is intended.
X = data_matrix_nn[columns[1:-1]].values
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=columns[1:-1])
X_scaled.describe()

Unnamed: 0,log2 IDR Count,log10 Total IDR Length,Fraction Positive,Fraction Negative,Fraction Expanding,FCR,NCPR,Kappa,Omega,Isoelectric Point,Uversky Hydropathy,PPII Propensity,Delta,Delta Max,SCD
count,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0,1320.0
mean,-9.689219000000001e-17,-5.275242e-16,4.30632e-17,4.30632e-17,4.629294e-16,-1.722528e-16,-1.345725e-18,1.507212e-16,9.689219000000001e-17,9.823792000000001e-17,5.490558e-16,4.30632e-17,-1.937844e-16,6.459479e-17,-1.07658e-17
std,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379,1.000379
min,-0.8676455,-1.4802,-1.752724,-1.588618,-3.181112,-2.337335,-6.034226,-8.370418,-1.329407,-1.820313,-3.989417,-4.508856,-1.377028,-1.994621,-4.985393
25%,-0.8676455,-0.8442436,-0.7075248,-0.6865038,-0.6080232,-0.6703703,-0.4387838,-0.5413648,-0.5723862,-0.991545,-0.5884062,-0.5750971,-0.7014382,-0.6938079,-0.3644542
50%,0.2229979,-0.121324,-0.1093776,-0.1845362,-0.01685367,-0.1004417,0.08636145,-0.1411693,-0.2179832,-0.1627769,0.08335151,-0.03677559,-0.2424568,-0.1057955,-0.2075958
75%,0.8609834,0.7307146,0.5515233,0.4749482,0.5967844,0.5582362,0.523889,0.4002962,0.2727716,0.8041192,0.6621698,0.5515296,0.4170562,0.5667491,0.1354959
max,3.494928,3.218896,4.585899,6.252935,4.000074,4.193838,3.801495,5.028908,6.296729,2.205128,3.321208,4.050739,6.723096,4.767989,4.801641


In [None]:
# Assign data
X_scaled = X_scaled[columns[1:-1]]
y = data_matrix_nn['Enrichment Label'].values

# Fixed seed so the splits (and everything trained on them) are reproducible
SPLIT_SEED = 42

# Do a test:train 20:80 split.
# The standardised features are split here; splitting the raw `X` would leave
# the scaling step unused. `.values` keeps the splits as numpy arrays.
X_train, X_test, y_train, y_test = train_test_split(
        X_scaled.values, y, test_size=0.2, train_size=0.8, random_state=SPLIT_SEED)
# Divide the train set into a validation:train 20:80 split
# (overall: 64% train, 16% validation, 20% test)
X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, train_size=0.8, random_state=SPLIT_SEED)

# Generate dataset
class IDRFeaturesDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Both inputs are converted once, at construction, to float32 tensors.

    Args:
        X: array-like of per-sample feature rows
        y: array-like of per-sample labels, aligned with X
    """

    def __init__(self, X, y):
        self.X = torch.tensor(np.array(X), dtype=torch.float32)
        self.y = torch.tensor(np.array(y), dtype=torch.float32)
        # Sample count, cached for __len__
        self.len = len(self.X)

    def __len__(self):
        return self.len

    def __getitem__(self, index):
        # (features, label) for the requested sample
        return self.X[index], self.y[index]

# Wrap each split in an IDRFeaturesDataset so it can be fed to a DataLoader
train_data = IDRFeaturesDataset(X_train, y_train) # training split
val_data = IDRFeaturesDataset(X_val, y_val) # validation split
test_data = IDRFeaturesDataset(X_test, y_test) # held-out test split

# Train model

In [None]:
# Set up model

class IDRFeatureNN(LightningModule):
    """Feed-forward binary classifier over IDR sequence-derived features.

    Three hidden blocks (Linear -> ReLU -> Dropout) feed a single-unit output
    layer followed by a sigmoid, so ``forward`` returns probabilities.

    Args:
        hidden_dim: width of each hidden layer
        lr: learning rate handed to Adam
        weight_decay: Adam weight decay (L2 regularisation)
        dropout_rate: dropout probability after each hidden activation
        input_dim: number of input features per sample
    """

    def __init__(self, hidden_dim=16, lr=1e-3, weight_decay=1e-5, dropout_rate=0.5, input_dim=15):
        super().__init__()

        # Assemble the MLP classifier: three identical hidden blocks, then the
        # single-logit output layer.
        layers = []
        in_features = input_dim
        for _ in range(3):
            layers.extend([
                nn.Linear(in_features, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate),
            ])
            in_features = hidden_dim
        layers.append(nn.Linear(hidden_dim, 1))
        self.mlp = nn.Sequential(*layers)

        # Squashes the output logit into a probability
        self.sigmoid = nn.Sigmoid()

        # Optimiser settings, read by configure_optimizers
        self.lr = lr
        self.weight_decay = weight_decay

    def forward(self, x):
        """Return the predicted probability for every row of ``x``."""
        return self.sigmoid(self.mlp(x))

    def _bce_loss(self, batch):
        """Binary cross-entropy between predicted probabilities and labels of one batch."""
        features, targets = batch
        probabilities = self(features)
        return F.binary_cross_entropy(probabilities.squeeze(), targets.squeeze())

    def training_step(self, batch, batch_idx):
        """Compute, log (as 'train_loss') and return the loss of a training batch."""
        loss = self._bce_loss(batch)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute, log (as 'val_loss') and return the loss of a validation batch."""
        loss = self._bce_loss(batch)
        self.log('val_loss', loss)
        return loss

    def configure_optimizers(self):
        """Adam over all parameters; weight decay supplies the L2 regularisation."""
        return torch.optim.Adam(
            self.parameters(),
            lr=self.lr,
            weight_decay=self.weight_decay,
        )

In [None]:
def objective(trial):
    '''
    Optuna objective: train one IDRFeatureNN with sampled hyperparameters and
    return its validation loss.

    Reads the module-level `train_data` and `val_data` datasets.

    Args:
        trial (optuna.trial.Trial): trial used to sample hyperparameters and
            to report intermediate validation losses for pruning

    Returns:
        float: the last 'val_loss' logged by the model (to be minimised)

    Raises:
        optuna.TrialPruned: raised by the pruning callback when the study's
            pruner decides to stop this trial early
    '''

    # Tune learning rates and regularization hyperparameters
    params = {"dropout_rate": trial.suggest_float("dropout_rate", 0.1, 0.5, log=True),
              "lr": trial.suggest_float("lr", 1e-3, 0.1, log=True),
              "weight_decay": trial.suggest_float("weight_decay", 1e-5, 1e-2, log=True)}
    # Tune max epochs
    max_epochs = trial.suggest_int("max_epochs", 50, 200)
    # Tune batch size (as a power of two: 8, 16 or 32)
    log_2_batch_size = trial.suggest_int("log_2_batch_size", 3, 5)
    batch_size = 2**log_2_batch_size

    # Only the training data needs shuffling; the validation loss does not
    # depend on sample order.
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

    ffn = IDRFeatureNN(**params)
    logger = CSVLogger("logs", name="model")

    trainer = L.Trainer(
        logger=logger,
        enable_checkpointing=True,
        callbacks=[
            # One checkpoint directory per trial, so trials do not write into
            # each other's checkpoints.
            L.pytorch.callbacks.ModelCheckpoint(dirpath=f"checkpoints/trial_{trial.number}"),
            # Reports 'val_loss' to Optuna after each validation run; without
            # this callback the study's pruner never receives intermediate
            # values and cannot prune anything.
            PyTorchLightningPruningCallback(trial, monitor="val_loss"),
        ],
        enable_progress_bar=False,
        accelerator="auto",  # uses the GPU when present, otherwise falls back to CPU
        devices=1,
        max_epochs=max_epochs, # number of epochs to train for
    )

    hyperparameters = dict(max_epochs=max_epochs, batch_size=batch_size, **params)
    trainer.logger.log_hyperparams(hyperparameters)
    trainer.fit(ffn, train_loader, val_loader)

    return trainer.callback_metrics["val_loss"].item()

# Hyperparameter search: minimise the value returned by `objective`.
# The Hyperband pruner is wrapped in a PatientPruner with patience=3.
hyperband = optuna.pruners.HyperbandPruner()
pruner = optuna.pruners.PatientPruner(hyperband, patience=3)
study = optuna.create_study(pruner=pruner, direction="minimize")
study.optimize(objective, n_trials=100)  # the number of trials can be adjusted

# Report the best trial: its objective value, then each sampled hyperparameter
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print("  Params: ")
for name, setting in trial.params.items():
    print(f"    {name}: {setting}")

  pruner = optuna.pruners.PatientPruner(optuna.pruners.HyperbandPruner(), patience=3)
[I 2024-05-19 22:08:03,993] A new study created in memory with name: no-name-b30c4993-9873-43bd-886f-26a2ce7a2c8d
INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:653: Checkpoint directory /content/drive/My Drive/Colab Notebooks/ML_for_IDRs/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RA

Best trial:
  Value: 0.6396729946136475
  Params: 
    dropout_rate: 0.41830168853161537
    lr: 0.0015283655961730448
    weight_decay: 9.455508469596013e-05
    max_epochs: 185
    log_2_batch_size: 5


In [None]:
def train(model, dataloader, optimizer, device):

    '''
    Run one optimisation pass over every batch of the training data.

    Args:
        model (torch.nn.Module): classifier whose output is a probability per sample
        dataloader (torch.utils.data.Dataloader): yields (features, labels) batches
        optimizer (torch.optim.Optimizer): optimizer that updates the model parameters
        device (str): device the batches are moved to

    Returns:
        float: binary cross-entropy averaged over all the batches

    '''

    model.train()  # enables training-mode layers such as dropout
    losses = []

    for features, targets in dataloader:
        features = features.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this batch
        probabilities = model(features)
        loss = nn.functional.binary_cross_entropy(probabilities.squeeze(), targets.squeeze())
        losses.append(loss.item())

        # Clear stale gradients, backpropagate, update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return np.mean(losses)


def validate(model, dataloader, device):

    '''
    Measure the loss over every batch of the validation data without updating the model.

    Args:
        model (torch.nn.Module): classifier whose output is a probability per sample
        dataloader (torch.utils.data.Dataloader): yields (features, labels) batches
        device (str): device the batches are moved to

    Returns:
        float: binary cross-entropy averaged over all the batches

    '''

    model.eval()  # disables training-mode layers such as dropout
    losses = []

    with torch.no_grad():  # no gradients are needed for validation
        for features, targets in dataloader:
            features = features.to(device)
            targets = targets.to(device)

            probabilities = model(features)
            loss = nn.functional.binary_cross_entropy(probabilities.squeeze(), targets.squeeze())
            losses.append(loss.item())

    return np.mean(losses)

def evaluate(model, dataloader, device):

    '''
    Collect the true labels and predicted probabilities for every sample in a
    dataloader (for metrics such as ROC AUC or a confusion matrix).

    Args:
        model (torch.nn.Module): classifier whose output is a probability per sample
        dataloader (torch.utils.data.Dataloader): yields (features, labels) batches
        device (str): device the batches are moved to

    Returns:
        (list, list): true labels, predicted probabilities - two flat Python
            lists of floats in dataloader order
    '''

    pred_prob = []
    labels = []
    with torch.no_grad():
        model.eval()  # disables training-mode layers such as dropout
        for batch in dataloader:
            X, label = batch

            X = X.to(device)
            label = label.to(device)

            pred = model(X)
            # Flatten so predictions line up one-to-one with the labels
            pred_prob.extend(pred.cpu().numpy().ravel().tolist())
            labels.extend(label.cpu().numpy().ravel().tolist())

    return labels, pred_prob

In [None]:
# Train IDRFeatureNN with a plain PyTorch loop, then report test-set metrics.

# The loaders must exist at notebook scope: the ones built inside `objective`
# are local to that function.
# NOTE(review): 32 matches the best Optuna trial's batch size (2**5) - confirm.
BATCH_SIZE = 32
N_EPOCHS = 500
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

for hidden_layers, prob_dropout, lr in itertools.product([16], [0.2], [0.01]):

    # `hidden_layers` holds the layer width; the constructor keyword for the
    # dropout probability is `dropout_rate`.
    model = IDRFeatureNN(hidden_dim=hidden_layers, dropout_rate=prob_dropout).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    # Halve the learning rate when the validation loss stops improving
    # (the deprecated `verbose` argument is no longer passed).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.5)

    val_loss_curve = []
    train_loss_curve = []

    # Use tqdm for progress bar
    tqdm_progress = tqdm(range(N_EPOCHS), desc="Progress")

    for epoch in tqdm_progress:
        # Train for one epoch, then measure the validation loss
        epoch_loss = train(model, train_loader, optimizer, device=device)
        val_loss = validate(model, val_loader, device=device)

        # Record both loss curves
        train_loss_curve.append(epoch_loss)
        val_loss_curve.append(val_loss)

        # The scheduler tracks the validation loss to decide when to reduce the rate
        scheduler.step(val_loss)
        tqdm_progress.set_postfix(train_loss=f'{epoch_loss:.3f}', val_loss=f'{val_loss:.3f}', refresh=True)

    print ('-------------------Performance metrics -----------')
    print('Hidden layer width: ', str(hidden_layers))
    print('Dropout probability: ', str(prob_dropout))
    print('Learning Rate:', str(lr))
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(val_loss_curve, label='Validation Loss')
    ax.plot(train_loss_curve, label='Training Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend(loc='upper right')
    fig.tight_layout()
    plt.show()

    # Held-out test performance
    labels, pred_prob = evaluate(model, test_loader, device)
    test_score = roc_auc_score(labels, pred_prob)

    print("AUC on the test dataset is {}.".format(test_score) )

    # Threshold the probabilities at 0.5 to obtain hard class predictions
    pred_label = (np.asarray(pred_prob) >= 0.5).astype(float)
    conf_matrix = confusion_matrix(labels, pred_label)

    # Print confusion matrix
    print("Confusion Matrix:")
    print(conf_matrix)

    print ('---------------------------------------------')

TypeError: IDRFeatureNN.__init__() got an unexpected keyword argument 'dropout_prob'

In [None]:
# ! pip install shap
import shap

# SHAP feature attribution for the trained classifier (`model`) on the test set.

# Feature columns, in the order they appear in X_train / X_test
feature_names = ['log2 IDR Count', 'log10 Total IDR Length', 'Fraction Positive',
                 'Fraction Negative', 'Fraction Expanding', 'FCR', 'NCPR', 'Kappa',
                 'Omega', 'Isoelectric Point', 'Uversky Hydropathy', 'PPII Propensity',
                 'Delta', 'Delta Max', 'SCD']

# Get features
train_features_df = pd.DataFrame(X_train, columns=feature_names) # pandas dataframe
test_features_df = pd.DataFrame(X_test, columns=feature_names) # pandas dataframe


def f(x):
    '''
    Numpy-in / numpy-out wrapper around `model` for shap.KernelExplainer.

    Args:
        x (np.ndarray): feature rows, one sample per row

    Returns:
        np.ndarray: 1-D array with one predicted probability per row
    '''
    model.eval()  # dropout must be off, or the explained function is random
    model_device = next(model.parameters()).device
    with torch.no_grad():
        batch = torch.as_tensor(x, dtype=torch.float32, device=model_device)
        # Back to a flat CPU numpy array: the explainer cannot consume (CUDA)
        # tensors, and a 1-D output makes shap_values a single
        # (n_samples, n_features) array.
        return model(batch).cpu().numpy().ravel()


# Convert my pandas dataframe to numpy
data = test_features_df.to_numpy(dtype=np.float32)

# The explainer doesn't like tensors, hence the f function.
# NOTE(review): the whole test set is used as background data, which is slow;
# consider a smaller background sample.
explainer = shap.KernelExplainer(f, data)

# Get the shap values from my test data
shap_values = explainer.shap_values(data)

# Enable the plots in jupyter
shap.initjs()

# Summary plot of the per-feature attributions
shap.summary_plot(shap_values, data, feature_names)

In [None]:

# Grid over hidden width and learning rate: train a model for 500 epochs per
# combination, plot its loss curves, then report test AUC and confusion matrix.
# NOTE(review): `IDRFeatureNN_fully_connected` is not defined in the visible
# part of this notebook - confirm where it comes from.
# NOTE(review): `train_loader`, `val_loader`, `test_loader` and `prob_dropout`
# are not assigned in this cell; it relies on state left behind by earlier
# cells - verify they exist on a fresh Restart & Run All.
for hidden_layers in [8, 16]:
  for lr in [0.001, 0.01]:

    device = 'cuda:0'
    model = IDRFeatureNN_fully_connected(hidden_dim=hidden_layers).to(device)

    optimizer = torch.optim.Adam(list(model.parameters()), lr=lr)
    # Halves the learning rate ('min' mode, factor=0.5) based on the value
    # passed to scheduler.step below
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', verbose=True, factor=0.5)

    val_loss_curve = []
    train_loss_curve = []

    # Use tqdm for progress bar
    tqdm_progress =  tqdm(range(500), desc="Progress")

    for epoch in tqdm_progress:
        # Train the model for one epoch on the training data
        epoch_loss = train(model, train_loader, optimizer,  device=device)

        # Measure the loss on the validation data
        val_loss = validate(model, val_loader, device=device)

        # Record training and validation loss for the loss curves
        train_loss_curve.append(epoch_loss)
        val_loss_curve.append(val_loss)

        # The learning rate scheduler is stepped with the validation loss
        scheduler.step(val_loss)
        tqdm_progress.set_postfix(train_loss=f'{epoch_loss:.3f}', val_loss=f'{val_loss:.3f}', refresh=True)

    print ('-------------------Performance metrics -----------')
    # NOTE(review): the value printed here is the `hidden_dim` passed to the
    # model, although the label says "Number of hidden layers".
    print('Number of hidden layers: ', str(hidden_layers))
    print('Dropout probability: ', str(prob_dropout))
    print('Learning Rate:', str(lr))
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.plot(val_loss_curve, label='Validation Loss')
    ax.plot(train_loss_curve, label='Training Loss')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend(loc='upper right')
    fig.tight_layout()
    plt.show()

    # Test-set evaluation: ROC AUC from the predicted probabilities
    labels, pred_prob = evaluate(model, test_loader, device)
    test_score = roc_auc_score(labels, pred_prob)
    # End of test-set evaluation

    print("AUC on the test dataset is {}.".format(test_score) )

    # Threshold the probabilities at 0.5 to obtain hard class predictions
    pred_label = np.zeros(len(pred_prob))
    for i in range(len(pred_prob)):
      if pred_prob[i] >= 0.5:
        pred_label[i] = 1
      else:
        pred_label[i] = 0
    conf_matrix = confusion_matrix(labels, pred_label)

    # Print confusion matrix
    print("Confusion Matrix:")
    print(conf_matrix)

    print ('---------------------------------------------')