## Neural Network as a baseline predictor

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.nn import Linear
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, matthews_corrcoef, precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedGroupKFold, ParameterGrid
from torch_geometric.data import Data

In [None]:
# Directory that receives every figure saved by this notebook.
# exist_ok=True makes creation idempotent and avoids the check-then-create race.
visualizations = "../visualizations"
os.makedirs(visualizations, exist_ok=True)

### Loading data 

Column layout of ``baseline_features.csv`` (in order): ``drugcomb_sorted``, ``drugA``, ``drugB``, *m* edge features, *n* node features for drug A, *n* node features for drug B, ``label``.

In [None]:
# Load the baseline feature table: three identifier columns first, then the
# feature columns, with `label` as the last column (see the layout described above).
df = pd.read_csv("../data/baseline_features.csv")

In [None]:
# Quick sanity check: total number of columns (identifiers + features + label).
print(df.shape[1])

In [None]:
# Skip the three identifier columns at the front and the label column at the end;
# everything in between is an input feature.
features = df.iloc[:, 3:-1]
columns = features.columns.tolist()
print(columns)

# The linear layers expect float32 inputs; labels keep the dtype pandas inferred.
X = torch.tensor(features.values).to(torch.float)
y = torch.tensor(df["label"].values)

### Stratified 10-fold cross-validation with train, validation and test sets<a class="anchor" id="crossvalidation"></a>

In [None]:
def kfold_new(train_value, val_value):
    """Build group-aware, stratified train/validation/test splits.

    An outer ``StratifiedGroupKFold`` with ``train_value`` splits separates one
    test fold from the remaining rows.  For every outer fold, the *first* split
    of an inner ``StratifiedGroupKFold`` with ``val_value`` splits carves a
    validation set out of those remaining rows.  Grouping uses the
    ``drugcomb_sorted`` column.

    Nothing is returned; the results are published through module-level
    globals, each holding one entry per outer fold:

    * ``train``, ``val``: lists of positional row indices into ``df``
    * ``test``, ``train_val``: index arrays as produced by the outer splitter
    * ``train_labels``, ``val_labels``, ``test_labels``: matching ``label`` arrays

    Args:
        train_value: Number of outer splits (one of them becomes the test set).
        val_value: Number of inner splits applied to the non-test rows (one of
            them becomes the validation set).
    """
    global train, val, test, train_val, train_labels, val_labels, test_labels
    train, val, test, train_val = [], [], [], []
    train_labels, val_labels, test_labels = [], [], []

    # Built once: the group array is identical for every fold.
    groups = np.array(df["drugcomb_sorted"].to_list())
    labels = df["label"]
    outer_cv = StratifiedGroupKFold(n_splits=train_value, shuffle=False)

    # First split: test fold vs. everything else.
    for fold, (train_val_idx, test_idx) in enumerate(outer_cv.split(X, y, groups)):
        print(f"Fold {fold+1}:")
        print(f" Train and Validation: index={train_val_idx[:20]}")

        train_val.append(train_val_idx)
        test.append(test_idx)
        test_labels.append(labels.iloc[test_idx].values)

        # Second split: only the first inner fold is used as the validation set.
        inner_cv = StratifiedGroupKFold(n_splits=val_value, shuffle=False)
        inner_train_pos, inner_val_pos = next(
            inner_cv.split(
                df.iloc[train_val_idx],
                labels.iloc[train_val_idx],
                groups[train_val_idx],
            )
        )

        # The inner splitter returns positions *within* the train/val subset;
        # indexing train_val_idx with them maps back to row positions in df.
        train_idx = train_val_idx[inner_train_pos].tolist()
        val_idx = train_val_idx[inner_val_pos].tolist()

        print(f"     Train: index={train_idx[:20]}, length={len(train_idx)}")
        print(f"     Validation: index={val_idx[:20]}, length={len(val_idx)}")

        train.append(train_idx)
        train_labels.append(labels.iloc[train_idx].values)
        val.append(val_idx)
        val_labels.append(labels.iloc[val_idx].values)

        print(f" Test:  index={test_idx[:20]}, length={len(test_idx)}")
        print("*"*100)

# 10 outer splits and 9 inner splits give roughly 80% train, 10% val, 10% test per fold
kfold_new(10,9)

In [None]:
def k_fold(n):  # old version with only train and test sets (maybe delete this later?)
    """Print group-aware stratified train/test splits and collect their labels.

    Args:
        n: Number of folds passed to ``StratifiedGroupKFold``.

    Returns:
        A ``(train_arr, test_arr)`` tuple; each is a list with one array of
        ``label`` values per fold.
    """
    groups = df["drugcomb_sorted"].to_list()   # avoid data leakage
    splitter = StratifiedGroupKFold(n_splits=n, shuffle=False)

    train_arr, test_arr = [], []
    for fold, (train_index, test_index) in enumerate(splitter.split(X, y, groups)):
        print(f"Fold {fold+1}:")
        print(f" Train: index={train_index}")
        print(f" Test:  index={test_index}")
        train_arr.append(df.iloc[train_index]["label"].values)
        test_arr.append(df.iloc[test_index]["label"].values)

    return train_arr, test_arr

#k_fold(5)

### Model architecture and Training 

Both the hyperparameter tuning and training code are similar to the code in ``GNN.ipynb``. 

In [None]:
class MLP(torch.nn.Module):
    """Fully connected classifier with two ReLU hidden layers.

    ``forward`` returns raw class scores (logits).  ``torch.nn.CrossEntropyLoss``
    applies log-softmax internally, so passing it already-normalised
    probabilities would normalise twice and flatten the gradients.  Taking
    ``argmax`` over logits yields the same class as over probabilities; use
    ``predict_proba`` when actual probabilities are needed.

    Args:
        dim_in: Number of input features.
        dim_h1: Width of the first hidden layer.
        dim_h2: Width of the second hidden layer.
        dim_out: Number of classes (default 2).
    """

    def __init__(self, dim_in, dim_h1, dim_h2, dim_out=2):
        super().__init__()
        self.linear1 = Linear(dim_in, dim_h1)
        self.linear2 = Linear(dim_h1, dim_h2)
        self.linear3 = Linear(dim_h2, dim_out)

    def forward(self, x):
        """Return unnormalised class scores of shape ``(rows, dim_out)``."""
        x = torch.relu(self.linear1(x))
        x = torch.relu(self.linear2(x))
        return self.linear3(x)

    def predict_proba(self, x):
        """Return the per-row class probabilities (softmax over the logits)."""
        return torch.softmax(self.forward(x), dim=1)

In [None]:
def fit(model, lr, weight_decay, weighted_loss, epochs=30):
    """Train ``model`` on each cross-validation fold in turn and log metrics.

    For every fold in the module-level ``train``/``val`` index lists the model
    is trained full-batch for ``epochs + 1`` optimisation steps with Adam and a
    class-weighted cross-entropy loss.  Every 10th epoch, and the final one,
    prints loss, accuracy and MCC for the training and validation rows.  After
    the final epoch of a fold the results are stored and confusion matrices
    are saved as SVG files under ``visualizations``.

    Side effects: resets and fills the globals ``train_prediction``,
    ``val_pred``, ``train_loss``, ``val_loss``, ``train_acc``, ``val_acc``
    (one entry per fold) and sets ``mcc_final``.

    NOTE(review): the same model and optimizer are carried over from fold to
    fold, so later folds start from weights that may already have seen their
    validation rows - confirm this is intended.
    NOTE(review): figures are never closed here; many consecutive calls
    accumulate open matplotlib figures.

    Args:
        model: Network to train; called as ``model(features)``.
        lr: Adam learning rate.
        weight_decay: Adam weight decay.
        weighted_loss: Loss weight of class 1 (class 0 has weight 1).
        epochs: Index of the last epoch; the loop runs ``epochs + 1`` steps.

    Returns:
        Validation MCC of the last fold after the final epoch, or ``None`` if
        there are no folds.
    """
    global train_prediction, val_pred, train_loss, val_loss, train_acc, val_acc, mcc_final
    train_prediction, val_pred = [], []
    train_loss, val_loss, train_acc, val_acc = [], [], [], []
    mcc_final = None  # never report a stale value from an earlier call

    # weighted loss for imbalanced class: class 1 is the minority class so it's higher weighted
    weights = torch.tensor([1, weighted_loss]).to(torch.float)
    loss_fn = torch.nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    last_fold = len(train) - 1
    log_every = 10

    for i, (train_rows, val_rows) in enumerate(zip(train, val)):
        print(f"********** Fold {i+1} train data: **********")
        train_features = torch.tensor(df.iloc[train_rows][columns].values).to(torch.float)
        train_labels = torch.tensor(df.iloc[train_rows]["label"].values).to(torch.long)

        val_features = torch.tensor(df.iloc[val_rows][columns].values).to(torch.float)
        val_labels = torch.tensor(df.iloc[val_rows]["label"].values).to(torch.long)

        for epoch in range(epochs + 1):
            optimizer.zero_grad()
            output = model(train_features)  # the features are the node and edge features
            loss = loss_fn(output, train_labels)  # loss is based on edge labels
            loss.backward()  # backpropagation
            optimizer.step()  # parameter update

            # Tied to the `epochs` argument so results are recorded for any epoch count.
            is_last_epoch = epoch == epochs
            if epoch % log_every != 0 and not is_last_epoch:
                continue

            train_prediction_ = output.argmax(1)  # maximum index in each row
            accuracy = torch.sum(train_prediction_ == train_labels) / len(train_labels)
            mcc_train = matthews_corrcoef(train_labels, train_prediction_)

            print(f"Epoch: {epoch}")
            print(" Train data: ")
            print(f"   Loss: {loss}")
            print(f"   Accuracy: {accuracy}")
            print(f"   MCC: {mcc_train}")

            # Validation is inference only; no autograd graph is needed.
            with torch.no_grad():
                val_out = model(val_features)
                loss_ = loss_fn(val_out, val_labels)
            val_pred_ = val_out.argmax(1)
            val_accuracy = torch.sum(val_pred_ == val_labels) / len(val_labels)
            val_mcc = matthews_corrcoef(val_labels, val_pred_)

            print("Validation data: ")
            print(f"   Loss: {loss_}")
            print(f"   Accuracy: {val_accuracy}")
            print(f"   MCC: {val_mcc}")

            if not is_last_epoch:
                continue

            # append train data results
            train_loss.append(loss.detach().numpy())
            train_acc.append(accuracy.detach().numpy())
            train_prediction.append(train_prediction_)

            # append validation data results
            val_loss.append(loss_.detach().numpy())
            val_acc.append(val_accuracy.detach().numpy())
            val_pred.append(val_pred_)

            # only show the plots after hyperparameter tuning!
            cm = confusion_matrix(train_labels, train_prediction_)
            cm2 = confusion_matrix(val_labels, val_pred_)
            ConfusionMatrixDisplay(cm).plot()
            plt.savefig(f"{visualizations}/NN Fold {i+1} Train.svg")
            ConfusionMatrixDisplay(cm2).plot()
            plt.savefig(f"{visualizations}/NN Fold {i+1} Validation.svg")

            # use MCC score to evaluate the model:
            # assume that the last epoch and last fold has the best score
            if i == last_fold:
                mcc_final = val_mcc
                print(f"    MCC Final: {mcc_final}")

    return mcc_final

In [None]:
def test_model(model):
    """Run ``model`` on every test fold and print accuracy and MCC per fold.

    Reads the module-level ``test`` index collections, ``df`` and ``columns``.
    The predicted class indices of each fold are stored, in fold order, in the
    global list ``pred_test``.

    Args:
        model: Trained network; called as ``model(features)``.
    """
    global pred_test
    pred_test = []

    for i, test_rows in enumerate(test):
        print(f"********** Fold {i+1} test data: **********")
        fold_rows = df.iloc[test_rows]
        fold_features = torch.tensor(fold_rows[columns].values).to(torch.float)
        # Local name chosen so it does not shadow the module-level `test_labels`.
        fold_labels = torch.tensor(fold_rows["label"].values).to(torch.long)

        # Pure inference: skip building the autograd graph.
        with torch.no_grad():
            output = model(fold_features)

        predicted = output.argmax(1)
        pred_test.append(predicted)
        accuracy = torch.sum(predicted == fold_labels) / len(fold_labels)
        mcc_test = matthews_corrcoef(fold_labels, predicted)

        print("Test data:")
        print(f"   Accuracy: {accuracy}")
        print(f"   MCC: {mcc_test}")

### Hyperparameter tuning

In [None]:
# Search space for the grid search below: 3 * 2 * 2 * 6 = 72 combinations.
# `hidden_channels` is the width of the second hidden layer (the first one is
# twice as wide); `weighted_loss` is the loss weight given to class 1.
param_grid = dict(
    hidden_channels=[2, 4, 8],
    learning_rate=[0.001, 0.0001],
    weight_decay=[5e-4, 1e-4],
    weighted_loss=[100, 120, 140, 160, 180, 200],
)

grid = ParameterGrid(param_grid)

In [None]:
# finding best hyperparameters (selected by validation MCC, not by accuracy)
best_val_acc = -np.inf  # historical name; holds the best MCC seen so far
best_params = None

for params in grid:
    print(f"Hyperparameters: weighted_loss={params['weighted_loss']}, hidden_channels={params['hidden_channels']}, learning_rate={params['learning_rate']}, weight_decay={params['weight_decay']}")
    hidden = params['hidden_channels']
    model = MLP(len(columns), hidden * 2, hidden, 2)

    # Use fit's return value rather than reading the global it happens to set.
    score = fit(model, params['learning_rate'], params['weight_decay'], params['weighted_loss'])

    # find best hyperparameters using the MCC score
    if score is not None and score > best_val_acc:
        best_val_acc = score
        best_params = params

print(f"Best Hyperparameters: {best_params}, Best Validation MCC: {best_val_acc}")

In [None]:
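# Recorded output of the grid search above (the reported score is the validation MCC of the last fold, not an accuracy):
# Best Hyperparameters: {'hidden_channels': 8, 'learning_rate': 0.001, 'weight_decay': 0.0005, 'weighted_loss': 180}, 
# Best Validation Accuracy: 0.03052772429762823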

In [None]:
# Final model: hidden widths 16 and 8, i.e. the hidden_channels=8 setting of the
# grid search (first hidden layer is twice as wide as the second).
nn_model = MLP(len(columns), 16, 8, 2)

In [None]:
# Train with lr=0.001, weight_decay=0.0005 and a class-1 loss weight of 160.
# NOTE(review): the best hyperparameters recorded above list weighted_loss=180,
# not 160 - confirm which value is intended.
fit(nn_model, 0.001, 0.0005, 160)

In [None]:
# Evaluate on every test fold; predictions end up in the global `pred_test`.
test_model(nn_model)

### Evaluation 

In [None]:
def cm_test(n):
    """Print the MCC and save a confusion-matrix image for the first ``n`` test folds.

    Uses the module-level ``test_labels`` and ``pred_test`` lists; one PNG per
    fold is written into the ``visualizations`` directory.
    """
    for fold in range(n):
        truth, predicted = test_labels[fold], pred_test[fold]
        matrix = confusion_matrix(truth, predicted)
        print(matthews_corrcoef(truth, predicted))
        ConfusionMatrixDisplay(matrix).plot()
        # savefig writes the current figure, i.e. the one just created by plot()
        plt.savefig(f"{visualizations}/Confusion matrix NN - fold {fold+1} Test.png")
# One confusion matrix per outer fold (kfold_new was called with 10 outer splits).
cm_test(10)

In [None]:
def evaluation(test_arr, pred_test):
    """Compute MCC, precision, recall and accuracy for every fold.

    Args:
        test_arr: Per-fold sequences of true labels.
        pred_test: Per-fold sequences of predicted labels, in the same order.

    Returns:
        ``[mcc, precision, recall, accuracy]`` where each entry is a list with
        one score per fold.  The same list is also stored in the global
        ``data``.

    Raises:
        ValueError: If the two arguments do not contain the same number of folds.
    """
    global data
    if len(test_arr) != len(pred_test):
        raise ValueError(
            f"fold count mismatch: {len(test_arr)} label sets vs {len(pred_test)} prediction sets"
        )

    mcc_scores, precision_scores, recall_scores, accuracy_scores = [], [], [], []
    # Iterate over however many folds were passed instead of assuming exactly 10.
    for truth, predicted in zip(test_arr, pred_test):
        mcc_scores.append(matthews_corrcoef(truth, predicted))
        precision_scores.append(precision_score(truth, predicted))
        recall_scores.append(recall_score(truth, predicted))
        accuracy_scores.append(accuracy_score(truth, predicted))

    data = [mcc_scores, precision_scores, recall_scores, accuracy_scores]
    return data
    
# Score the test predictions; swap in one of the commented calls to score the
# train or validation predictions instead. The result is kept in the global `data`.
#evaluation(train_labels, train_prediction)
#evaluation(val_labels, val_pred)
evaluation(test_labels, pred_test)

In [None]:
# Column headers for the exported table, in the same order as the metric lists
# returned by `evaluation`.
eval_columns = ["MCC", "Precision", "Recall", "Accuracy"]

def export_results(model, data, columns):
    """Write per-fold metrics to ``../evaluation/<model>.csv`` and return them.

    Args:
        model: File stem of the CSV to write, e.g. ``"NN_Test"``.
        data: Sequence of per-metric value sequences; ``data[i]`` becomes the
            column named ``columns[i]``.
        columns: Column names; must provide at least ``len(data)`` entries.

    Returns:
        The exported ``DataFrame``; its index starts at 1 and is written to the
        CSV under the header ``ID``.

    Raises:
        IndexError: If ``columns`` has fewer entries than ``data``.
    """
    out_dir = "../evaluation"
    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(out_dir, exist_ok=True)

    results = pd.DataFrame()
    for position, values in enumerate(data):
        results[columns[position]] = pd.Series(values)

    results.index += 1  # shift the index so IDs start at 1 instead of 0
    results.to_csv(f"{out_dir}/{model}.csv", index_label="ID")
    return results

# Export whatever metrics are currently held in the global `data` (set by the
# most recent `evaluation` call); pick the file name that matches that call.
#export_results("NN_Train", data, eval_columns) 
#export_results("NN_Validation", data, eval_columns) 
export_results("NN_Test", data, eval_columns) 