## Random Forest and XGBoost as a baseline predictor

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure  # for the shap plots
import os
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, matthews_corrcoef, accuracy_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedGroupKFold, GridSearchCV
import torch
import xgboost as xgb
import shap   # explainability

In [None]:
df = pd.read_csv("../data/baseline_features.csv")

In [None]:
# convert features and labels into torch tensors
features = df.iloc[:, 3:len(df.columns)-1] 
X = torch.tensor(features.values)
X = X.to(torch.float)  
columns = list(df.columns.values)[3:len(df.columns)-1]
print(columns)
y = torch.tensor(df["label"].values)

### Stratified K-Fold sampling 

In [None]:
# train, validation and test sets

def kfold_new(train_value, val_value):
    global train, val, test, train_val, train_labels, val_labels, test_labels
    train, val, test, train_val = [], [], [], []
    train_labels, val_labels, test_labels = [], [], []
    
    kf = StratifiedGroupKFold(n_splits=train_value, shuffle=False)
    groups = df["drugcomb_sorted"].to_list() 
    
    # train/val: 90%, test: 10% -> first split
    for i, (train_val_idx, test_idx) in enumerate(kf.split(X, y, groups)): 
        print(f"Fold {i+1}:")
        print(f" Train and Validation: index={train_val_idx[:20]}")  # 80%
        
        train_val_groups = np.array(groups)[train_val_idx.astype(int)]
        train_val_y = df.iloc[train_val_idx]["label"]

        # add the indices and labels
        train_val.append(train_val_idx)
        test.append(test_idx)
        test_labels.append(df.iloc[test_idx]["label"].values)
        
        # train: 80%, val: 10% -> second split
        inner_skf = StratifiedGroupKFold(n_splits=val_value, shuffle=False) 
        train_idx, val_idx = next(inner_skf.split(df.iloc[train_val_idx], train_val_y, train_val_groups))    

        # combine train and validation indies
        arr1, arr2 = train_idx, val_idx
        arr = [*arr1, *arr2]
        arr.sort()

        # create dictionary for the mapping
        list1 = arr  # new index
        list2 = train_val_idx  # old index
        d1 = {}
        for i in range(len(list1)):  # everything: train + val
            d1[list1[i]] = list2[i]

        # convert the new to the original indices 
        old_idx = []   
        old_idx_ = []
        for i in range(len(train_idx)):
            old_idx.append(d1.get(train_idx[i])) 

        for i in range(len(val_idx)):
            old_idx_.append(d1.get(val_idx[i])) 

        # check whether the 3 sets have overlapping elements
        """print("Check for any overlap between train and validation")
        print(list(set(old_idx).intersection(old_idx_)))

        print("Check for any overlap between train and test")
        print(list(set(old_idx).intersection(test_idx)))

        print("Check for any overlap between validation and test")
        print(list(set(old_idx_).intersection(test_idx)))"""
 
        print(f"     Train: index={old_idx[:20]}, length={len(old_idx)}") 
        print(f"     Validation: index={old_idx_[:20]}, length={len(old_idx_)}") 
        
        train.append(old_idx)
        train_labels.append(df.iloc[old_idx]["label"].values)  
        val.append(old_idx_)
        val_labels.append(df.iloc[old_idx_]["label"].values) 

        print(f" Test:  index={test_idx[:20]}, length={len(test_idx)}")  # 10% of the total
        print("*"*100)

# 80% train, 10% val, 10% test (final version)
kfold_new(10,9)

# 70% train, 10% validation, 20% test 
#kfold_new(5,8)

# 60% train, 20% val, 20% test
# kfold_new(5,4)

In [None]:
# train and test sets (old version without validation set)

"""
k = 5  
k_fold = StratifiedGroupKFold(n_splits=k, shuffle=False) 
groups = df["drugcomb_sorted"].to_list()   # avoid data leakage

train_arr = []
test_arr = []

for i, (train_index, test_index) in enumerate(k_fold.split(X, y, groups)):
    print(f"Fold {i+1}:")
    print(f" Train: index={train_index}")
    print(f" Test:  index={test_index}")
    train_features = df.iloc[train_index][columns]
    #print(train_features)
    train_arr.append(df.iloc[train_index]["label"].values)
    test_arr.append(df.iloc[test_index]["label"].values)
"""

## Random Forest

### Grid search and hyperparameter tuning

#### Random Forest

In [None]:
param_grid = [
    {'n_estimators': [50, 100, 150],    # Number of trees in the forest
    'max_depth': [10, 15, 20],       
    'min_samples_split': [2, 10],        # Minimum number of samples required to split an internal node
    'min_samples_leaf': [2, 5],          # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2', None]}
]

grid_search = GridSearchCV(random_forest, param_grid, cv=5, scoring="matthews_corrcoef", verbose=3, return_train_score=False)  # cv = cross validation sets
grid_search.fit(X, y)

In [None]:
grid_search.best_params_
"""
{'max_depth': 10,
 'max_features': 'log2',
 'min_samples_leaf': 5,
 'min_samples_split': 10,
 'n_estimators': 100}
"""

#### XGBoost

In [None]:
xgboost = xgb.XGBClassifier(random_state=0)

In [None]:
# grid search / hyperparameter tuning
param_grid2 = [
    {'eta': [0.1,0.4,0.7],         
    'max_depth': [10,15,20],       
    'max_leaf_nodes': [8,16,32,64],    
    'gamma': [0.4,0.6,0.8],
    'n_estimators': [50,100,150],
    'lambda': [0,0.5,1]}
]

grid_search2 = GridSearchCV(xgboost, param_grid2, cv=5, scoring="matthews_corrcoef", verbose=3, return_train_score=False)  # cv = number of folds
grid_search2.fit(X, y)

In [None]:
grid_search.best_params_
"""
{'eta': 0.1,
 'gamma': 0.4,
 'lambda': 0.5,
 'max_depth': 20,
 'max_leaf_nodes': 8,
 'n_estimators': 150}
"""

### Training 

In [None]:
visualizations = "../visualizations"
if not os.path.exists(visualizations):
    os.makedirs(visualizations)

In [None]:
# train, validation and test data
train_pred = []
val_pred = []
test_pred = []

# initialized the classifier in each fold to avoid overfitting
for i in range(len(train)):
    print(f"********** Fold {i+1}: **********")    
    # choose either the random forest or xgboost model
    classifier = RandomForestClassifier(max_depth=15, min_samples_leaf=2, class_weight="balanced")
    #classifier = xgb.XGBClassifier(eta=0.7, gamma=0.4, reg_lambda=None, max_depth=30, max_leaf_nodes=None, n_estimators=150)
    
    train_features = torch.tensor(df.iloc[train[i]][columns].values)
    train_features = train_features.to(torch.float) 
    train_label = torch.tensor(df.iloc[train[i]]["label"].values)
    train_label = train_label.to(torch.long)

    # train data
    classifier.fit(train_features, train_label)
    pred = classifier.predict(train_features)
    train_pred.append(pred)
    print("Train accuracy:", accuracy_score(train_label, pred))
    print("MCC:", matthews_corrcoef(train_label, pred))
    cm1 = confusion_matrix(train_label, train_pred[i])
    ConfusionMatrixDisplay(cm1).plot() 

    # save the random forest or xgboost result
    #plt.savefig(f"{visualizations}/Confusion matrix RF train - fold {i+1}.png")  
    #plt.savefig(f"{visualizations}/Confusion matrix XGB train - fold {i+1}.png")   
    
    # validation data
    val_features = torch.tensor(df.iloc[val[i]][columns].values)
    val_features = val_features.to(torch.float) 
    val_label = torch.tensor(df.iloc[val[i]]["label"].values)
    val_label = val_label.to(torch.long)
    val_ = classifier.predict(val_features)
    val_pred.append(val_)
    print("Validation accuracy:", accuracy_score(val_label, val_pred[i]))
    print("MCC:", matthews_corrcoef(val_label, val_pred[i]))
    cm2 = confusion_matrix(val_label, val_pred[i])
    ConfusionMatrixDisplay(cm2).plot()  

    #plt.savefig(f"{visualizations}/Confusion matrix RF validation - fold {i+1}.png")  
    #plt.savefig(f"{visualizations}/Confusion matrix XGB validation - fold {i+1}.png")  
    
    # test data
    test_features = torch.tensor(df.iloc[test[i]][columns].values)
    test_features = test_features.to(torch.float)         
    test_label = torch.tensor(df.iloc[test[i]]["label"].values)
    test_label = test_label.to(torch.long)
    pred_ = classifier.predict(test_features)
    test_pred.append(pred_)
    print("Test accuracy:", accuracy_score(test_label, test_pred[i])) 
    print("MCC:", matthews_corrcoef(test_label, test_pred[i]))   
    cm3 = confusion_matrix(test_label, test_pred[i]) 
    ConfusionMatrixDisplay(cm3).plot()

    #plt.savefig(f"{visualizations}/Confusion matrix RF test - fold {i+1}.png")  
    #plt.savefig(f"{visualizations}/Confusion matrix XGB test - fold {i+1}.png")  

## Explainability

#### Feature importance

In [None]:
importances = classifier.feature_importances_
feature_imp_df = pd.DataFrame({'Feature': feature_columns, 'Gini Importance': importances}).sort_values('Gini Importance', ascending=False) 
print(feature_imp_df)

#feature_imp_df.to_csv("Feature_Importance_RF.csv", index=False)
#feature_imp_df.to_csv("Feature_Importance_XGB.csv", index=False)

#### Shapley values

In [None]:
shap.initjs() 

# choose the best fold: the best test folds below have the MCC score of 0.19
n = 9 # random forest: test fold 10
#n = 4   # xgboost: test fold 5

test_shap = df.iloc[test[n]][columns]  # fold 9 in the test set had the best MCC score
explainer = shap.TreeExplainer(classifier) 

In [None]:
shap_values = explainer.shap_values(test_shap)

#### Decision Plot

#### Array structure of the shap_values in the random forest classifier
When plotting, only use one column, e.g. shap_values[:,:,0] or shap_values[:,:,1]

print(shap_values[:,:,0])   # column 0: class 0, column 1: class 1, first index: sample, second index: feature, third index: column

In [None]:
print(explainer.expected_value)
print(shap_values)

In [None]:
fig = figure(figsize=(30, 30), dpi=80)   # specify output figure size
# random forest
# positive values
shap.decision_plot(explainer.expected_value[0], shap_values[n][:, 0], test_shap.iloc[n], ignore_warnings=True, show=False) 
fig.tight_layout()   
#fig.savefig("shap_decision_plot_RF.png")

# negative values
shap.decision_plot(explainer.expected_value[1], shap_values[n][:, 1], test_shap.iloc[n], ignore_warnings=True, show=False) 
fig.tight_layout()  
#fig.savefig("shap_decision_plot2_RF.png")

# xgboost
shap.decision_plot(explainer.expected_value, shap_values, test_shap.iloc[n], ignore_warnings=True, show=False)  # ignore_warnings to plot more samples
fig.tight_layout()   # avoid labels being cut off
#fig.savefig("shap_decision_plot_XGB.png")

#### Bar plot: absolute values of shapley values

In [None]:
# random forest
shap.summary_plot(shap_values[:,:,0], features.iloc[test[n]], plot_type="bar", show=False)
plt.savefig("shap_bar.png")

# xgboost
#shap.summary_plot(shap_values, features.iloc[test[n]], plot_type="bar", show=False)
#plt.savefig("shap_bar_XGB.png")

#### Summary plot

In [None]:
# random forest
shap.summary_plot(shap_values[:,:,0], features.iloc[test[n]], show=False) 
plt.savefig('shap.png')

# xgboost
#shap.summary_plot(shap_values, features.iloc[test[n]], show=False) 
#plt.savefig('shap_XGB.png')

#### Force plot

In [None]:
# explanations for one instance
# random forest
force_plot_rf1 = shap.force_plot(explainer.expected_value[0], shap_values[9][:, 0], test_shap.iloc[0], show=False) 
# shap.save_html("force_plot.html", force_plot_rf1)  # the plot doesn't get exported with plt.savefig and only produces an empty file
force_plot_rf2 = shap.force_plot(explainer.expected_value[1], shap_values[9][:, 1], test_shap.iloc[0], show=False) 
# shap.save_html("force_plot2.html", force_plot_rf2)

# xgboost
force_plot_xgb = shap.force_plot(explainer.expected_value, shap_values[n], test_shap.iloc[0], show=False) 
#shap.save_html("force_plot_XGB.html", force_plot_xgb)

### Evaluation
Create confusion matrices and evaluation scores

In [None]:
def evaluation(test_labels, test_pred):
    global data
    mcc_, precision_, recall_, accuracy_ = [], [], [], []
    for i in range(10):
        mcc = matthews_corrcoef(test_labels[i], test_pred[i])
        precision = precision_score(test_labels[i], test_pred[i])
        recall = recall_score(test_labels[i], test_pred[i])
        accuracy = accuracy_score(test_labels[i], test_pred[i])
        mcc_.append(mcc)
        precision_.append(precision)
        recall_.append(recall)
        accuracy_.append(accuracy)
        
        cm = confusion_matrix(test_labels[i], test_pred[i])
        ConfusionMatrixDisplay(cm).plot()
        
    data = [mcc_, precision_, recall_, accuracy_]
    return data

# export the train/validation/test evaluation
#evaluation(train_labels, train_pred)
#evaluation(val_labels, val_pred)
evaluation(test_labels, test_pred)

### Export results

In [None]:
eval_columns = ["MCC", "Precision", "Recall", "Accuracy"]

def export_results(model, data, eval_columns):
    evaluation = "../evaluation"
    if not os.path.exists(evaluation):
        os.makedirs(evaluation)
        
    df_results = pd.DataFrame()
    for i in range(len(data)):
        df_results[eval_columns[i]] = pd.Series(data[i])
    
    df_results.index += 1 
    df_results.to_csv(f"{evaluation}/{model}.csv", index_label="ID")
    return df_results

# random forest
export_results("Random_Forest_Train", data, eval_columns)
export_results("Random_Forest_Validation", data, eval_columns)
export_results("Random_Forest_Test", data, eval_columns)
    
# xgboost
#export_results("XGB_Train", data, eval_columns)
#export_results("XGB_Validation", data, eval_columns)
#export_results("XGB_Test", data, eval_columns) 