# In [None]:
### Train normal forest

#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import pandas as pd
import seaborn as sns
import argparse
import os
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, label_binarize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import multilabel_confusion_matrix
import pickle
import logging
import traceback

# --- Data loading -----------------------------------------------------------
# Input and output files all live under <current working dir>/Processing/CT/.
# Backslashes are normalised to forward slashes and the trailing separator is
# kept so file names can be appended to root_path directly.
working_dir = os.getcwd().replace("\\", "/")
root_path = f"{working_dir}/Processing/CT/"

# Feature table (one column per feature).
df = pd.read_excel(f"{root_path}In vitro_DRR_features_no_id.xlsx")
df.head()
param = df.columns

# Ground-truth table; only its 'Target' column is used as the label.
df_truth = pd.read_excel(f"{root_path}In vitro_DRR_eq_ground truth.xlsx")

# --- Design matrix and labels -----------------------------------------------
# X is an independent copy of the features; y stays a one-column DataFrame.
X = df.copy()
y = df_truth[['Target']]

# --- Preprocessing ----------------------------------------------------------
# Columns listed here are passed through untouched; every other column is
# standardised.
exceptions = [
    "Vertebrae_L1",
    "Vertebrae_L2",
    "Vertebrae_L3",
    "Vertebrae_L4",
    "Sexe",
]
cols = X.columns.difference(exceptions).values
numeric_features = X.columns.difference(exceptions).values

# NOTE: ColumnTransformer emits the scaled columns first and the passthrough
# columns after them, so the transformed column order differs from X's.
ct = ColumnTransformer(
    transformers=[('num', StandardScaler(), numeric_features)],
    remainder="passthrough",
)

# Random forest configured with hyperparameter values obtained from an earlier
# optimisation run; the seed is fixed so repeated fits are reproducible.
classifier = RandomForestClassifier(
    n_estimators=40,
    min_samples_split=2,
    min_samples_leaf=47,
    max_leaf_nodes=None,
    random_state=0,
)

# Metrics handed to cross_validate. The key order is kept as-is because it
# determines the column order of the exported results table.
# Confusion matrices are not part of this dict; they are computed per fold in
# the training loop instead.
# NOTE(review): the "roc_auc" scorer only handles binary targets - confirm that
# 'Target' is binary, otherwise switch to e.g. "roc_auc_ovr".
scoring = {
    'precision_score': "precision_weighted",
    "accuracy_score": make_scorer(accuracy_score),
    "balanced_accuracy_score": "balanced_accuracy",
    "recall_score": "recall_weighted",
    "f1_score": "f1_weighted",
    "AUC_score": "roc_auc",
}

from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import ConfusionMatrixDisplay

# 10-fold split, shuffled with a fixed seed so the folds are reproducible.
k_fold = KFold(10, shuffle=True, random_state=0)

# Fix the label set once from the full target column. KFold does not stratify,
# so a fold may lack a class; passing explicit labels keeps every confusion
# matrix the same shape and in the same class order across folds.
class_labels = np.unique(y.values.ravel())

# For each fold: scale, fit, predict, then persist the model, the predictions
# and the confusion matrix. Folds are numbered from 1 in the output file names.
for fold, (train_index, test_index) in enumerate(k_fold.split(X), start=1):
    # Get data for fold
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index, :], y.iloc[test_index, :]

    # The scaler is fitted on the training rows only and then applied to the
    # held-out rows, so no test statistics leak into training.
    X_train = ct.fit_transform(X_train)
    X_test = ct.transform(X_test)

    # Fit classifier (labels flattened to 1-D as scikit-learn expects)
    classifier.fit(X_train, y_train.values.ravel())

    # Make predictions once and reuse them for every output below
    pred = classifier.predict(X_test)

    # Save model; the context manager guarantees the file handle is closed.
    # NOTE(review): only the forest is pickled - the ColumnTransformer fitted
    # for this fold is not saved, so the stored model expects inputs that have
    # already been transformed the same way.
    filename = root_path + 'DRR_RF_fold_' + str(fold) + '.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(classifier, model_file)

    # Save predictions (row order follows test_index)
    result = pd.DataFrame({'predictions': pred})
    result.to_excel(root_path + 'DRR_RF_predictions_fold_' + str(fold) + ".xlsx")

    # Obtain confusion matrix (2 formats): flattened printout and a plot
    cm = confusion_matrix(y_test, pred, labels=class_labels)
    print("Fold :" + str(fold))
    print(cm.ravel())
    print(pred)
    cm_display = ConfusionMatrixDisplay(cm, display_labels=class_labels).plot()
    
# Per-fold scores for every metric in `scoring` (one row per fold in the
# exported sheet; averages have to be taken over those rows).
#
# The evaluation mirrors the training loop so the reported numbers describe the
# models that were saved:
#   * the scaler is applied inside a Pipeline, i.e. fitted on each training
#     split only and feeding the forest the same column layout as in the loop;
#   * the same shuffled KFold splitter is reused instead of cv=10, which would
#     silently switch to unshuffled stratified folds;
#   * y is flattened to 1-D, avoiding the column-vector DataConversionWarning
#     on every fit.
# cross_validate clones the estimator, so `ct` and `classifier` keep whatever
# fitted state they had before this call.
evaluation_pipeline = Pipeline(
    steps=[('preprocess', ct), ('classifier', classifier)]
)
results = cross_validate(
    evaluation_pipeline,
    X,
    y.values.ravel(),
    cv=k_fold,
    scoring=scoring,
    verbose=0,
)
#print(results)

# Save results to Excel
pd.DataFrame.from_dict(results).to_excel(root_path + 'DRR_RF_results.xlsx')