### Imports

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

import tensorflow as tf

### Loading Data

In [2]:
# Common directory prefix shared by all six pickle files loaded below.
DATA_DIR = "../../../data/1_all_active_random_inactive/2DRDKit/for_ml"


def load_pickle(relative_path):
    """Unpickle and return the object stored at ``DATA_DIR/relative_path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserializing,
    so only use it on files produced by a trusted pipeline.
    """
    with open(f"{DATA_DIR}/{relative_path}", "rb") as file:
        return pickle.load(file)


# Training set.
x_train = load_pickle("train_set/x_train.pickle")
y_train = load_pickle("train_set/y_train.pickle")

# Internal test set.
internal_x_test = load_pickle("internal_test_set/internal_x_test.pickle")
internal_y_test = load_pickle("internal_test_set/internal_y_test.pickle")

# External test set.
external_x_test = load_pickle("external_test_set/external_x_test.pickle")
external_y_test = load_pickle("external_test_set/external_y_test.pickle")

### Lists of Parameters

In [5]:
# One configuration dict per candidate forest size (number of trees).
random_forest_parameters = [
    {"model_name": "Random Forest", "n_estimators": tree_count}
    for tree_count in (100, 250, 500, 700)
]

In [6]:
# One configuration dict per candidate value of `size_of_int_layer`.
mlp_parameters = [
    {"model_name": "MLP", "size_of_int_layer": layer_size}
    for layer_size in (200, 500, 750, 1000)
]

### Training/Validation Loop

In [7]:
def _micro_prf(true_labels, predicted_labels):
    """Return micro-averaged (precision, recall, fbeta) for the given predictions.

    NOTE(review): per the scikit-learn documentation, micro-averaging over all
    labels of a single-label problem yields precision == recall == F-score ==
    accuracy, so the three logged columns will be identical in that case.
    TODO: decide whether ``average='binary'`` or ``'macro'`` is what is wanted.
    """
    precision, recall, fbeta, _ = precision_recall_fscore_support(
        true_labels, predicted_labels, average='micro'
    )
    return precision, recall, fbeta


# Fit one random forest per parameter set and log its scores on the internal
# and external test sets as one CSV row per model.
with open("./training_testing_results.csv", "w") as log_file:
    log_file.write("n_estimators,int_precision,int_recall,int_fbeta,ext_precision,ext_recall,ext_fbeta\n")
    for n, model_params in enumerate(random_forest_parameters):
        print(n)

        # first we create a model with one of the candidate parameter sets
        print("model fits")
        rf_model = RandomForestClassifier(n_estimators=model_params["n_estimators"], random_state=42)
        rf_model.fit(x_train, y_train)

        # then the model is tested on the internal test set;
        # predicted classes (not probabilities) are used because the
        # precision/recall/F-score metrics are computed from class labels
        model_predictions = rf_model.predict(internal_x_test)
        print("model predicted")

        # the internal-test-set scores are the ones used to select the best model
        precision, recall, fbeta = _micro_prf(internal_y_test, model_predictions)
        print("int stats calculated")

        # then the model is tested on an external test set to assess its "real" performance
        model_predictions_for_external = rf_model.predict(external_x_test)
        ext_precision, ext_recall, ext_fbeta = _micro_prf(external_y_test, model_predictions_for_external)
        print("ext stats calculated")

        # Write the whole row in one call, only after every value is available,
        # so a failure mid-iteration cannot leave a truncated row in the CSV.
        log_file.write(
            f"{model_params['n_estimators']},"
            f"{precision},{recall},{fbeta},"
            f"{ext_precision},{ext_recall},{ext_fbeta}\n"
        )
        # Flush so completed rows survive an interruption of a later, longer fit.
        log_file.flush()

0
model fits
model predicted
int stats calculated
ext stats calculated
1
model fits
model predicted
int stats calculated
ext stats calculated
2
model fits
model predicted
int stats calculated
ext stats calculated
3
model fits
model predicted
int stats calculated
ext stats calculated


In [8]:
# Re-create a forest with n_estimators=100 (the smallest value in
# random_forest_parameters) and the same random_state as the training loop.
# NOTE(review): presumably selected as "best" from training_testing_results.csv — confirm.
best_rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [9]:
# Train the selected forest on the training set; fit() returns the estimator,
# so its repr is displayed as this cell's output.
best_rf_model.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Use the same continuous scores for both the AUC and the curve. Previously the
# AUC was computed from hard class predictions, so the number in the legend did
# not correspond to the plotted curve.
# Assumes binary labels whose second class (predict_proba column 1) is the
# positive one — TODO confirm.
internal_test_scores = best_rf_model.predict_proba(internal_x_test)[:, 1]

# Variable name kept unchanged in case other cells refer to it.
logit_roc_auc = roc_auc_score(internal_y_test, internal_test_scores)
fpr, tpr, thresholds = roc_curve(internal_y_test, internal_test_scores)

plt.figure()
# The legend now names the model that is actually evaluated (a random forest).
plt.plot(fpr, tpr, label='Random Forest (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line: the ROC of a random-guess classifier.
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE(review): the file name says "svm" although this is the random-forest ROC;
# kept unchanged so existing consumers of the file keep working — consider renaming.
plt.savefig('svm_ROC.png', dpi=600)
plt.show()