### Imports

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.svm import SVC
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_formats = ['svg']

### Loading Data

In [2]:
with open("../../../../data/1_all_active_random_inactive/Mordred/for_ml/train_set/x_train.pickle", "rb") as file:
    x_train = pickle.load(file)

with open("../../../../data/1_all_active_random_inactive/Mordred/for_ml/train_set/y_train.pickle", "rb") as file:
    y_train = pickle.load(file)

with open("../../../../data/1_all_active_random_inactive/Mordred/for_ml/internal_test_set/internal_x_test.pickle", "rb") as file:
    internal_x_test = pickle.load(file)

with open("../../../../data/1_all_active_random_inactive/Mordred/for_ml/internal_test_set/internal_y_test.pickle", "rb") as file:
    internal_y_test = pickle.load(file)

with open("../../../../data/1_all_active_random_inactive/Mordred/for_ml/external_test_set/external_x_test.pickle", "rb") as file:
    external_x_test = pickle.load(file)

with open("../../../../data/1_all_active_random_inactive/Mordred/for_ml/external_test_set/external_y_test.pickle", "rb") as file:
    external_y_test = pickle.load(file)

### Creating the Model

In [3]:
svm_params = [
    {
        "C": 0.001,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": True,
    },
    {
        "C": 0.01,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": True,
    },
    {
        "C": 0.1,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": True,
    },
    {
        "C": 1,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": True,
    },
    {
        "C": 10,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": True,
    },
    {
        "C": 100,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": True,
    },
    {
        "C": 0.001,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": True,
    },
    {
        "C": 0.01,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": True,
    },
    {
        "C": 0.1,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": True,
    },
    {
        "C": 1,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": True,
    },
    {
        "C": 10,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": True,
    },
    {
        "C": 100,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": True,
    },
    {
        "C": 0.001,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": False,
    },
    {
        "C": 0.01,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": False,
    },
    {
        "C": 0.1,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": False,
    },
    {
        "C": 1,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": False,
    },
    {
        "C": 10,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": False,
    },
    {
        "C": 100,
        "kernel": "rbf",
        "gamma": "scale",
        "shrinking": False,
    },
    {
        "C": 0.001,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": False,
    },
    {
        "C": 0.01,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": False,
    },
    {
        "C": 0.1,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": False,
    },
    {
        "C": 1,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": False,
    },
    {
        "C": 10,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": False,
    },
    {
        "C": 100,
        "kernel": "rbf",
        "gamma": "auto",
        "shrinking": False,
    }
]

In [4]:
print(len(svm_params))

24


In [6]:
with open("./training_testing_results.csv", "w") as log_file:
    log_file.write("c,kernel,gamma,shrinking,int_precision,int_recall,int_fbeta,ext_precision,ext_recall,ext_fbeta\n")
    for n, model_params in enumerate(svm_params):
        print(n)
        # log_file.write(f"Model {n}\n")
        # log_file.write(f"Penalty: {model_params["penalty"]}, C: {model_params["C"]}\n")
        log_file.write(f"{model_params['C']},{model_params['kernel']},{model_params['gamma']},{model_params['shrinking']},")
        
        # first we create a model with one of the optimized parameter sets
        print("model fits")
        svm_model = SVC(C=model_params["C"], kernel=model_params["kernel"], 
                        gamma=model_params["gamma"], shrinking=model_params["shrinking"])
        svm_model.fit(x_train, y_train)

        # then a model trained using these parameters is tested on the internal test set
        # I use the classes as predictions, because the confusion matrix is calculated like this
        # however, I think the production model should return probabilities
        model_predictions = svm_model.predict(internal_x_test)
        print("model predicted")

        # the values of the testing on the internal test set are then used to select the best model
        # ? which <average> to select?? 
        # selected micro, idk why
        precision, recall, fbeta, support = precision_recall_fscore_support(internal_y_test, model_predictions, average='micro')
        log_file.write(f"{precision},{recall},{fbeta},")
        
        print("int stats calculated")

        # then the model is tested on an external test set to assess its "real" performance
        model_predictions_for_external = svm_model.predict(external_x_test)
        ext_precision, ext_recall, ext_fbeta, ext_support = precision_recall_fscore_support(external_y_test, model_predictions_for_external, average='micro')
        print("ext stats calculated")
        log_file.write(f"{ext_precision},{ext_recall},{ext_fbeta}\n")

0
model fits
model predicted
int stats calculated
ext stats calculated
1
model fits
model predicted
int stats calculated
ext stats calculated
2
model fits
model predicted
int stats calculated
ext stats calculated
3
model fits
model predicted
int stats calculated
ext stats calculated
4
model fits
model predicted
int stats calculated
ext stats calculated
5
model fits
model predicted
int stats calculated
ext stats calculated
6
model fits
model predicted
int stats calculated
ext stats calculated
7
model fits
model predicted
int stats calculated
ext stats calculated
8
model fits
model predicted
int stats calculated
ext stats calculated
9
model fits
model predicted
int stats calculated
ext stats calculated
10
model fits
model predicted
int stats calculated
ext stats calculated
11
model fits
model predicted
int stats calculated
ext stats calculated
12
model fits
model predicted
int stats calculated
ext stats calculated
13
model fits
model predicted
int stats calculated
ext stats calculated
14

In [7]:
best_svm_model = SVC(C=10, kernel="rbf", gamma="scale", shrinking=True)

In [8]:
best_svm_model.fit(x_train, y_train)

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [9]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(internal_y_test, best_svm_model.predict(internal_x_test))
# fpr, tpr, thresholds = roc_curve(internal_y_test, best_svm_model.predict_proba(internal_x_test)[:,1])
# plt.figure()
# plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# plt.plot([0, 1], [0, 1],'r--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic')
# plt.legend(loc="lower right")
# plt.savefig('svm_ROC.png', dpi=600)
# plt.show()

In [10]:
logit_roc_auc

0.5

In [11]:
with open("./mordred_random_366_svm.pickle", "wb") as file:
    pickle.dump(best_svm_model, file)