In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, roc_curve, roc_auc_score
from imblearn.over_sampling import SMOTE
import pickle

In [None]:
def check_directory(directory_path):
    """Make sure ``directory_path`` exists as a directory and return it.

    Parameters
    ----------
    directory_path : str or path-like
        Directory to look for; missing parent directories are created too.

    Returns
    -------
    The same ``directory_path`` that was passed in, whether or not creation
    succeeded. Failures are reported on stdout rather than raised.
    """
    # isdir() rather than exists(): a regular file sitting at this path must
    # not be reported as an already-existing directory.
    if os.path.isdir(directory_path):
        print(f"Directory '{directory_path}' already exists.")
        return directory_path

    try:
        # exist_ok=True tolerates the directory being created by someone else
        # between the isdir() check above and this call. It still raises when
        # the path exists but is not a directory.
        os.makedirs(directory_path, exist_ok=True)
        print(f"Directory '{directory_path}' created successfully.")
    except OSError as e:
        # Best-effort by design: report the problem and let the caller go on.
        print(f"Error creating directory '{directory_path}': {e}")
    return directory_path

In [None]:
# X_train, y_train, X_test, y_test have already been vectorized and stored in csv files
def train_model(model, X_train, y_train, X_test, y_test, over_sample):
    """Fit ``model``, print evaluation metrics, and predict at the best G-mean threshold.

    Parameters
    ----------
    model : estimator
        Must provide ``fit`` and ``predict_proba``.
    X_train, y_train : training features and labels.
    X_test, y_test : evaluation features and labels.
        Column 1 of ``predict_proba`` is scored against ``pos_label=1``, so the
        labels are presumably binary with 1 as the positive class -- confirm
        against the data.
    over_sample : bool
        When truthy, the training data is resampled with SMOTE before fitting.

    Returns
    -------
    (model, y_pred)
        The fitted model and a 0/1 integer array of test predictions obtained
        by thresholding the positive-class probability.
    """
    if over_sample:
        sm = SMOTE(random_state = 2)
        # np.ravel accepts ndarrays, Series and single-column DataFrames alike,
        # whereas ``y_train.ravel()`` raises on a DataFrame.
        X_train, y_train = sm.fit_resample(X_train, np.ravel(y_train))
        print("Oversampling Done for Training Data.")

    model = model.fit(X_train, y_train)
    print("Model Fitted Successfully.")

    # Probability of the positive class for every test sample.
    y_pred_prob = model.predict_proba(X_test)
    positive_scores = y_pred_prob[:, 1]
    roc_auc = round(roc_auc_score(y_test, positive_scores), 2)

    print(f"\n \033[1mROC-AUC Score\033[0m \t\t: {roc_auc*100} %")

    fpr, tpr, thresholds = roc_curve(y_test, positive_scores, pos_label=1)

    # Geometric mean of sensitivity (tpr) and specificity (1 - fpr); the ROC
    # point that maximises it gives the chosen decision threshold.
    gmeans = np.sqrt(tpr * (1-fpr))

    ix = np.argmax(gmeans)
    print('\033[1mBest Threshold\033[0m \t\t: %.3f \n\033[1mG-Mean\033[0m \t\t\t: %.3f' % (thresholds[ix], gmeans[ix]))

    # Apply the threshold to the probabilities, not to the hard 0/1 output of
    # ``model.predict`` (which would make the tuned threshold meaningless).
    # roc_curve counts a sample as positive when score >= threshold, so the
    # comparison is inclusive to reproduce the selected ROC point.
    # NOTE(review): the threshold is selected on the same test set that the
    # accuracy below is computed on, so these figures are optimistic.
    y_pred = (positive_scores >= thresholds[ix]).astype(int)

    accuracy = accuracy_score(y_test, y_pred)
    print("\033[1mModel Accuracy\033[0m \t\t:", round(accuracy,2,)*100, "%")

    print("\033[1m\nClassification Report:\033[0m")
    print(classification_report(y_test, y_pred))

    return model, y_pred

In [None]:
def save_model(model, dataset_name, saved_path):
    """Pickle ``model`` into the ``saved`` output folder of ``dataset_name``.

    The model is wrapped in a one-entry dict under the key ``"model"`` and
    written to ``../output/<dataset_name>/saved/trained_model.pkl`` (relative
    to the current working directory). The folder must already exist.

    NOTE(review): ``saved_path`` is accepted but never read; the destination
    is always derived from ``dataset_name``.
    """
    target_file = f'../output/{dataset_name}/saved/trained_model.pkl'
    payload = {"model": model}
    with open(target_file, 'wb') as handle:
        pickle.dump(payload, handle)

In [None]:
# dataset_name has to be defined before any path is built from it; the
# directory checks previously ran first and raised NameError on a fresh kernel.
dataset_name = None # to be modified with argparse in .py file
if dataset_name is None:
    # Fail fast instead of creating and reading from "../output/None/...".
    raise ValueError("dataset_name must be set before loading the training data.")

check_directory(f"../output/{dataset_name}/training")
check_directory(f"../output/{dataset_name}/saved")

X_train_path = f"../output/{dataset_name}/training/{dataset_name}-Xtrain.csv"
y_train_path = f"../output/{dataset_name}/training/{dataset_name}-ytrain.csv"
X_test_path = f"../output/{dataset_name}/training/{dataset_name}-Xtest.csv"
y_test_path = f"../output/{dataset_name}/training/{dataset_name}-ytest.csv"

# One read per file: pd.read_csv() requires a path and returns a single frame.
# NOTE(review): assumes the CSVs were written with a header row and without an
# index column -- adjust the read_csv arguments if that is not the case.
X_train = pd.read_csv(X_train_path)
X_test = pd.read_csv(X_test_path)
# Labels are flattened to 1-D arrays, the shape the estimators and metrics
# expect; presumably each label file holds a single column -- confirm.
y_train = pd.read_csv(y_train_path).to_numpy().ravel()
y_test = pd.read_csv(y_test_path).to_numpy().ravel()

In [None]:
# Logistic regression baseline; max_iter is raised to 1000.
model_LR = LogisticRegression(max_iter=1000)
# train_model takes the estimator, both data splits and the oversampling flag;
# calling it with no arguments raises TypeError.
# NOTE(review): over_sample=True is an assumption (SMOTE is imported for this
# purpose); pass False to train on the original class distribution.
model_LR, y_pred = train_model(model_LR, X_train, y_train, X_test, y_test, over_sample=True)

In [None]:
# Random forest with 100 gini trees and out-of-bag scoring enabled.
# random_state is pinned (same seed as the SMOTE step) so reruns are reproducible.
model_RF = RandomForestClassifier(n_jobs=3, oob_score=True, n_estimators=100, criterion="gini", random_state=2)
# train_model takes the estimator, both data splits and the oversampling flag;
# calling it with no arguments raises TypeError.
# NOTE(review): over_sample=True is an assumption (SMOTE is imported for this
# purpose); pass False to train on the original class distribution.
model_RF, y_pred = train_model(model_RF, X_train, y_train, X_test, y_test, over_sample=True)

In [None]:
# probability=True is required so that train_model can call predict_proba.
# random_state seeds the internal shuffling used for the probability
# estimates, making reruns reproducible.
model_SVM = SVC(probability=True, random_state=2)
# train_model takes the estimator, both data splits and the oversampling flag;
# calling it with no arguments raises TypeError.
# NOTE(review): over_sample=True is an assumption (SMOTE is imported for this
# purpose); pass False to train on the original class distribution.
model_SVM, y_pred = train_model(model_SVM, X_train, y_train, X_test, y_test, over_sample=True)