In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from utils import evaluate_model, train_and_evaluate, create_train_test_sampled_split, create_sampling_datasets
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Preprocessed dataset, stored as a semicolon-delimited CSV.
df_preprocessed = pd.read_csv(
    "../../data/preprocessed/PSP_Jan_Feb_2019_preprocessed.csv",
    sep=";",
)
# Second preprocessed file; judging by its name it holds the result of a
# general feature-selection step — TODO confirm against the preprocessing notebook.
df_preprocessed_feature_selection = pd.read_csv(
    "../../data/preprocessed/PSP_Jan_Feb_2019_preprocessed_general_feature_selection.csv",
    sep=";",
)

In [3]:
# Resampling strategies keyed by a short label; both are seeded so repeated
# runs use the same random state.
samplers = {
    "RUS": RandomUnderSampler(random_state=42),
    "ROS": RandomOverSampler(random_state=42),
}

# Build the train/test data from both loaded frames. The result is indexed
# below by feature-set name and then by "X_train" / "y_train" / "X_test" / "y_test".
prep_data = create_train_test_sampled_split(
    df_preprocessed,
    df_preprocessed_feature_selection,
)

In [4]:
# Index-aligned accumulators: entry i of each list describes the same
# (feature set, sampling mode) experiment.
feature_set_list = []
sampling_list = []
result_list = []

for feature_set, split in prep_data.items():
    # Baseline run: fit on the training data as provided; None marks "no sampler".
    feature_set_list.append(feature_set)
    sampling_list.append(None)
    result_list.append(
        train_and_evaluate(
            LogisticRegression(max_iter=500),
            split["X_train"],
            split["y_train"],
            split["X_test"],
            split["y_test"],
        )
    )

    # Build the per-sampler training sets from the training split only
    # (the helper is never given the test split).
    dataset = create_sampling_datasets(samplers, split["X_train"], split["y_train"])

    for sampling_tech, resampled in dataset.items():
        # resampled[0] / resampled[1] take the X_train / y_train positions;
        # evaluation always uses the original test split of this feature set.
        feature_set_list.append(feature_set)
        sampling_list.append(sampling_tech)
        result_list.append(
            train_and_evaluate(
                LogisticRegression(max_iter=500),
                resampled[0],
                resampled[1],
                split["X_test"],
                split["y_test"],
            )
        )

# Assemble the table column-wise: the two label columns first, then one column
# per metric key of the first result (every result is indexed by those keys).
metrics = {"sampling_mode": sampling_list, "feature_set": feature_set_list}
for metric in result_list[0].keys():
    vals = [row[metric] for row in result_list]
    metrics[metric] = vals

# Highest AUC first; kept as the cell's last expression so it renders as a table.
pd.DataFrame(metrics).sort_values("auc_score", ascending=False)

Unnamed: 0,sampling_mode,feature_set,accuracy,f1_score,precision,recall,auc_score
5,ROS,Selected Features,0.606894,0.344759,0.259731,0.512555,0.611608
4,RUS,Selected Features,0.597357,0.348078,0.258481,0.532742,0.611158
3,,Selected Features,0.799722,0.037249,0.619048,0.019202,0.611056
1,RUS,All Features,0.596861,0.346327,0.257362,0.529296,0.608959
2,ROS,All Features,0.608484,0.34784,0.261964,0.517479,0.608803
0,,All Features,0.799821,0.039103,0.621212,0.020187,0.608585
