In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, classification_report
from utils import evaluate_model, train_and_evaluate, create_train_test_sampled_split, create_sampling_datasets
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

In [2]:
# Load the two preprocessed datasets (semicolon-separated CSV files): first the
# full preprocessed file, then its `general_feature_selection` counterpart.
df_preprocessed, df_preprocessed_feature_selection = (
    pd.read_csv(csv_path, sep=";")
    for csv_path in (
        "../../data/preprocessed/PSP_Jan_Feb_2019_preprocessed.csv",
        "../../data/preprocessed/PSP_Jan_Feb_2019_preprocessed_general_feature_selection.csv",
    )
)

In [3]:
# Resampling strategies to benchmark: random under-sampling (RUS) and random
# over-sampling (ROS), both created with a fixed random_state of 42.
samplers = dict(
    RUS=RandomUnderSampler(random_state=42),
    ROS=RandomOverSampler(random_state=42),
)
# Build the dictionary that holds the train and test sets for the two datasets read in earlier.
# The split is done before any sampling, because otherwise ROS could let
# information flow from the training data into the test data.
prep_data = create_train_test_sampled_split(
    df_preprocessed,
    df_preprocessed_feature_selection,
)

## Iteratives Training eines Logistic-Regression-Modells

In [4]:
def _fit_and_score(feature_set, sampling_mode, X_train, y_train, X_test, y_test, max_iter=500):
    """Run one logistic-regression benchmark and return it as a labelled result row.

    Parameters
    ----------
    feature_set : key of ``prep_data`` identifying the feature set in use.
    sampling_mode : key of the sampling datasets, or ``None`` when the
        unsampled training data is used.
    X_train, y_train : training inputs handed to ``train_and_evaluate``.
    X_test, y_test : test inputs handed to ``train_and_evaluate``; these always
        come straight from ``prep_data`` and are never resampled.
    max_iter : iteration limit passed to ``LogisticRegression``.

    Returns
    -------
    dict
        ``sampling_mode`` and ``feature_set`` followed by every metric in the
        mapping returned by ``train_and_evaluate``.
    """
    scores = train_and_evaluate(LogisticRegression(max_iter=max_iter), X_train, y_train, X_test, y_test)
    # Labels first so they become the leading columns of the result table.
    return {"sampling_mode": sampling_mode, "feature_set": feature_set, **scores}


# One record per (feature set, sampling mode) run; keeping labels and metrics in
# the same record means they cannot drift out of alignment.
result_rows = []

# For every feature set (all features and selected features):
for feature_set, split in prep_data.items():
    X_test, y_test = split["X_test"], split["y_test"]

    # Baseline: train and evaluate on the unsampled training data.
    result_rows.append(
        _fit_and_score(feature_set, None, split["X_train"], split["y_train"], X_test, y_test)
    )

    # Create the RUS and ROS variants of the training data only.
    dataset = create_sampling_datasets(samplers, split["X_train"], split["y_train"])

    # For every sampling technique: train on the resampled data, evaluate on the same test set.
    for sampling_tech, resampled in dataset.items():
        # Each entry is indexed as (features, labels), matching how the datasets are consumed here.
        result_rows.append(
            _fit_and_score(feature_set, sampling_tech, resampled[0], resampled[1], X_test, y_test)
        )

# Build the metrics table once from the records and rank the runs by AUC (best first).
pd.DataFrame(result_rows).sort_values("auc_score", ascending=False)

Unnamed: 0,sampling_mode,feature_set,accuracy,f1_score,precision,recall,auc_score
5,ROS,Selected Features,0.60759,0.347754,0.261615,0.518464,0.611583
3,,Selected Features,0.799622,0.036312,0.612903,0.01871,0.611238
4,RUS,Selected Features,0.59696,0.348273,0.258464,0.533727,0.611124
1,RUS,All Features,0.597357,0.344493,0.256503,0.524372,0.608912
2,ROS,All Features,0.607093,0.346173,0.260577,0.51551,0.608896
0,,All Features,0.799821,0.040933,0.614286,0.021172,0.608763


## Erkenntnisse:
 - Ohne RUS und ROS beherrscht die A-priori-Wahrscheinlichkeit von Misserfolg die Vorhersage des Modells.
 - RUS und ROS erzielen akzeptable Benchmarkergebnisse
 - ROS mit ausgewählten Features erzielt den besten AUC-Wert
 - Ausgewählte Features erzielen bessere Ergebnisse als der Datensatz mit allen Features.