In [1]:
import pandas as pd
import numpy as np
import seaborn as sn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from utils import evaluate_model, create_balanced_dataset, train_and_evaluate_classifiers

In [2]:
df_preprocessed = pd.read_csv("../../data/preprocessed/PSP_Jan_Feb_2019_preprocessed.csv", sep=";").sample(frac=1).reset_index(drop=True)
df_preprocessed_feature_selection = pd.read_csv("../../data/preprocessed/PSP_Jan_Feb_2019_preprocessed_general_feature_selection.csv", sep=";").sample(frac=1).reset_index(drop=True)

In [3]:
df_balanced = create_balanced_dataset(df_preprocessed,42)
df_balanced_feature_selection = create_balanced_dataset(df_preprocessed_feature_selection,42)

In [18]:
df_balanced

Unnamed: 0,amount,success,3D_secured,previous_attempts,PSP_Moneycard,PSP_Simplecard,PSP_UK_Card,country_Germany,country_Switzerland,card_Master,...,hour_20,hour_21,hour_22,hour_23,week_day_1,week_day_2,week_day_3,week_day_4,week_day_5,week_day_6
25614,0.320513,False,False,2,False,True,False,True,False,True,...,False,False,False,False,False,True,False,False,False,False
21755,0.269231,False,False,0,False,True,False,True,False,False,...,True,False,False,False,False,False,True,False,False,False
25893,0.323718,False,True,1,True,False,False,True,False,True,...,False,False,False,False,True,False,False,False,False,False
36285,0.500000,False,False,0,False,False,True,True,False,True,...,True,False,False,False,False,False,True,False,False,False
15452,0.169872,False,False,2,False,False,True,True,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50308,0.738782,True,False,0,True,False,False,False,True,True,...,False,False,False,False,False,False,False,False,True,False
50309,0.743590,True,False,0,False,False,False,False,True,True,...,False,False,False,False,False,False,False,True,False,False
50311,0.748397,True,False,0,True,False,False,False,True,True,...,False,False,False,False,False,False,True,False,False,False
50318,0.759615,True,False,0,False,False,True,False,True,False,...,False,True,False,False,False,False,False,False,True,False


## Modell Auswahl

In [25]:
classifiers = {
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Gradient Boosting': GradientBoostingClassifier()
}

## Ganzer Datensatz Kreuzvalidierung, um Basismodelle zu bestimmen

In [32]:
X = df_balanced_feature_selection.copy()
y = X.pop("success")

results = train_and_evaluate_classifiers(classifiers, X, y)

Evaluation results for Decision Tree:
Accuracy: 0.5505958887890903
Precision: 0.5578452558004883
Recall: 0.4876792692282873
F1 Score: 0.5203732885747183
--------------------------------------
Evaluation results for Random Forest:
Accuracy: 0.5621819515121886
Precision: 0.5629853590447338
Recall: 0.555826695060073
F1 Score: 0.5593009695525837
--------------------------------------
Evaluation results for Logistic Regression:
Accuracy: 0.5828116603382529
Precision: 0.5930565583340479
Recall: 0.528353493129895
F1 Score: 0.5587995756727219
--------------------------------------
Evaluation results for Gradient Boosting:
Accuracy: 0.6205510367392075
Precision: 0.6147694205809533
Recall: 0.6465595460879
F1 Score: 0.6301358908011465
--------------------------------------


In [42]:
X = df_balanced.copy()
y = X.pop("success")

results = train_and_evaluate_classifiers(classifiers, X, y)

Evaluation results for Decision Tree:
Accuracy: 0.5507433048680161
Precision: 0.5518547033156556
Recall: 0.5401839357372128
F1 Score: 0.5459507008843281
--------------------------------------
Evaluation results for Random Forest:
Accuracy: 0.5881401918667041
Precision: 0.5900653799046653
Recall: 0.5775303472456245
F1 Score: 0.5836701540142875
--------------------------------------
Evaluation results for Logistic Regression:
Accuracy: 0.5797812736777892
Precision: 0.588339352620727
Recall: 0.5311889619437533
F1 Score: 0.5582618108846483
--------------------------------------
Evaluation results for Gradient Boosting:
Accuracy: 0.6192314722755266
Precision: 0.6064755863850114
Recall: 0.6791138771578868
F1 Score: 0.6407097397845141
--------------------------------------


#### Frage: macht man das wirklich mit dem ganzen Datensatz? -> Ja das beurteilt ja nur die Performance
#### Warum ist der Unterschied so groß? -> Durch den concat Schritt bekommt CV ein Problem, da nur ähnliche Werte ausgewählt werden.

## Professionelles Over/Undersampling

In [5]:
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler

In [34]:
def create_sampling_datasets(df, samplers):
    X = df
    y = X.pop("success")
    data = {}
    for i_sampler in samplers.keys():
        df_resampled, y_resampled = samplers[i_sampler].fit_resample(X, y)
        data[i_sampler] = (df_resampled, y_resampled)
    return data

In [29]:
def perform_cv_with_sampling(df, classifiers, samplers):
    X = df
    y = X.pop("success")
    res_list = []
    for i_sampler in samplers.keys():
        df_resampled, y_resampled = samplers[i_sampler].fit_resample(X, y)
        results = train_and_evaluate_classifiers(classifiers, df_resampled, y_resampled)
        for res in results:
            res["sampler"] = i_sampler
            res_list.append(res)
        
    return pd.DataFrame(res_list)

In [30]:
samplers = {"RUS": RandomUnderSampler(), "ROS": RandomOverSampler(), "ToL": TomekLinks()}
res = perform_cv_with_sampling(df_balanced.copy(), classifiers, samplers)

Evaluation results for Decision Tree:
Accuracy: 0.5587112401086427
Precision: 0.5586944137890008
Recall: 0.5590539355221112
F1 Score: 0.5588651138824069
--------------------------------------
Evaluation results for Random Forest:
Accuracy: 0.5945443267330022
Precision: 0.5964633399896353
Recall: 0.5854524422392551
F1 Score: 0.5908481612628438
--------------------------------------
Evaluation results for Logistic Regression:
Accuracy: 0.5825185007836147
Precision: 0.5919518618058455
Recall: 0.5311911607597387
F1 Score: 0.5598499019297544
--------------------------------------
Evaluation results for Gradient Boosting:
Accuracy: 0.6199651118864026
Precision: 0.6102249679012777
Recall: 0.6644530803738943
F1 Score: 0.6361077361558738
--------------------------------------
Evaluation results for Decision Tree:
Accuracy: 0.5516723403757096
Precision: 0.5525046150374988
Recall: 0.5428238533294137
F1 Score: 0.5475202478029535
--------------------------------------
Evaluation results for Random 

In [32]:
res.sort_values("classifier")

Unnamed: 0,classifier,accuracy,precision,recall,f1,sampler
0,Decision Tree,0.558711,0.558694,0.559054,0.558865,RUS
4,Decision Tree,0.551672,0.552505,0.542824,0.54752,ROS
8,Decision Tree,0.599978,0.530648,0.521119,0.525665,ToL
3,Gradient Boosting,0.619965,0.610225,0.664453,0.636108,RUS
7,Gradient Boosting,0.624756,0.614966,0.667483,0.640109,ROS
11,Gradient Boosting,0.640643,0.606961,0.441,0.510755,ToL
2,Logistic Regression,0.582519,0.591952,0.531191,0.55985,RUS
6,Logistic Regression,0.582909,0.592598,0.530799,0.559964,ROS
10,Logistic Regression,0.624804,0.610591,0.326559,0.425478,ToL
1,Random Forest,0.594544,0.596463,0.585452,0.590848,RUS


#### Erkenntnisse:
 - ToL erhöht die Genauigkeit bei einer Reduzierung von den anderen Metriken
 - RUS und ROS erzielen ähnliche Ergebnisse, Performancegewinn ist abhängig vom Modell

## Hyperparameter Tuning

In [55]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, recall_score

In [36]:
# Generate data
data = create_sampling_datasets(df_balanced, samplers)

In [41]:
data["ToL"][1] 

0        False
1        False
2        False
3        False
4        False
         ...  
17799     True
17800     True
17801     True
17802     True
17803     True
Name: success, Length: 17804, dtype: bool

#### Logistische Regression

In [73]:
parameters = {'solver':('lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky')}
clf = LogisticRegression(max_iter = 10000, penalty = "l2")
clf = GridSearchCV(clf, parameters, refit = "f1", scoring=["f1","precision","recall","accuracy"], cv = 5)
clf.fit(data["RUS"][0], data["RUS"][1])

['mean_fit_time',
 'mean_score_time',
 'mean_test_accuracy',
 'mean_test_f1',
 'mean_test_precision',
 'mean_test_recall',
 'param_solver',
 'params',
 'rank_test_accuracy',
 'rank_test_f1',
 'rank_test_precision',
 'rank_test_recall',
 'split0_test_accuracy',
 'split0_test_f1',
 'split0_test_precision',
 'split0_test_recall',
 'split1_test_accuracy',
 'split1_test_f1',
 'split1_test_precision',
 'split1_test_recall',
 'split2_test_accuracy',
 'split2_test_f1',
 'split2_test_precision',
 'split2_test_recall',
 'split3_test_accuracy',
 'split3_test_f1',
 'split3_test_precision',
 'split3_test_recall',
 'split4_test_accuracy',
 'split4_test_f1',
 'split4_test_precision',
 'split4_test_recall',
 'std_fit_time',
 'std_score_time',
 'std_test_accuracy',
 'std_test_f1',
 'std_test_precision',
 'std_test_recall']

In [80]:
clf.best_score_

0.5581839609160666

In [81]:
parameters = {'solver':('liblinear', 'saga')}
clf = LogisticRegression(max_iter = 10000, penalty = "l1")
clf = GridSearchCV(clf, parameters, refit = "f1", scoring=["f1","precision","recall","accuracy"], cv = 5)
clf.fit(data["RUS"][0], data["RUS"][1])

In [82]:
clf.best_score_

0.558753853241555

# Random Forest

In [84]:
parameters = {'n_estimators':(25, 100, 150, 200), 
              'criterion':('gini','entropy','log_loss'), 
              'max_depth':(3,5,7,9,11), 
              'min_samples_split':(2,4,6,8)
             }
clf = RandomForestClassifier()
clf = GridSearchCV(clf, parameters, refit = "f1", scoring=["f1","precision","recall","accuracy"], cv = 5)
clf.fit(data["RUS"][0], data["RUS"][1])

In [87]:
clf.best_estimator_

In [88]:
clf.best_score_

0.6516990660670758

# Gradient Boosting

In [89]:
parameters = {'n_estimators':(25, 100, 200), 
              'criterion':('friedman_mse','squared_error'), 
              'max_depth':(3,7,11,15), 
              'min_samples_split':(2,4,8),
              'min_samples_leaf':(2,4,8)
             }
clf_GB = GradientBoostingClassifier()
clf_GB = GridSearchCV(clf_GB, parameters, refit = "f1", scoring=["f1","precision","recall","accuracy"], cv = 5)
clf_GB.fit(data["RUS"][0], data["RUS"][1])

In [90]:
clf_GB.best_estimator_

In [92]:
clf_GB.cv_results_

{'mean_fit_time': array([ 0.27401934,  1.07083707,  2.14350314,  0.28793955,  1.07682567,
         2.11454144,  0.2793644 ,  1.06119485,  2.11113868,  0.2769968 ,
         1.05989842,  2.12201056,  0.27722063,  1.06871715,  2.10847669,
         0.27287216,  1.07666049,  2.11038356,  0.27432046,  1.06776199,
         2.12374797,  0.27505927,  1.05814199,  2.09387436,  0.27501044,
         1.0558146 ,  2.10952144,  0.64178333,  2.46748481,  4.85532522,
         0.63632569,  2.44744854,  4.88863344,  0.63473878,  2.43478279,
         4.81550574,  0.63171635,  2.47242923,  4.97396417,  0.64176202,
         2.46393008,  4.83543181,  0.65147777,  2.42867489,  4.81498995,
         0.64131866,  2.40269222,  4.73039145,  0.6441256 ,  2.42750239,
         4.73497229,  0.62735071,  2.42695165,  4.76058331,  1.22896743,
         4.4463973 ,  8.77156057,  1.21886024,  4.4306406 ,  8.82508259,
         1.13803277,  4.13695364,  8.30886836,  1.16738896,  4.1175961 ,
         8.0883451 ,  1.13171792, 

# Feature Selection

False