# 4.3.4 Modellentwicklung & 4.3.5 Evaluation
In diesem Notebook wird eine Pipeline erstellt, mit dieser werden die Datensätze automatisiert auf verschiedene Modelle trainiert und evaluiert.


Der DP-WGAN lieferte die besten Ergebnisse, wenn er auf ein standard skaliertes Datenset trainiert wurde.
Die generierten synthetischen Daten sind aus diesem Grund ebenfalls in Standardverteilung.


Aus diesem Grund wird in diesem Notebook hauptsächlich auf standard skalierten Daten gearbeitet.
Somit sind df_train und df_test die realen, skalierten Daten.

#### Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.metrics import plot_confusion_matrix
from sklearn import preprocessing, metrics

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier

#### Utils

In [2]:
def draw_auc_roc_curve(y_test, y_pred, save_path=""):
    """For a given y_test, y_pred it plots the roc curve, with the auc score.
    If save_path is given it will save the plot.
    """
    fpr,tpr,_ = metrics.roc_curve(y_test,y_pred)
    rocAuc = metrics.auc(fpr, tpr)
    plt.figure(figsize=(12,6))
    plt.title('ROC Kurve')
    sns.lineplot(fpr, tpr, label = 'AUC für Modell = %0.2f' % rocAuc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.ylabel('Richtig Positiv Rate')
    plt.xlabel('Falsch Positiv Rate')
    if save_path != "":
        plt.savefig(save_path, format="svg", bbox_inches="tight")
    plt.show()

def xy_split(table:pd.DataFrame):
    """Takes a full Give Me some Credit Dataset
        Returns X, y splitted Dataset
    """
    table.drop("Unnamed: 0", axis=1, errors="ignore", inplace=True)
    y = table[["SeriousDlqin2yrs"]]
    y.reset_index(drop=True, inplace=True)
    X = table.drop("SeriousDlqin2yrs", axis=1)
    X.reset_index(drop=True, inplace=True)
    return X, y

def calc_auc_real(model, draw_curve=True, scaled=True):
    """Calculates the auroc value of the model on the real Testset
    """
    if scaled:
        proba = model.predict_proba(df_test_X)[:, 1]
        if draw_curve:
            draw_auc_roc_curve(df_test_y, proba)
        return roc_auc_score(df_test_y, proba)
    else:
        proba = model.predict_proba(df_test_X_unscaled)[:, 1]
        if draw_curve:
            draw_auc_roc_curve(df_test_y_unscaled, proba)
        return roc_auc_score(df_test_y_unscaled, proba)

def calc_f1_real(model, scaled=True):
    """Calculates the f1 score of the model on the real Testset
    """
    if scaled:
        proba = model.predict(df_test_X)
        return f1_score(df_test_y, proba)
    else:
        proba = model.predict(df_test_X_unscaled)
        return f1_score(df_test_y_unscaled, proba)

#### Lade Datensets

In [3]:
# Lade unskalierte Originaldaten
df_full_unscaled = pd.read_csv("../01_EDA_Preprocessing/output_data/processed_dataset.csv")

df_train_unscaled = pd.read_csv("../01_EDA_Preprocessing/output_data/full_train_processed.csv")
df_test_unscaled = pd.read_csv("../01_EDA_Preprocessing/output_data/full_test_processed.csv")


# Lade skalierte Originaldaten
df_train = pd.read_csv("../01_EDA_Preprocessing/output_data/full_train_processed_scaled.csv")
df_test = pd.read_csv("../01_EDA_Preprocessing/output_data/full_test_processed_scaled.csv")


# ---------------------------------------------------------------------------------------
# Synthetische Datensätze

# Synthetische Trainingssets ohne Privatsphäreeinschränkungen mit unterschiedlichen Skalierungen
df_full_no_700_s = pd.read_csv("../02_Data_Generation/data/full-no-700-s.csv") # StandardScaler
df_full_no_700_a = pd.read_csv("../02_Data_Generation/data/full-no-700-a.csv") # MaxAbsScaler
df_full_no_700_m = pd.read_csv("../02_Data_Generation/data/full-no-700-m.csv") # MinMaxScaler
df_full_no_700_r = pd.read_csv("../02_Data_Generation/data/full-no-700-r.csv") # RobustScaler


# Ab hier sind alle Datensätze Standardskaliert
# Synthetische Daten ohne Privatsphäre Einschränkungen mit unterschiedlicher Anzahl Epochen
df_eINF_s0_50 = pd.read_csv("../02_Data_Generation/data/eINF-s0-50.csv") # 50 Epochen
df_eINF_s0_100 = pd.read_csv("../02_Data_Generation/data/eINF-s0-100.csv") # 100 Epochen
df_eINF_s0_300 = pd.read_csv("../02_Data_Generation/data/eINF-s0-300.csv") # 300 Epochen
df_eINF_s0_500 = pd.read_csv("../02_Data_Generation/data/eINF-s0-500.csv") # 500 Epochen
df_eINF_s0_800 = pd.read_csv("../02_Data_Generation/data/eINF-s0-800.csv") # 800 Epochen


# Synthetische Daten mit unterschiedlich starken Privatsphäre Einschränkungen
# e = Epsilon, s = Sigma
# Epsilon 0.02
df_e002_s60 = pd.read_csv("../02_Data_Generation/data/e002-s60.csv")
df_e002_s80 = pd.read_csv("../02_Data_Generation/data/e002-s80.csv")
df_e002_s120 = pd.read_csv("../02_Data_Generation/data/e002-s120.csv")

# Epsilon 0.05
df_e005_s40 = pd.read_csv("../02_Data_Generation/data/e005-s40.csv")
df_e005_s50 = pd.read_csv("../02_Data_Generation/data/e005-s50.csv")
df_e005_s60 = pd.read_csv("../02_Data_Generation/data/e005-s60.csv")

# Epsilon 0.1
df_e01_s15 = pd.read_csv("../02_Data_Generation/data/e01-s15.csv")
df_e01_s20 = pd.read_csv("../02_Data_Generation/data/e01-s20.csv")
df_e01_s25 = pd.read_csv("../02_Data_Generation/data/e01-s25.csv")

# Epsilon 1
df_e1_s18 = pd.read_csv("../02_Data_Generation/data/e1-s18.csv")
df_e1_s2 = pd.read_csv("../02_Data_Generation/data/e1-s2.csv")
df_e1_s25 = pd.read_csv("../02_Data_Generation/data/e1-s25.csv")

# Epsilon 3
df_e3_s09 = pd.read_csv("../02_Data_Generation/data/e3-s09.csv")
df_e3_s1 = pd.read_csv("../02_Data_Generation/data/e3-s1.csv")
df_e3_s14 = pd.read_csv("../02_Data_Generation/data/e3-s14.csv")

# Epsilon 5
df_e5_s07 = pd.read_csv("../02_Data_Generation/data/e5-s07.csv")
df_e5_s08 = pd.read_csv("../02_Data_Generation/data/e5-s08.csv")
df_e5_s09 = pd.read_csv("../02_Data_Generation/data/e5-s09.csv")

# Epsilon 8
df_e8_s06 = pd.read_csv("../02_Data_Generation/data/e8-s06.csv")
df_e8_s065 = pd.read_csv("../02_Data_Generation/data/e8-s065.csv")
df_e8_s07 = pd.read_csv("../02_Data_Generation/data/e8-s07.csv")

# Epsilon 12
df_e12_s05 = pd.read_csv("../02_Data_Generation/data/e12-s05.csv")
df_e12_s055 = pd.read_csv("../02_Data_Generation/data/e12-s055.csv")
df_e12_s06 = pd.read_csv("../02_Data_Generation/data/e12-s06.csv")

# Epsilon 16
df_e16_s05 = pd.read_csv("../02_Data_Generation/data/e16-s05.csv")
df_e16_s055 = pd.read_csv("../02_Data_Generation/data/e16-s055.csv")
df_e16_s057 = pd.read_csv("../02_Data_Generation/data/e16-s057.csv")

#### Datensets vorbereiten

In [4]:
# Teile Testset in X, y auf
df_test_X, df_test_y = xy_split(df_test)
df_test_X_unscaled, df_test_y_unscaled = xy_split(df_test_unscaled)

## Modell Entwicklung 4.3.4

- Alle Hyperparameter, die für das Hyperparametertuning verwendet werden

In [5]:
model_params = {
    "LogisticRegression": {
        "model": LogisticRegression(),
        "params": {
            "max_iter": [50, 80, 100, 150, 200, 300],
            "C": [0.1, 0.5, 0.8, 1, 3, 5, 10, 15],
            "class_weight": ["None", "balanced"]
        }
    },
    "KNeighbors": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [2, 3, 5, 10, 15, 20, 30, 40, 55, 60, 70],
            "weights": ["uniform", "distance"]
        }
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [2,5,10,20,30,None],
            "min_samples_split": [2, 5, 10],
            "criterion": ["gini", "entropy"]
        }
    },
    "RandomForest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [50, 100, 200, 300],
            "max_depth": [5, 10, 20, None]
        }
    },
    "XGBoost": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [100, 200, 300, 400],
            "learning_rate": [0.05, 0.07, 0.1]
        }
    },
    "DummyClassifier": {
        "model": DummyClassifier(),
        "params": {
            "strategy": ["stratified"]
        }
    }
}

- Die Parameter des am besten performenden Modelles auf den realen skalierten Daten
- Wird ebenfalls auf den generierten Daten getestet, um Overfitting auf synthetische Daten entgegenzuwirken

In [6]:
model_params_best = {
    "LogisticRegression": {
        "model": LogisticRegression(),
        "params": {
            "max_iter": [200],
            "C": [1],
            "class_weight": ["balanced"]
        }
    },
    "KNeighbors": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [55],
            "weights": ["uniform"]
        }
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [5],
            "criterion": ["entropy"]
        }
    },
    "RandomForest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [200],
            "max_depth": [10]
        }
    },
    "XGBoost": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [400],
            "learning_rate": [0.05]
        }
    },
    "DummyClassifier": {
        "model": DummyClassifier(),
        "params": {
            "strategy": ["stratified"]
        }
    }
}

- Die verwendeten Modell mit den Standardwerten

In [7]:
model_params_standard = {
    "LogisticRegression": {
        "model": LogisticRegression(),
        "params": {
            "max_iter": [100],
            "C": [1],
            "class_weight": [None]
        }
    },
    "KNeighbors": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [5],
            "weights": ["uniform"]
        }
    },
    "DecisionTreeClassifier": {
        "model": DecisionTreeClassifier(),
        "params": {
            "max_depth": [None],
            "criterion": ["gini"]
        }
    },
    "RandomForest": {
        "model": RandomForestClassifier(),
        "params": {
            "n_estimators": [100],
            "max_depth": [None]
        }
    },
    "XGBoost": {
        "model": GradientBoostingClassifier(),
        "params": {
            "n_estimators": [100],
            "learning_rate": [0.1]
        }
    },
    "DummyClassifier": {
        "model": DummyClassifier(),
        "params": {
            "strategy": ["stratified"]
        }
    }
}

## Erstelle Evaluations Pipeline

In [8]:
def _train_pipeline(X, y, scoring_para, model_definition):
    """Trains all Models with given Hyperparameters from model_definition with GridSearchCV for given X, y
    Returns the best_score_, best_params_, and the best model for each Modeldefinition as a Dataframe
    """
    
    print("Start Training")
    
    scores = []
    
    # Runs GridSearchCV foreach Model from model_definition
    for model_name, mp in model_definition.items():
       
        clf =  GridSearchCV(mp['model'], mp['params'], cv=5, scoring=scoring_para, verbose=1)
        clf.fit(X, y.values.ravel())
        
        scores.append({
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_,
            "clf": clf
        })
        print(f"{model_name}: {clf.best_score_}, {clf.best_params_}")
    
    return pd.DataFrame(scores,columns=['model','best_score','best_params', "clf"])

In [9]:
def evaluate_pipeline(table, scaled=True, scoring_para="roc_auc", draw_curve=True, model_definition=model_params):
    """Takes as input a Dataset, it splits the table. It runs the _train_pipeline() for the given model_definition.
    It gives a first look at the best result.
    Returns all results as Dataframe.
    """
    X, y = xy_split(table)
    
    print("Data Splitted")
    result = _train_pipeline(X, y, scoring_para, model_definition)
    
    best_model = result["clf"][result["best_score"].idxmax()]
    
    if scaled:
        # Die Metriken werden auf ein Standard skaliertes Testset getestet
        if draw_curve: 
            plot_confusion_matrix(best_model, df_test_X, df_test_y)
        print(f"AUC Score: {calc_auc_real(best_model, draw_curve=draw_curve)}")
        print(f"F1 Score: {calc_f1_real(best_model)}")

    return result

In [10]:
def test_models_on_real_data(models_df, scaled=True):
    """Takes as input the results from evaluate_pipeline()
    It tests each model on the real Testsets with AUROC and F1 Score
    """
    array = []
    for idx, model in enumerate(models_df["clf"]):
        array.append([])
        array[idx].append(model.estimator)
        array[idx].append(calc_auc_real(model, draw_curve=False, scaled=scaled))
        array[idx].append(calc_f1_real(model, scaled=scaled))
        array[idx].append(model.best_params_)
    df_result = pd.DataFrame(data=array,columns=['Model', 'ROC AUC', 'F1', "Params"])
    df_result.loc['mean'] = df_result.mean()
    return df_result

# 3.3.5 Evaluierung
- Wende die entwickelte Evaluationspipeline auf die realen & generierten Daten an
- GridSearchCV testet alle möglichen Kombinationen der Hyperparameter mit der Kreuzvalidierung.
    - Die Berechnungen auf einen einzelnen Datensatz mit allen Hyperparametern, nimmt dementsprechend einiges an Zeit in Anspruch. ~1h
    - Besonders RandomForestClassifier und GradientBoostingClassifier benötigen dabei viel Zeit. 

### 4.3.5.1	Vergleich realer Datensatz und Datensatz mit Standardverteilung
#### Wende Pipeline auf die realen untransformierten Daten an
- Da die Daten nicht skaliert sind, wird die Logistische Regression für kleinere max_iter Werte eine ConvergenceWarning ausgeben.
- Aus diesen Grund werden für die Evaluierung auf den realen untransformierten Daten, für die Logistische Regression, andere Hyperparameter gewählt, als für die skalierten Daten.

In [11]:
model_params_real = model_params
model_params_real["LogisticRegression"] = {
        "model": LogisticRegression(),
        "params": {
            "max_iter": [200, 300, 500, 1000],
            "C": [0.1, 0.5, 0.8, 1, 3, 5, 10, 15],
            "class_weight": ["None", "balanced"]
        }
}
    
result_real_unscaled = evaluate_pipeline(table=df_train_unscaled, scaled=False, model_definition=model_params_real)
result_eINF_s0_unscaled = test_models_on_real_data(result_real_unscaled, scaled=False)
result_eINF_s0_unscaled.head(7)

# result_eINF_s0_unscaled.to_csv("results/eINF-s0-real-unscaled.csv")

Data Splitted
Start Training
Fitting 5 folds for each of 64 candidates, totalling 320 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogisticRegression: 0.843249352528294, {'C': 0.5, 'class_weight': 'balanced', 'max_iter': 300}
Fitting 5 folds for each of 22 candidates, totalling 110 fits
KNeighbors: 0.6289005492855779, {'n_neighbors': 70, 'weights': 'distance'}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
DecisionTreeClassifier: 0.834448380779526, {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 10}
Fitting 5 folds for each of 16 candidates, totalling 80 fits
RandomForest: 0.8557239156519383, {'max_depth': 10, 'n_estimators': 300}
Fitting 5 folds for each of 12 candidates, totalling 60 fits
XGBoost: 0.8561575010580429, {'learning_rate': 0.05, 'n_estimators': 400}
Fitting 5 folds for each of 1 candidates, totalling 5 fits
DummyClassifier: 0.5016354623563584, {'strategy': 'stratified'}


Unnamed: 0,Model,ROC AUC,F1,Params
0,LogisticRegression(),0.841193,0.325348,"{'C': 0.5, 'class_weight': 'balanced', 'max_it..."
1,KNeighborsClassifier(),0.629261,0.000968,"{'n_neighbors': 70, 'weights': 'distance'}"
2,DecisionTreeClassifier(),0.828791,0.246471,"{'criterion': 'entropy', 'max_depth': 5, 'min_..."
3,RandomForestClassifier(),0.851751,0.249132,"{'max_depth': 10, 'n_estimators': 300}"
4,GradientBoostingClassifier(),0.853002,0.282379,"{'learning_rate': 0.05, 'n_estimators': 400}"
5,DummyClassifier(),0.505837,0.064706,{'strategy': 'stratified'}
mean,,0.751639,0.194834,


#### Wende die Pipeline auf die realen skalierten Daten an

- optimiert auf AUROC Score

In [None]:
result_eINF_s0 = evaluate_pipeline(df_train)

test_eINF_s0 = test_models_on_real_data(result_eINF_s0)
test_eINF_s0.head(7)
# test_eINF_s0.to_csv("results/eINF-s0-real.csv")

Data Splitted
Start Training
Fitting 5 folds for each of 64 candidates, totalling 320 fits
LogisticRegression: 0.846721819754962, {'C': 15, 'class_weight': 'balanced', 'max_iter': 200}
Fitting 5 folds for each of 22 candidates, totalling 110 fits
KNeighbors: 0.83590209897019, {'n_neighbors': 70, 'weights': 'uniform'}
Fitting 5 folds for each of 36 candidates, totalling 180 fits
DecisionTreeClassifier: 0.834448380779526, {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 5}
Fitting 5 folds for each of 16 candidates, totalling 80 fits
RandomForest: 0.8557010763432125, {'max_depth': 10, 'n_estimators': 300}
Fitting 5 folds for each of 12 candidates, totalling 60 fits


#### Erstelle svg Grafik vom besten Modell auf den realen skalierten Daten

In [None]:
best_auc_model = test_eINF_s0["clf"][test_eINF_s0["best_score"].idxmax()]
proba = best_auc_model.predict_proba(df_test_X)[:, 1]

# In der SVG Grafik ist der Hintergrund bei den Labels Transparent. Bei den Schwarzen Hintergrund sieht man desswegen die Labels nicht. 
# draw_auc_roc_curve(df_test_y, proba, save_path="svg/roc_curve_xgboost_real_scaled.svg")

- Trainiere Modelle optimiert auf den den F1 Score
> Wenn optimiert auf F1, sind die F1 Werte auch etwas besser, AUROC wird etwas schlechter.

In [None]:
result_eINF_s0_f1 = evaluate_pipeline(table=df_train, scoring_para="f1", draw_curve=False)

test_eINF_s0_f1 = test_models_on_real_data(result_eINF_s0_f1)
test_eINF_s0_f1.head(7)

## 3.3.5.2	Vergleich realer Datensatz und generierte Daten ohne Differential Privacy
Starte die Pipeline auf jeden der Datensätze und speichere die Ergebnisse als csv Datei

In [None]:
df_eINF_s0_datasets = {
    "eINF_s0_50": df_eINF_s0_50,
    "eINF_s0_100": df_eINF_s0_100, 
    "eINF_s0_300": df_eINF_s0_300, 
    "eINF_s0_500": df_eINF_s0_500, 
    "eINF_s0_800": df_eINF_s0_800
}

for name, dataset in df_eINF_s0_datasets.items():
    print(f"Start evaluating {name}")
    result_models = evaluate_pipeline(table=dataset, draw_curve=False)
    result_test_on_real = test_models_on_real_data(result_models)
#     result_test_on_real.to_csv(f"results/{name}.csv")

## 3.3.5.3	Vergleiche Datensätze mit unterschiedlichen Privatsphäre Einschränkungen
- Definiere alle betrachteten Datensätze mit unterschiedlichen Werten für Epsilon

In [None]:
eX_sX_datasets = {    
    "e002_s60": df_e002_s60,
    "e002_s80": df_e002_s80,
    "e002_s120": df_e002_s120,
    
    "e005_s40": df_e005_s40,
    "e005_s50": df_e005_s50,
    "e005_s60": df_e005_s60,
    
    "e01_s15": df_e01_s15,
    "e01_s20": df_e01_s20,
    "e01_s25": df_e01_s25,
    
    "e1_s18": df_e1_s18,
    "e1_s2": df_e1_s2,
    "e1_s25": df_e1_s25,
    
    "e3_s09": df_e3_s09,
    "e3_s1": df_e3_s1,
    "e3_s14": df_e3_s14,
    
    "e5_s07": df_e5_s07,
    "e5_s08": df_e5_s08,
    "e5_s09": df_e5_s09,
    
    "e8_s06": df_e8_s06,
    "e8_s065": df_e8_s065,
    "e8_s07": df_e8_s07,
    
    "e12_s05": df_e12_s05,
    "e12_s055": df_e12_s055,
    "e12_s06": df_e12_s06,
    
    "e16_s05": df_e16_s05,
    "e16_s055": df_e16_s055,
    "e16_s057": df_e16_s057
}

- Teste alle Datensätze mit automatischen Hyperparametertuning mit allen vorher definierten Hyperparametern

In [None]:
for name, dataset in eX_sX_datasets.items():
    print(f"Start evaluating {name}")
    result_models = evaluate_pipeline(table=dataset, draw_curve=False)
    result_test_on_real = test_models_on_real_data(result_models)
#     result_test_on_real.to_csv(f"results/{name}-01.csv")

#### Um Overfitting beim Hyperparametertuning auf die synthetischen Daten entgegenzuwirken:
- Teste die Datensätze nur mit den Hyperparametern der am besten Performenden Modelle auf den realen Daten

In [None]:
for name, dataset in eX_sX_datasets.items():
    print(f"Start evaluating {name}")
    result_models = evaluate_pipeline(table=dataset, draw_curve=False, model_definition=model_params_best)
    result_test_on_real = test_models_on_real_data(result_models)
#     result_test_on_real.to_csv(f"results/{name}-02.csv")

- Teste die Datensätze mit den Standardeinstellungen der verwendeten Modelle

In [None]:
for name, dataset in eX_sX_datasets.items():
    print(f"Start evaluating {name}")
    result_models = evaluate_pipeline(table=dataset, draw_curve=False, model_definition=model_params_standard)
    result_test_on_real = test_models_on_real_data(result_models)
#     result_test_on_real.to_csv(f"results/{name}-03.csv")

## Erstelle Grafik mit Scores auf unterschiedlichen Epsilons

In [None]:
scores = [
    [0.02, 0.6184],
    [0.05, 0.5641],
    [0.1, 0.4982],
    [1, 0.6150],
    [3, 0.5314],
    [5, 0.6777],
    [8, 0.6858],
    [12, 0.6509],
    [16, 0.6968],
]

df_scores = pd.DataFrame(data=scores, columns=["epsilon", "auc"])
score_plot = sns.regplot(x=df_scores["epsilon"], y=df_scores["auc"], ci=95)
score_plot.set_ylim(0.4, 0.9)
score_plot.set_xlim(0, 17)

- Der lineare Verlauf spiegelt nicht den realen Verlauf der Kurve da. 
- Da der AUROC Score sicherlich nicht über 1 steigen kann.
- Es muss eine Funktion 2. Grades gezeichnet werden.
- Dafür werden die Werte von Epsilon gleich unendlich ebenfalls mit in die Grafik aufgenommen
- Da Epsilon unendlich jedoch nicht in die Grafik gezeichnet werden kann, wird für diese ein Epsilon Wert von 50 gesetzt.

In [None]:
scores = [
    [0.02, 0.6184],
    [0.05, 0.5641],
    [0.1, 0.4982],
    [1, 0.6150],
    [3, 0.5314],
    [5, 0.6777],
    [8, 0.6858],
    [12, 0.6509],
    [16, 0.6968],
    [50, 0.6843], # Epsilon unendlich generiert
    [50, 0.8405] # Epsilon unendlich real
]

df_scores = pd.DataFrame(data=scores, columns=["Epsilon", "AUROC"])
plt.figure(figsize=(6,4))
score_plot = sns.regplot(x=df_scores["Epsilon"], y=df_scores["AUROC"], ci=95, order=2)

score_plot.set_ylim(0.4, 0.9)
score_plot.set_xlim(0, 17)
# plt.savefig("svg/scores_for_different_epsilon.svg")

- Plot mit F1 Score

In [None]:
# f1 Werte sind nicht die echten, es wurde lediglich getestet 
# wie die Grafik in etwa aussehen würde
scores = [      
    [0.02, 0.6184, 0.16],
    [0.05, 0.5641, 0.15],
    [0.1, 0.4982, 0.08],
    [1, 0.6150, 0.16],
    [3, 0.5314, 0.13],
    [5, 0.6777, 0.18],
    [8, 0.6858, 0.19],
    [12, 0.6509, 0.18],
    [16, 0.6968, 0.20], 
    [50, 0.6843, 0.2083], # Epsilon unendlich generiert
    [50, 0.8405, 0.2675] # Epsilon unendlich real
]

df_scores = pd.DataFrame(data=scores, columns=["Epsilon", "ROC AUC", "F1"])
plt.figure(figsize=(9,6))

score_plot = sns.regplot(x=df_scores["Epsilon"], y=df_scores["ROC AUC"], ci=96, order=2)
score_plot = sns.regplot(x=df_scores["Epsilon"], y=df_scores["F1"], ci=96, order=2)

score_plot.set_ylim(0, 0.9)
score_plot.set_xlim(0, 16.5)

## Erstelle Konfusionsmatrix für bestes Modell auf realen Daten

In [None]:
model = LogisticRegression(C=1, class_weight="balanced", max_iter=200)
X_train, y_train = xy_split(df_train)

model.fit(X_train, y_train)
con_matrix = plot_confusion_matrix(model, df_test_X, df_test_y)
plt.xlabel('Vorhersage')
plt.ylabel("Korrekte Werte")

# plt.savefig("svg/best_plot_confusion_matrix.svg")

- Errechne Recall und Percision

In [None]:
recall = recall_score(df_test_y, model.predict(df_test_X))
precision = precision_score(df_test_y, model.predict(df_test_X))

print(f"Precision = {precision}, und Recall = {recall}")

# Fertig

- Ab hier Tests die es nicht in die Arbeit geschafft haben

## Welche Prediction erreicht ein Modell trainiert auf syn getestet auf syn

In [None]:
model = LogisticRegression()
X, y = xy_split(df_e001_s50)

X_train, X_test, y_train, y_test = train_test_split(X, y)

model.fit(X_train, y_train)
print(model.score(X_test, y_test))
plot_confusion_matrix(model, X_test, y_test)
print(f"Matrix des Models auf realen Daten: AUC = {calc_auc_real(model, draw_curve=False)}")
plot_confusion_matrix(model, df_test_X, df_test_y)

## Teste Qualität der Daten
- Kann einen Modell reale von synthetischen Daten unterscheiden?
- Stellt quasi den Diskriminator einer GAN Struktur dar.
> Die Logistische Regression kann nicht unterscheiden, der GradientBoostingClassifier jedoch sehr wohl, mit sehr guter Genauigkeit!

In [None]:
def combine_real_syn(df_real, df_syn):
    df_real_c = df_real.copy()
    df_syn_c = df_syn.copy()
    df_syn_c.drop("Unnamed: 0", axis=1, inplace=True, errors="ignore")
    df_real_c["real"] = 1
    df_syn_c["real"] = 0
    df_result = df_real_c.append(df_syn_c, ignore_index=True)
    df_result = df_result.sample(frac=1) # Shuffle
    return df_result

In [None]:
df_com = combine_real_syn(df_train, df_eINF_s0_100)

X_com = df_com.drop("real", axis=1)
y_com = df_com["real"]

X_train_com, X_test_com, y_train_com, y_test_com = train_test_split(X_com, y_com)

- Logistic Regression

In [None]:
model = LogisticRegression()
model.fit(X_train_com, y_train_com)
print(model.score(X_test_com, y_test_com))
plot_confusion_matrix(model, X_test_com, y_test_com)

- GradientBoostingClassifier

In [None]:
model = GradientBoostingClassifier()
model.fit(X_train_com, y_train_com)
print(model.score(X_test_com, y_test_com))
plot_confusion_matrix(model, X_test_com, y_test_com)

## Normaler Aufbau für einen einzelnen Classifier

In [None]:
X, y = xy_split(df_e3_s09)

model = LogisticRegression()
# model = DecisionTreeClassifier(criterion="entropy", max_depth=5)
# model = DummyClassifier(strategy="most_frequent")
# model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=400)
# model = RandomForestClassifier(n_estimators=200, max_depth=10)

model.fit(X, y)

# Teste Modell auf reale Daten
print(model.score(df_test_X, df_test_y))
plot_confusion_matrix(model, df_test_X, df_test_y)
draw_auc_roc_curve(df_test_y, model.predict_proba(df_test_X)[:,1])