### Comparison between different algorithms without improvements

* Load the preprocessed data
* Select the desired sample size
* Use different ML algorithms
* Save results for comparison with basic versions
* Compare **all** results via plot

In [15]:
# import all libraries used in "algorithm_comparisons.ipynb"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [3]:
# load "preprocessed_df.csv"
df = pd.read_csv("preprocessed_df.csv")

In [4]:
sample_size = 0.05  # change if necessary

# Zufällige Stichprobenauswahl
df_sample = df.sample(frac=sample_size, random_state=42)

# Teilen Sie die Stichprobe in Trainings- und Testsets
X_sample = df_sample.drop('income', axis=1)
y_sample = df_sample['income']

X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

In [9]:
# best mlp classifier

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('mlp', MLPClassifier(random_state=42))
])

# Definition des Gitters von Hyperparametern
param_grid = {
    'smote__sampling_strategy': [0.75, 1.0],
    'mlp__hidden_layer_sizes': [(50,), (100,)],
    'mlp__activation': ['relu', 'tanh'],
    'mlp__learning_rate_init': [0.001, 0.01]
}

# Initialisierung von GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=2)

# Ausführen von GridSearchCV
grid_search.fit(X_train, y_train)

# Beste Parameter anzeigen
print("Beste Parameter:", grid_search.best_params_)

# Bewertung auf dem Testset
best_mlp = grid_search.best_estimator_
y_pred_mlp = best_mlp.predict(X_test)

# Metriken berechnen
accuracy_mlp_improved = accuracy_score(y_test, y_pred_mlp)
precision_mlp_improved = precision_score(y_test, y_pred_mlp, zero_division=0)
recall_mlp_improved = recall_score(y_test, y_pred_mlp, zero_division=0)
f1_mlp_improved = f1_score(y_test, y_pred_mlp, zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

Fitting 3 folds for each of 16 candidates, totalling 48 fits
Beste Parameter: {'mlp__activation': 'relu', 'mlp__hidden_layer_sizes': (50,), 'mlp__learning_rate_init': 0.001, 'smote__sampling_strategy': 0.75}
Accuracy: 0.832
Precision: 0.6277113767153608
Recall: 0.7381572097865695
F1-Score: 0.6784688995215311


In [13]:
# Metriken berechnen
accuracy_mlp_improved = accuracy_score(y_test, y_pred_mlp)
precision_mlp_improved = precision_score(y_test, y_pred_mlp, zero_division=0)
recall_mlp_improved = recall_score(y_test, y_pred_mlp, zero_division=0)
f1_mlp_improved = f1_score(y_test, y_pred_mlp, zero_division=0)

print(f"Accuracy: {accuracy_mlp_improved}")
print(f"Precision: {precision_mlp_improved}")
print(f"Recall: {recall_mlp_improved}")
print(f"F1-Score: {f1_mlp_improved}")

Accuracy: 0.832
Precision: 0.6277113767153608
Recall: 0.7381572097865695
F1-Score: 0.6784688995215311


In [11]:
# best xgboost classifier

pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

# Definition des Gitters von Hyperparametern
param_grid = {
    'smote__sampling_strategy': [0.75, 1.0],
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [3, 6, 10, 15, 20],
    'xgb__learning_rate': [0.01, 0.1],
    'xgb__subsample': [0.8, 1.0],
    'xgb__colsample_bytree': [0.8, 1.0]
}

# Initialisierung von GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=2)

# Ausführen von GridSearchCV
grid_search.fit(X_train, y_train)

# Beste Parameter anzeigen
print("Beste Parameter:", grid_search.best_params_)

# Bewertung auf dem Testset
best_xgb = grid_search.best_estimator_
y_pred_xgb = best_model.predict(X_test)

Fitting 3 folds for each of 160 candidates, totalling 480 fits
Beste Parameter: {'smote__sampling_strategy': 0.75, 'xgb__colsample_bytree': 0.8, 'xgb__learning_rate': 0.1, 'xgb__max_depth': 6, 'xgb__n_estimators': 100, 'xgb__subsample': 0.8}
Accuracy: 0.832
Precision: 0.6277113767153608
Recall: 0.7381572097865695
F1-Score: 0.6784688995215311


In [12]:
# Metriken berechnen
accuracy_xgb_improved = accuracy_score(y_test, y_pred_xgb)
precision_xgb_improved = precision_score(y_test, y_pred_xgb, zero_division=0)
recall_xgb_improved = recall_score(y_test, y_pred_xgb, zero_division=0)
f1_xgb_improved = f1_score(y_test, y_pred_xgb, zero_division=0)

print(f"Accuracy: {accuracy_xgb_improved}")
print(f"Precision: {precision_xgb_improved}")
print(f"Recall: {recall_xgb_improved}")
print(f"F1-Score: {f1_xgb_improved}")

Accuracy: 0.832
Precision: 0.6277113767153608
Recall: 0.7381572097865695
F1-Score: 0.6784688995215311


In [16]:
# best random forest
pipeline = Pipeline([
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))
])

# Definition des Gitters von Hyperparametern
param_grid = {
    'smote__sampling_strategy': [0.75, 1.0],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [5, 10, 20, None],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__max_features': ['sqrt', 'log2']
}

# Initialisierung von GridSearchCV
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3, scoring='f1', n_jobs=-1, verbose=2)

# Ausführen von GridSearchCV
grid_search.fit(X_train, y_train)

# Beste Parameter anzeigen
print("Beste Parameter:", grid_search.best_params_)

# Bewertung auf dem Testset
best_rf = grid_search.best_estimator_
y_pred_rf = best_rf.predict(X_test)

Fitting 3 folds for each of 128 candidates, totalling 384 fits
Beste Parameter: {'rf__max_depth': 20, 'rf__max_features': 'log2', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 5, 'rf__n_estimators': 200, 'smote__sampling_strategy': 0.75}


In [17]:
# Metriken berechnen
accuracy_rf_improved = accuracy_score(y_test, y_pred_rf)
precision_rf_improved = precision_score(y_test, y_pred_rf, zero_division=0)
recall_rf_improved = recall_score(y_test, y_pred_rf, zero_division=0)
f1_rf_improved = f1_score(y_test, y_pred_rf, zero_division=0)

print(f"Accuracy: {accuracy_rf_improved}")
print(f"Precision: {precision_rf_improved}")
print(f"Recall: {recall_rf_improved}")
print(f"F1-Score: {f1_rf_improved}")

Accuracy: 0.838
Precision: 0.6349892008639308
Recall: 0.7652264445601249
F1-Score: 0.6940509915014164
