In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_validate, StratifiedKFold, train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, classification_report, confusion_matrix, precision_score, \
    recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
import os
import joblib

# Read both CSV files; df feeds the model below, df_test is only used for
# the row counts printed here.
df = pd.read_csv('data/churn_data_encoded.csv')
df_test = pd.read_csv('data/churn_data.csv')

# Row count of df_test before and after dropping rows that contain NaN
total_rows = len(df_test)
print(f"Anzahl der Testdaten: {total_rows}")
complete_rows = len(df_test.dropna())
print(f"Anzahl der Testdaten nach Entfernung von NaN: {complete_rows}")


# Define the custom scoring function and helper functions
def custom_score(y_true, y_pred):
    """Return the average misclassification cost, weighting misses 5x.

    A false positive (predicted 1, actual 0) costs 1 and a false negative
    (predicted 0, actual 1) costs 5. The summed cost is divided by the
    number of samples, so lower values are better.

    Args:
        y_true: Array-like of true binary labels (0/1).
        y_pred: Array-like of predicted binary labels (0/1), same shape
            as ``y_true``.

    Returns:
        The cost per sample as a float, or ``nan`` for empty input.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    # Convert up front: comparing a plain list with ``== 1`` yields a single
    # bool rather than an element-wise mask, which silently scored 0.
    # Positional arrays also avoid index alignment between two pandas Series.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {true_arr.shape} and {pred_arr.shape}"
        )

    total = len(true_arr)
    if total == 0:
        # No samples: the average is undefined; avoid the division warning.
        return float('nan')

    fp = np.count_nonzero((pred_arr == 1) & (true_arr == 0))
    fn = np.count_nonzero((pred_arr == 0) & (true_arr == 1))
    return (fp + 5 * fn) / total


def print_scores(y_true, y_pred):
    """Print accuracy, precision, recall, F1 and the custom cost score.

    Each metric is printed on its own line with two decimals.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.

    Returns:
        Tuple ``(accuracy, precision, recall, f1, custom)`` holding the
        unrounded metric values.
    """
    # Computed in the same order in which they are printed and returned.
    metrics = (
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Precision", precision_score(y_true, y_pred)),
        ("Recall", recall_score(y_true, y_pred)),
        ("F1 Score", f1_score(y_true, y_pred)),
        ("Custom Score", custom_score(y_true, y_pred)),
    )
    for label, value in metrics:
        print(f"{label}: {value:.2f}")
    return tuple(value for _, value in metrics)


def evaluate_classification(X, y, classifier):
    """Cross-validate ``classifier`` on ``(X, y)`` with five stratified folds.

    Args:
        X: Feature matrix.
        y: Target labels.
        classifier: Unfitted scikit-learn compatible estimator.

    Returns:
        The dictionary produced by ``cross_validate`` for the scorers
        ``accuracy``, ``precision``, ``recall``, ``f1`` and ``custom_score``.
    """
    # Shuffled folds with a fixed seed so repeated runs use the same splits.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    metrics = {
        'accuracy': 'accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1',
        # Lower cost is better, hence greater_is_better=False (scikit-learn
        # then reports this score with its sign flipped).
        'custom_score': make_scorer(custom_score, greater_is_better=False),
    }
    return cross_validate(classifier, X, y, cv=folds, scoring=metrics)


# Split into training and test sets, balance the classes and scale the features
X = df.drop('Churn', axis=1)
y = df['Churn']

# Hold out 30% for testing. Stratify on the target so both parts keep the
# same class ratio; an unstratified split of an imbalanced target can leave
# the test set with a noticeably different share of churners.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Balance the classes by oversampling the training part only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print(f"Balancierte Klassenverteilung im Trainingsset:\n{y_train_resampled.value_counts()}")

# Scale features; the scaler is fitted on the training data and reused
# unchanged for the test data
scaler = MinMaxScaler()
X_train_resampled_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)

# Model training and evaluation (baseline, 5-fold cross-validation)
# NOTE(review): the folds are drawn from data that was already resampled and
# scaled, so synthetic SMOTE samples derived from a fold's validation rows can
# end up in its training rows. These CV scores are therefore likely optimistic;
# consider an imblearn Pipeline (SMOTE -> scaler -> classifier) on X_train.
clf = LogisticRegression(max_iter=2000, solver='lbfgs', random_state=42)
scores = evaluate_classification(X_train_resampled_scaled, y_train_resampled, clf)


# Hyperparameter optimisation with GridSearchCV
def apply_grid_search(classifier, param_grid, X, y, cv=5, scoring='accuracy'):
    """Grid-search ``param_grid`` for ``classifier`` and return the best estimator.

    The winning parameter combination is printed after the search.

    Args:
        classifier: Estimator to tune.
        param_grid: Mapping of parameter names to lists of candidate values.
        X: Training features.
        y: Training labels.
        cv: Cross-validation setting forwarded to ``GridSearchCV``.
        scoring: Scoring setting forwarded to ``GridSearchCV``.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    search = GridSearchCV(estimator=classifier, param_grid=param_grid, scoring=scoring, cv=cv)
    search.fit(X, y)
    print("Best hyperparameters:", search.best_params_, sep="\n")
    return search.best_estimator_


# Candidate values handed to the grid search below
param_grid = dict(
    max_iter=[5000, 10000],
    solver=['lbfgs', 'liblinear'],
    C=[0.1, 1, 10],
)

# Tune a logistic regression on the resampled, scaled training data
best_model = apply_grid_search(
    classifier=LogisticRegression(random_state=42),
    param_grid=param_grid,
    X=X_train_resampled_scaled,
    y=y_train_resampled,
    cv=5,
    scoring='accuracy',
)

# Final model training and evaluation on the resampled, scaled training data
# NOTE(review): apply_grid_search returns GridSearchCV.best_estimator_, which
# scikit-learn refits on the full data by default, so this fit presumably
# repeats that work - confirm before removing.
best_model.fit(X_train_resampled_scaled, y_train_resampled)
y_train_pred_proba = best_model.predict_proba(X_train_resampled_scaled)[:, 1]
y_train_pred = best_model.predict(X_train_resampled_scaled)

print("Training classification report:",
      classification_report(y_train_resampled, y_train_pred), sep="\n")

conf_matrix_train = confusion_matrix(y_train_resampled, y_train_pred)
print("Training confusion matrix:", conf_matrix_train, sep="\n")

# Save the model together with the fitted scaler: the model was trained on
# MinMax-scaled features, so predictions on new data need the same scaler.
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/logistic_regression_model.pkl')
joblib.dump(scaler, 'models/minmax_scaler.pkl')

# Testing and visualisation
# Predict labels and the probability of the second class (column 1) for the test data
y_test_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
y_test_pred = best_model.predict(X_test_scaled)

# Print the test results
print("Test classification report:", classification_report(y_test, y_test_pred), sep="\n")

conf_matrix_test = confusion_matrix(y_test, y_test_pred)
print("Test confusion matrix:", conf_matrix_test, sep="\n")

# ROC curve for the test data
from sklearn.metrics import roc_curve, roc_auc_score

fpr, tpr, thresholds = roc_curve(y_test, y_test_pred_proba)
roc_auc = roc_auc_score(y_test, y_test_pred_proba)

roc_fig, roc_ax = plt.subplots(figsize=(10, 6))
roc_ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
roc_ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve')
roc_ax.legend(loc='best')
roc_ax.grid(True)
plt.show()

# Confusion matrix of the test data as a heatmap
import seaborn as sns

class_names = ['No Churn', 'Churn']
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(conf_matrix_test, annot=True, fmt='d', cmap='Blues',
                         xticklabels=class_names, yticklabels=class_names)
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_title('Confusion Matrix')
plt.show()

# Plot the predicted probabilities for the test data, coloured by true class
results = pd.DataFrame({
    'True Label': y_test,
    'Predicted Probability': y_test_pred_proba
})
# Readable class names for the hue legend: label 1 is churn, anything else is not
results['True Class'] = np.where(results['True Label'] == 1, 'Churn', 'No Churn')

plt.figure(figsize=(10, 6))
proba_ax = sns.scatterplot(x=np.arange(len(results)), y='Predicted Probability', hue='True Class',
                           hue_order=['No Churn', 'Churn'], data=results, palette='Set1',
                           s=100, alpha=0.6, edgecolor='k')
plt.axhline(0.5, ls='--', color='red')  # 0.5 probability cut-off
plt.xlabel('Sample Index')
plt.ylabel('Predicted Probability of Churn')
plt.title('Predicted Probabilities by True Class')
# Keep the legend seaborn built from the hue levels and only retitle it.
# Calling plt.legend(labels=[...]) without handles pairs labels with artists
# by position, which can attach "No Churn" to the dashed threshold line.
hue_legend = proba_ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('True Label')
plt.grid(True)
plt.show()

# Decision boundary plot: reduce the training features to two principal
# components and fit a separate logistic regression on that 2-D projection
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_resampled_scaled)

log_reg = LogisticRegression(
    solver='lbfgs',
    max_iter=2000,
    random_state=42,
).fit(X_train_pca, y_train_resampled)


def plot_decision_boundary(model, X, y, *, step=0.01, margin=1):
    """Plot a fitted classifier's decision regions over two features.

    The model is evaluated on a regular grid spanning the first two columns
    of ``X`` (padded by ``margin``); the predicted classes are drawn as filled
    contours with the samples scattered on top.

    Args:
        model: Fitted classifier whose ``predict`` accepts two-column input.
        X: Array-like with the two plotted features in columns 0 and 1.
        y: Labels used to colour the scattered samples.
        step: Grid spacing. The grid has roughly
            ``(x_range / step) * (y_range / step)`` points, so increase it
            for data with a wide value range.
        margin: Padding added around the data range on every side.

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")

    # Accept DataFrames as well as arrays; positional slicing needs an ndarray.
    X = np.asarray(X)

    x_min, x_max = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_min, y_max = X[:, 1].min() - margin, X[:, 1].max() + margin
    xx, yy = np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))
    # Predict every grid point, then restore the grid shape for contourf.
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor='k')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xlabel('PCA Feature 1')
    plt.ylabel('PCA Feature 2')
    plt.title('Decision Boundary')
    plt.show()


plot_decision_boundary(log_reg, X_train_pca, y_train_resampled)

# Pair plot of the two PCA components, coloured by class
df_pca = pd.DataFrame(X_train_pca, columns=['PCA1', 'PCA2'])
# Attach the labels by position. Assigning a Series aligns on its index, which
# would yield NaN labels whenever that index is not exactly 0..n-1.
df_pca['Churn'] = np.asarray(y_train_resampled)

sns.pairplot(df_pca, hue='Churn', markers=["o", "s"], palette='Set1')
plt.show()
