# Baseline Model: Logistic Regression

In [48]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer

df_train = pd.read_csv('../../data/train_2025-02-04_17-10-32.csv')[['churn_status', 'customer_id', 'is_unemployed', 'total_complaints_30d', 'age', 'bank_diff', 'tenure']]
df_test = pd.read_csv('../../data/test_2025-02-04_17-10-33.csv')[['churn_status', 'customer_id', 'is_unemployed', 'total_complaints_30d', 'age', 'bank_diff', 'tenure']]

X_train = df_train.drop(columns=['churn_status', 'customer_id'])
y_train = df_train['churn_status']

X_test = df_test.drop(columns=['churn_status', 'customer_id'])
y_test = df_test['churn_status']

imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.8503
Precision: 0.4766
Recall: 0.0074
F1 Score: 0.0146
AUC: 0.5735

Confusion Matrix:
[[140243    201]
 [ 24528    183]]

Classification Report:
              precision    recall  f1-score   support

       False       0.85      1.00      0.92    140444
        True       0.48      0.01      0.01     24711

    accuracy                           0.85    165155
   macro avg       0.66      0.50      0.47    165155
weighted avg       0.80      0.85      0.78    165155



In [49]:
# Feature importance
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.coef_[0]
})
feature_importance = feature_importance.sort_values(by='Importance', key=lambda x: abs(x), ascending=False)

print("\nFeature Importance:")
print(feature_importance.to_string(index=False))


Feature Importance:
             Feature  Importance
       is_unemployed   -0.167785
total_complaints_30d    0.017272
                 age   -0.012107
              tenure    0.000139
           bank_diff    0.000029
