# Baseline Model: Logistic Regression

In [41]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, f1_score

# Load data
# Column roles are declared once so the train and test splits cannot drift apart.
TARGET_COL = 'churn_status'
ID_COL = 'customer_id'
FEATURE_COLS = ['tenure_bucket', 'days_since_transfer', 'is_unemployed', 'risk_flag']
MODEL_COLS = [TARGET_COL, ID_COL] + FEATURE_COLS


def load_split(csv_path):
    """Read one data split and return only the modelling columns.

    Rows whose target is missing are dropped instead of being imputed:
    filling a missing label with 0 would silently count that customer as
    "not churned" in both training and evaluation. Remaining missing values
    (features and the id column) are filled with 0, as before.

    Parameters
    ----------
    csv_path : str
        Path to the CSV file for the split.

    Returns
    -------
    pandas.DataFrame
        Frame with the columns in MODEL_COLS order and no missing values.
    """
    # usecols skips parsing of unused columns; the trailing [MODEL_COLS]
    # restores the declared column order (usecols keeps the file's order).
    frame = pd.read_csv(csv_path, usecols=MODEL_COLS)[MODEL_COLS]
    frame = frame.dropna(subset=[TARGET_COL])
    # If the target was read as object dtype because of the NaNs just removed,
    # let pandas soft-convert it back to a concrete dtype.
    frame = frame.infer_objects()
    return frame.fillna(0)


df_train = load_split('../../data/train_2025-01-30_19-17-30.csv')
df_test = load_split('../../data/test_2025-01-30_19-17-32.csv')

# Define features and target (the id column is an identifier, not a predictor)
X_train = df_train.drop(columns=[TARGET_COL, ID_COL])
y_train = df_train[TARGET_COL]

X_test = df_test.drop(columns=[TARGET_COL, ID_COL])
y_test = df_test[TARGET_COL]

# Scale features: statistics are learned from the training split only and
# then re-applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Grid search hyperparameter tuning
# liblinear shuffles the data internally, so the seed is fixed to make the
# selected hyperparameters and fitted coefficients reproducible on re-run.
RANDOM_STATE = 42

# 6 values of C x 2 penalties x 2 class weightings = 24 candidates.
# Smaller C means stronger regularisation.
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'class_weight': [None, 'balanced'],
    'solver': ['liblinear']
}

grid_search = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
    param_grid=param_grid,
    scoring='roc_auc',  # candidates are ranked by cross-validated ROC AUC
    cv=5,
    n_jobs=-1,          # use every available core
    verbose=1
)

# NOTE(review): X_train_scaled was standardised on the full training set
# before cross-validation, so every validation fold contributed to the
# scaler's statistics. The effect on the CV scores is likely small, but a
# Pipeline(StandardScaler, LogisticRegression) would remove it - confirm
# whether downstream cells can accept a pipeline as `best_model`.
grid_search.fit(X_train_scaled, y_train)
best_model = grid_search.best_estimator_

# Score the held-out test split with the tuned estimator.
y_pred = best_model.predict(X_test_scaled)
# Column 1 of predict_proba is the second entry of best_model.classes_
# (presumably the churn class for this binary target).
y_pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]

# Label-based metrics share the same (y_true, y_pred) call signature;
# AUC is computed from the probabilities instead of the hard labels.
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y_test, y_pred_proba)

# Report: chosen hyperparameters, headline metrics, then the detailed breakdowns.
print(f"Best Parameters: {grid_search.best_params_}\n")
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    f"AUC: {auc:.4f}",
    sep="\n",
)
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters: {'C': 0.001, 'class_weight': None, 'penalty': 'l1', 'solver': 'liblinear'}

Accuracy: 0.7485
Precision: 0.2506
Recall: 0.3421
F1 Score: 0.2893
AUC: 0.6497

Confusion Matrix:
[[115164  25280]
 [ 16257   8454]]

Classification Report:
              precision    recall  f1-score   support

       False       0.88      0.82      0.85    140444
        True       0.25      0.34      0.29     24711

    accuracy                           0.75    165155
   macro avg       0.56      0.58      0.57    165155
weighted avg       0.78      0.75      0.76    165155



In [42]:
# Feature importance
# The signed coefficients of the fitted model are paired with the feature
# names and ranked by magnitude, so the sign of each effect stays visible.
coefficients = best_model.coef_[0]
feature_importance = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': coefficients}
).sort_values(by='Importance', key=abs, ascending=False)

print("\nFeature Importance:", feature_importance.to_string(index=False), sep="\n")


Feature Importance:
            Feature  Importance
      tenure_bucket   -0.630786
days_since_transfer    0.539868
      is_unemployed   -0.016565
          risk_flag   -0.006924
