In [3]:
# Load stored variables: restore the four objects below from IPython's %store
# database into this kernel's namespace. The resampled pair is what the model
# is later fitted on; the test pair is what it is evaluated on.
# NOTE(review): these must have been saved with `%store` beforehand, presumably
# by an earlier preprocessing notebook — confirm before a Restart & Run All.
%store -r X_resampled y_resampled y_test X_test

----
#### Building a model with Logistic Regression
----

In [27]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# The objects restored via %store may be pandas DataFrames/Series;
# convert all four to numpy arrays in one pass.
X_resampled, y_resampled, y_test, X_test = (
    np.array(restored)
    for restored in (X_resampled, y_resampled, y_test, X_test)
)

# Build and fit the logistic-regression classifier on the resampled data.
# random_state is pinned for repeatable runs; max_iter is set to 500
# (presumably to give the solver room to converge — confirm).
clf = LogisticRegression(max_iter=500, random_state=99).fit(X_resampled, y_resampled)

# Hard class labels for the test set
y_pred = clf.predict(X_test)

# Per-class precision / recall / F1 report on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.91      0.67      0.77      1593
           1       0.36      0.73      0.48       407

    accuracy                           0.68      2000
   macro avg       0.63      0.70      0.63      2000
weighted avg       0.80      0.68      0.71      2000



----
#### Key Observations
- High Precision for Non-Churners (0.91). When the model predicts a customer won't churn (0), it's usually correct.
- Low Precision for Churners (0.36). When the model predicts churn (1), it's often wrong.
- Good Recall for Churners (0.73). The model catches most churners, which is important for retention strategies.
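- Overall accuracy is only 68%. Because the test set is imbalanced (1593 non-churners vs. 407 churners), a model that always predicts "no churn" would already reach about 80% accuracy, so accuracy alone is a poor guide here.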
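- Misclassifying churners is costly in both directions: the 27% of churners the model misses (recall 0.73) are lost retention opportunities, and the low churner precision (0.36) means most retention offers would go to customers who were not going to leave.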
----

In [29]:
from sklearn.metrics import precision_recall_curve

# Predicted probability of the positive class (column 1, i.e. class 1) per test row
y_scores = clf.predict_proba(X_test)[:, 1]

# Precision and recall at every candidate cut-off.
# NOTE: these arrays are computed but not used in this cell; the threshold
# below is picked by hand, not derived from the curve.
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)

# Hand-picked cut-off, lower than the 0.5 implied by clf.predict
optimal_threshold = 0.4

# Label a row 1 when its score reaches the cut-off, otherwise 0
y_pred_adjusted = np.where(y_scores >= optimal_threshold, 1, 0)

# Re-score the thresholded predictions
print(classification_report(y_test, y_pred_adjusted))

              precision    recall  f1-score   support

           0       0.93      0.55      0.69      1593
           1       0.32      0.83      0.46       407

    accuracy                           0.61      2000
   macro avg       0.62      0.69      0.58      2000
weighted avg       0.80      0.61      0.64      2000



----
#### Lowering the threshold from the default 0.5 to 0.4 increases churner recall (0.73 → 0.83), as we capture more churners, but precision worsens (0.36 → 0.32) because we also flag more non-churners (more false positives).
----

In [31]:
# Hand-picked stricter cut-off for comparison with the previous run
optimal_threshold = 0.6

# Label a row 1 when its score reaches the cut-off, otherwise 0
y_pred_adjusted = np.where(y_scores >= optimal_threshold, 1, 0)

# Re-score the thresholded predictions
print(classification_report(y_test, y_pred_adjusted))

              precision    recall  f1-score   support

           0       0.89      0.75      0.81      1593
           1       0.39      0.62      0.48       407

    accuracy                           0.73      2000
   macro avg       0.64      0.69      0.65      2000
weighted avg       0.78      0.73      0.75      2000



----
#### Increasing the threshold from the default 0.5 to 0.6 decreases churner recall (0.73 → 0.62), as we capture fewer churners, but precision improves slightly (0.36 → 0.39) because there are fewer false positives.



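#### Even at the 0.6 threshold, churner precision reaches only 0.39 while recall drops to 0.62, so threshold tuning alone cannot make this model precise enough. A different model is needed.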
----