In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, precision_score, mean_absolute_error, r2_score
from sklearn.metrics import roc_curve, precision_recall_curve
from imblearn.over_sampling import SMOTE
#import xgboost as xgb
#import shap
import matplotlib.pyplot as plt
from datetime import datetime



In [2]:
# At this point there is no need of using this particular dataset because you have already enriched your dataset when you worked on the recency,
# frequency and monetary. So stick to that dataset.
riders_data = pd.read_csv('riders.csv')
riders_rfm = pd.read_csv('riders_rfm.csv')
df = pd.merge(riders_rfm, riders_data[['user_id', 'churn_prob','age','loyalty_status','city']], on='user_id')

In [3]:
riders_data.head()

Unnamed: 0,user_id,signup_date,loyalty_status,age,city,avg_rating_given,churn_prob,referred_by
0,R00000,2025-01-24,Bronze,34.729629,Nairobi,5.0,0.142431,R00001
1,R00001,2024-09-09,Bronze,34.57102,Nairobi,4.7,0.674161,
2,R00002,2024-09-07,Bronze,47.13396,Lagos,4.2,0.510379,
3,R00003,2025-03-17,Bronze,41.658628,Nairobi,4.9,0.244779,
4,R00004,2024-08-20,Silver,40.681709,Lagos,3.9,0.26996,R00002


In [4]:
riders_rfm.head()

Unnamed: 0,user_id,recency,frequency,monetary
0,R00000,48,25,366.05
1,R00001,28,14,180.53
2,R00002,37,24,378.99
3,R00003,84,9,121.47
4,R00004,35,16,268.43


In [5]:
df.head()

Unnamed: 0,user_id,recency,frequency,monetary,churn_prob,age,loyalty_status,city
0,R00000,48,25,366.05,0.142431,34.729629,Bronze,Nairobi
1,R00001,28,14,180.53,0.674161,34.57102,Bronze,Nairobi
2,R00002,37,24,378.99,0.510379,47.13396,Bronze,Lagos
3,R00003,84,9,121.47,0.244779,41.658628,Bronze,Nairobi
4,R00004,35,16,268.43,0.26996,40.681709,Silver,Lagos


In [6]:
# Create target column
df['churned'] = (df['churn_prob'] > 0.5).astype(int)
df = df.drop(columns=['churn_prob'])

target_column = 'churned'
X = df.drop(columns=[target_column, 'user_id'])
y = df[target_column]

In [7]:
df.head()

Unnamed: 0,user_id,recency,frequency,monetary,age,loyalty_status,city,churned
0,R00000,48,25,366.05,34.729629,Bronze,Nairobi,0
1,R00001,28,14,180.53,34.57102,Bronze,Nairobi,1
2,R00002,37,24,378.99,47.13396,Bronze,Lagos,1
3,R00003,84,9,121.47,41.658628,Bronze,Nairobi,0
4,R00004,35,16,268.43,40.681709,Silver,Lagos,0


In [8]:
categorical_columns = ['loyalty_status','city']
numerical_columns =['recency','frequency','monetary','age',]
            
#Build a pipeline for numerical features:
# Impute missing values using the median.
# Scale the features using StandardScaler.
numerical_pipeline = Pipeline(
    steps=[
        ("imputer",SimpleImputer(strategy="median")), ## Handling Missing Values
        ("scaler",MinMaxScaler()),  # Use MinMaxScaler to keep values non-negative
        ])
# Build a pipeline for categorical features:
# Impute missing values using the most frequent value.
# One-hot encode categorical features.
categorical_pipeline = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("OneHotEncoder", OneHotEncoder())
            ])

# Combine both pipelines into a single ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_pipeline", numerical_pipeline, numerical_columns),
        ("cat_pipeline", categorical_pipeline, categorical_columns)
        ])


In [9]:
X_transformed = preprocessor.fit_transform(X)
## Retrieve column names generated by the OneHotEncoder for categorical features.
ohe = preprocessor.named_transformers_["cat_pipeline"].named_steps["OneHotEncoder"]
ohe_feature_names = list(ohe.get_feature_names_out(categorical_columns))

# Combine numeric columns with one-hot encoded categorical column names.
final_columns = numerical_columns + ohe_feature_names

# Construct a new DataFrame with the transformed features.
X_transformed_df = pd.DataFrame(X_transformed, columns=final_columns)


In [10]:
#Apply SMOTE for balancing classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_transformed_df, y)


In [11]:
print(f"After SMOTE - Class 1: {sum(y_resampled == 1)}, Class 0: {sum(y_resampled == 0)}")

After SMOTE - Class 1: 8937, Class 0: 8937


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)


In [13]:
y_train.head()

10031    1
6172     0
2190     0
10375    1
2514     0
Name: churned, dtype: int64

In [14]:
log_reg = LogisticRegression(random_state=42,max_iter=1000)
# Fit the model
log_reg.fit(X_train, y_train)
# Predictions
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
    
# Metrics
auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)

print(f"AUC: {auc:.3f}")
print(f"Precision: {precision:.3f}")

AUC: 0.524
Precision: 0.511


In [15]:
# Predictions
y_pred = log_reg.predict(X_test)
y_pred_proba = log_reg.predict_proba(X_test)[:, 1]
    
# Metrics
auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)

print(f"AUC: {auc:.3f}")
print(f"Precision: {precision:.3f}")


AUC: 0.524
Precision: 0.511


In [16]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [17]:
random_forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=1)
# Fit the model
random_forest.fit(X_train, y_train)
# Predictions
y_pred = random_forest.predict(X_test)
y_pred_proba = random_forest.predict_proba(X_test)[:, 1]
    
# Metrics
auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)

print(f"AUC: {auc:.3f}")
print(f"Precision: {precision:.3f}")


AUC: 0.968
Precision: 0.914


In [18]:
gradient_boosting = GradientBoostingClassifier(n_estimators=100)
# Fit the model
gradient_boosting.fit(X_train, y_train)
# Predictions
y_pred = gradient_boosting.predict(X_test)
y_pred_proba = gradient_boosting.predict_proba(X_test)[:, 1]
    
# Metrics
auc = roc_auc_score(y_test, y_pred_proba)
precision = precision_score(y_test, y_pred)

print(f"AUC: {auc:.3f}")
print(f"Precision: {precision:.3f}")

AUC: 0.916
Precision: 0.998


In [19]:
## MY EDITING ENDS HERE

In [20]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc as auc_func, precision_score, precision_recall_curve

# Get the best XGBoost model from GridSearchCV
best_xgb = xgb_model.best_estimator_

# Predict probabilities for the validation set
y_pred_proba = best_xgb.predict_proba(X_val)[:, 1]  # Probability for positive class
y_pred = best_xgb.predict(X_val)  # Predicted labels

# 1. ROC Curve
fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
roc_auc = auc_func(fpr, tpr)  # Use auc_func to avoid naming conflict

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

NameError: name 'xgb_model' is not defined

In [None]:
# 2. Precision Score
precision = precision_score(y_val, y_pred)
print(f"Precision Score: {precision:.4f}")

# Optional: Precision-Recall Curve
precision, recall, _ = precision_recall_curve(y_val, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='purple', lw=2, label='Precision-Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc='lower left')
plt.grid(True)
plt.show()

In [None]:
# Evaluate models
models = {'Logistic Regression': log_reg, 'XGBoost': xgb_model}
metrics = {}
roc_data = {}
pr_data = {}

for name, model in models.items():
    # Predictions
    y_pred = model.predict(X_val)
    y_pred_proba = model.predict_proba(X_val)[:, 1]
    
    # Metrics
    auc = roc_auc_score(y_val, y_pred_proba)
    precision = precision_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)
    metrics[name] = {'AUC': auc, 'Precision': precision, 'MAE': mae, 'R2': r2}
    
    # ROC Curve
    fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
    roc_data[name] = pd.DataFrame({'FPR': fpr, 'TPR': tpr})
    
    # PR Curve
    precision_vals, recall, _ = precision_recall_curve(y_val, y_pred_proba)
    pr_data[name] = pd.DataFrame({'Precision': precision_vals, 'Recall': recall})

# Print metrics
print("\nModel Performance Metrics:")
for name, metric in metrics.items():
    print(f"\n{name}:")
    print(f"AUC: {metric['AUC']:.3f} {'✅' if metric['AUC'] > 0.85 else '❌'}")
    print(f"Precision: {metric['Precision']:.3f} {'✅' if metric['Precision'] > 0.8 else '❌'}")
    print(f"MAE: {metric['MAE']:.3f}")
    print(f"R2: {metric['R2']:.3f}")

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))
for name in pr_data:
    plt.plot(pr_data[name]['Recall'], pr_data[name]['Precision'], label=name)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves')
plt.legend()
plt.grid(True)
plt.savefig('pr_curves_improved.png')  # Save the plot
plt.show()  # Display the plot in the notebook
plt.close()  # Close the figure to free memory

In [None]:
%matplotlib inline 
# Plot PR Curves
plt.figure(figsize=(8, 6))
for name in pr_data:
    plt.plot(pr_data[name]['Recall'], pr_data[name]['Precision'], label=name)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curves')
plt.legend()
plt.grid(True)
plt.savefig('pr_curves_improved.png')
plt.close()


In [None]:
# SHAP Feature Importance for Logistic Regression
explainer_log_reg = shap.LinearExplainer(best_log_reg, X_train_smote)
shap_values_log_reg = explainer_log_reg.shap_values(X_train_smote)
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values_log_reg, X_train_smote, feature_names=X.columns, show=False)
plt.title('SHAP Feature Importance - Logistic Regression')
plt.savefig('shap_log_reg_improved.png')
plt.close()

# SHAP Feature Importance for XGBoost
explainer_xgb = shap.TreeExplainer(best_xgb)
shap_values_xgb = explainer_xgb.shap_values(X_train_smote)
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values_xgb, X_train_smote, feature_names=X.columns, show=False)
plt.title('SHAP Feature Importance - XGBoost')
plt.savefig('shap_xgb_improved.png')
plt.close()

# Print interesting fact
print("\n", interesting_fact)