<a href="https://colab.research.google.com/github/naolloan/end-to-end-churn-ml-pipeline-and-fastApi-backend/blob/main/end_to_end_churn_ml_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import joblib

In [10]:
# 1. Create a Realistic Synthetic Dataset
np.random.seed(42)
data_size = 2000

# Generate features
usage = np.random.randint(1, 100, data_size)
support_calls = np.random.randint(0, 15, data_size)
age = np.random.randint(18, 70, data_size)
sub_type = np.random.choice(['Basic', 'Standard', 'Premium'], data_size)
gender = np.random.choice(['M', 'F'], data_size)

In [11]:
# 2. CREATE LOGIC: Churn is NOT random anymore
# We create a 'churn_score'. If it's high, the customer churns.
# Logic: + score for support calls, - score for usage, + score for Basic plans
sub_map = {'Basic': 2, 'Standard': 0, 'Premium': -2}
sub_weights = np.array([sub_map[s] for s in sub_type])

churn_score = (support_calls * 1.5) - (usage * 0.1) + sub_weights + np.random.normal(0, 2, data_size)

# If score > 2, Churn = 1 (Likely to leave)
y = (churn_score > 2).astype(int)

df = pd.DataFrame({
    'Usage_Hours': usage,
    'Subscription_Type': sub_type,
    'Age': age,
    'Support_Calls': support_calls,
    'Gender': gender,
    'Churn': y
})

print("--- Dataset Sample ---")
print(df.head())
print("\nChurn Distribution (0=Stay, 1=Churn):")
print(df['Churn'].value_counts())

--- Dataset Sample ---
   Usage_Hours Subscription_Type  Age  Support_Calls Gender  Churn
0           52          Standard   56              8      M      1
1           93           Premium   19             12      M      1
2           15             Basic   46             13      F      1
3           72             Basic   58              0      F      0
4           61           Premium   27             12      M      1

Churn Distribution (0=Stay, 1=Churn):
Churn
1    1325
0     675
Name: count, dtype: int64


In [12]:
# 3. Setup Preprocessing Pipeline
X = df.drop('Churn', axis=1)
y = df['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

num_features = ['Usage_Hours', 'Age', 'Support_Calls']
cat_features = ['Subscription_Type', 'Gender']

preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
])

In [13]:
# 4. Define and Train Pipelines
# We wrap everything in a Pipeline so that 'Usage_Hours' etc. are
# automatically scaled and encoded when we call .predict() in FastAPI.
log_model = Pipeline([
    ('pre', preprocessor),
    ('clf', LogisticRegression())
])

tree_model = Pipeline([
    ('pre', preprocessor),
    ('clf', DecisionTreeClassifier(max_depth=5))
])

print("\nTraining models...")
log_model.fit(X_train, y_train)
tree_model.fit(X_train, y_train)


Training models...


In [14]:
# 5. Evaluate
print("\n--- Logistic Regression Report ---")
print(classification_report(y_test, log_model.predict(X_test)))

# 6. Export for Deployment
joblib.dump(log_model, 'logistic_regression_model.joblib')
joblib.dump(tree_model, 'decision_tree_model.joblib')

print("\n✅ SUCCESS: New 'Smart' models exported as .joblib files!")


--- Logistic Regression Report ---
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       136
           1       0.95      0.93      0.94       264

    accuracy                           0.93       400
   macro avg       0.91      0.92      0.92       400
weighted avg       0.93      0.93      0.93       400


✅ SUCCESS: New 'Smart' models exported as .joblib files!
