In [13]:
import pandas as pd

# Read the per-customer feature table from the working directory and show it.
features_csv = "customer_features.csv"
df = pd.read_csv(features_csv)
df

Unnamed: 0,customer_id,total_orders,total_revenue,avg_order_value,last_purchase_date,days_since_last_purchase,churn_flag
0,1,24,2804.0,116.833333,2023-06-20,972,0
1,3,8,1600.0,200.0,2023-01-12,1131,1
2,4,32,2640.0,82.5,2023-08-15,916,0
3,2,16,1200.0,75.0,2023-03-25,1059,1


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Numeric columns used as model inputs; churn_flag is the target.
feature_cols = [
    "total_orders",
    "total_revenue",
    "avg_order_value",
    "days_since_last_purchase",
]
X = df[feature_cols]
y = df["churn_flag"]

# Fit on every available row: there is no hold-out split in this cell.
# fit() returns the estimator itself, so `model` is the fitted object.
model = LogisticRegression().fit(X, y)

print("Model trained successfully")

Model trained successfully


In [15]:
# Pair each feature name with its fitted logistic-regression coefficient.
coef_table = pd.DataFrame({"feature": X.columns, "Importance": model.coef_[0]})

# Ordered by signed value (largest positive first), not by magnitude.
importance = coef_table.sort_values(by="Importance", ascending=False)

importance

Unnamed: 0,feature,Importance
3,days_since_last_purchase,0.026652
2,avg_order_value,0.006855
0,total_orders,-0.000594
1,total_revenue,-0.013508


In [16]:
# predict_proba returns one column per class; keep the second column.
# With 0/1 labels this is presumably P(churn_flag == 1) -- column order follows model.classes_.
class_probs = model.predict_proba(X)
df["churn_probability"] = class_probs[:, 1]
df

Unnamed: 0,customer_id,total_orders,total_revenue,avg_order_value,last_purchase_date,days_since_last_purchase,churn_flag,churn_probability
0,1,24,2804.0,116.833333,2023-06-20,972,0,1.4e-05
1,3,8,1600.0,200.0,2023-01-12,1131,1,0.99995
2,4,32,2640.0,82.5,2023-08-15,916,0,2.3e-05
3,2,16,1200.0,75.0,2023-03-25,1059,1,0.999996


In [17]:
def risk_segment(p, high=0.7, medium=0.4):
    """Map a churn probability to a coarse risk label.

    Parameters
    ----------
    p : float
        Churn probability to classify.
    high : float, default 0.7
        Values strictly greater than this are labelled "High Risk".
    medium : float, default 0.4
        Values strictly greater than this (and not above ``high``) are
        labelled "Medium Risk".

    Returns
    -------
    str
        One of "High Risk", "Medium Risk" or "Low Risk".

    Raises
    ------
    ValueError
        If ``medium`` is greater than ``high``, which would make the
        "Medium Risk" band unreachable.

    Notes
    -----
    Both comparisons are strict, so a value exactly equal to a cut-off falls
    into the lower band. NaN compares False against both cut-offs and is
    therefore labelled "Low Risk".
    """
    if medium > high:
        raise ValueError("medium threshold must not exceed high threshold")

    if p > high:
        return "High Risk"
    if p > medium:
        return "Medium Risk"
    return "Low Risk"

# Label every customer by running risk_segment element-wise over the scores.
df["risk_segment"] = df["churn_probability"].map(risk_segment)

df

Unnamed: 0,customer_id,total_orders,total_revenue,avg_order_value,last_purchase_date,days_since_last_purchase,churn_flag,churn_probability,risk_segment
0,1,24,2804.0,116.833333,2023-06-20,972,0,1.4e-05,Low Risk
1,3,8,1600.0,200.0,2023-01-12,1131,1,0.99995,High Risk
2,4,32,2640.0,82.5,2023-08-15,916,0,2.3e-05,Low Risk
3,2,16,1200.0,75.0,2023-03-25,1059,1,0.999996,High Risk


In [18]:
# Feature importance

from sklearn.inspection import permutation_importance
import pandas as pd

# Shuffle each feature 50 times and record the change in model score.
# Evaluated on the same X/y the model was fitted on in this part of the notebook.
r = permutation_importance(model, X, y, n_repeats=50, random_state=42)

# Mean score change per feature, largest first.
importance = (
    pd.DataFrame({"feature": X.columns, "importance": r.importances_mean})
    .sort_values(by="importance", ascending=False)
)

importance

Unnamed: 0,feature,importance
1,total_revenue,0.46
0,total_orders,0.0
2,avg_order_value,0.0
3,days_since_last_purchase,0.0


In [19]:
# Quick fix: rank coefficients by absolute value so strong negative
# effects are not pushed to the bottom of the table.
coef_table = pd.DataFrame({"feature": X.columns, "coefficient": model.coef_[0]})
coef_table.sort_values(by="coefficient", key=lambda col: col.abs(), ascending=False)

Unnamed: 0,feature,coefficient
3,days_since_last_purchase,0.026652
1,total_revenue,-0.013508
2,avg_order_value,0.006855
0,total_orders,-0.000594


In [20]:
# Generate realistic dataset
import numpy as np
import pandas as pd

np.random.seed(42)

n_customers = 1000

# The three draws below consume the seeded global RNG in this exact order;
# reordering them would change every generated value.
order_counts = np.random.poisson(12, n_customers)
order_values = np.random.normal(80, 20, n_customers).clip(10, 200)
recency_days = np.random.exponential(120, n_customers).astype(int)

df = pd.DataFrame({
    "customer_id": range(1, n_customers + 1),
    "total_orders": order_counts,
    "avg_order_value": order_values,
    "days_since_last_purchase": recency_days,
})

# Revenue is derived from the two columns above rather than drawn independently.
df["total_revenue"] = df["total_orders"] * df["avg_order_value"]

# Realistic churn rule (behavioral): the label is a deterministic function of
# recency and order count, with no noise added.
is_lapsed = df["days_since_last_purchase"] > 180
is_low_volume = df["total_orders"] < 5
df["churn_flag"] = (is_lapsed | is_low_volume).astype(int)

df.head()

Unnamed: 0,customer_id,total_orders,avg_order_value,days_since_last_purchase,total_revenue,churn_flag
0,1,11,69.11772,64,760.29492,0
1,2,14,76.744142,20,1074.417986,0
2,3,8,80.818384,22,646.547068,0
3,4,13,59.956251,64,779.431261,0
4,5,16,94.816487,169,1517.063791,0


In [21]:
# Train real model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Model inputs and target taken from the synthetic customer table.
feature_cols = [
    "total_orders",
    "total_revenue",
    "avg_order_value",
    "days_since_last_purchase",
]
X = df[feature_cols]
y = df["churn_flag"]

# Hold out 20% of rows for evaluation; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# max_iter raised from the library default to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [22]:
# Churn probability for every customer (training and hold-out rows alike);
# the second predict_proba column is kept.
class_probs = model.predict_proba(X)
df["churn_probability"] = class_probs[:, 1]

In [23]:
# Feature importance: fitted coefficients ranked by absolute value.
# Features are on their raw scales, so magnitudes are not directly comparable.
coef_table = pd.DataFrame({"feature": X.columns, "coefficient": model.coef_[0]})
coef_table.sort_values(by="coefficient", key=lambda col: col.abs(), ascending=False)

Unnamed: 0,feature,coefficient
0,total_orders,0.193822
2,avg_order_value,0.070363
3,days_since_last_purchase,0.057993
1,total_revenue,-0.005908


In [24]:
# Permutation importance
from sklearn.inspection import permutation_importance

# Shuffle each feature 20 times and record the change in model score.
# NOTE(review): this evaluates on the full X/y, which includes the rows the
# model was trained on -- consider X_test/y_test to measure hold-out importance.
r = permutation_importance(model, X, y, n_repeats=20, random_state=42)

# Mean score change per feature, largest first.
importance = (
    pd.DataFrame({"feature": X.columns, "importance": r.importances_mean})
    .sort_values(by="importance", ascending=False)
)

importance

Unnamed: 0,feature,importance
3,days_since_last_purchase,0.3133
1,total_revenue,0.0523
2,avg_order_value,0.03405
0,total_orders,0.0191


In [25]:
# ROC/Model quality (production metric)
from sklearn.metrics import roc_auc_score

# Score only the held-out rows, then compare the second-column probabilities
# with the true labels.
holdout_probs = model.predict_proba(X_test)
y_pred_prob = holdout_probs[:, 1]
roc_auc_score(y_test, y_pred_prob)

np.float64(0.9916530575213209)

In [26]:
# Keep only the identifier, the model score and the true label for export.
export_cols = ["customer_id", "churn_probability", "churn_flag"]
final_df = df[export_cols].copy()

# Two-way split at 0.5 (strictly greater -> "High Risk").
# NOTE(review): this differs from the three-tier risk_segment() defined
# earlier in the notebook (0.7 / 0.4 cut-offs) -- confirm which is intended.
final_df["risk_segment"] = final_df["churn_probability"].map(
    lambda prob: "High Risk" if prob > 0.5 else "Low Risk"
)

# Written to the working directory without the index column.
final_df.to_csv("churn_predictions.csv", index=False)