In [51]:
import pickle
import joblib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [39]:
with open("artifacts/label_encoders.pkl", 'rb') as f:
    feature_columns = pickle.load(f)

In [40]:
kmeans_model = joblib.load("artifacts/kmeans_model.pkl")

In [41]:
with open("calibrated_model_cluster_01_CatBoost.pkl", 'rb') as f:
    calibrated_cat = pickle.load(f)

In [42]:
with open("calibrated_model_cluster_2_LR.pkl", 'rb') as f:
    calibrated_lr = pickle.load(f)

In [43]:
with open("scaler_cluster_2.pkl", 'rb') as f:
    scaler_cluster_2 = pickle.load(f)

print("All artifacts loaded")

All artifacts loaded


In [44]:
val_df = pd.read_csv('validation.csv')
print(val_df.shape)
val_df.head(2)

(705, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0979-PHULV,Male,0,Yes,Yes,69,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Credit card (automatic),99.45,7007.6,Yes
1,8390-FESFV,Female,0,No,No,62,Yes,Yes,DSL,No,...,Yes,Yes,Yes,Yes,Two year,No,Bank transfer (automatic),84.5,5193.2,No


In [45]:
val_df['tenure_bucket'] = np.where(val_df['tenure'] <= 6, 1,
                        np.where(val_df['tenure'] <= 20, 2,
                        np.where(val_df['tenure'] <= 50, 3,
                        4)))

In [46]:
val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 22 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        705 non-null    object 
 1   gender            705 non-null    object 
 2   SeniorCitizen     705 non-null    int64  
 3   Partner           705 non-null    object 
 4   Dependents        705 non-null    object 
 5   tenure            705 non-null    int64  
 6   PhoneService      705 non-null    object 
 7   MultipleLines     705 non-null    object 
 8   InternetService   705 non-null    object 
 9   OnlineSecurity    705 non-null    object 
 10  OnlineBackup      705 non-null    object 
 11  DeviceProtection  705 non-null    object 
 12  TechSupport       705 non-null    object 
 13  StreamingTV       705 non-null    object 
 14  StreamingMovies   705 non-null    object 
 15  Contract          705 non-null    object 
 16  PaperlessBilling  705 non-null    object 
 1

In [47]:
val_df['TotalCharges'] = np.where((val_df['TotalCharges'] == " ") | (val_df['TotalCharges'].isna()), 0, val_df['TotalCharges'])
val_df['TotalCharges'] = pd.to_numeric(val_df['TotalCharges'])

In [48]:
val_df_processed = val_df.drop(columns=['customerID', 'Churn', 'tenure'], errors='ignore')

In [49]:
top_feature_names = ['Contract',
 'TotalCharges',
 'MonthlyCharges',
 'OnlineSecurity',
 'tenure_bucket',
 'TechSupport',
 'PaymentMethod',
 'InternetService',
 'MultipleLines',
 'PaperlessBilling']

In [50]:
val_df_processed = val_df_processed[top_feature_names]
val_df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705 entries, 0 to 704
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Contract          705 non-null    object 
 1   TotalCharges      705 non-null    float64
 2   MonthlyCharges    705 non-null    float64
 3   OnlineSecurity    705 non-null    object 
 4   tenure_bucket     705 non-null    int64  
 5   TechSupport       705 non-null    object 
 6   PaymentMethod     705 non-null    object 
 7   InternetService   705 non-null    object 
 8   MultipleLines     705 non-null    object 
 9   PaperlessBilling  705 non-null    object 
dtypes: float64(2), int64(1), object(7)
memory usage: 55.2+ KB


In [52]:
val_df_dummy = val_df_processed.copy()
cate_cols = val_df_dummy.select_dtypes(include='object').columns

le = LabelEncoder()
for col in cate_cols:
    val_df_dummy[col] = le.fit_transform(val_df_dummy[col])

In [53]:
val_df_dummy.head(2)

Unnamed: 0,Contract,TotalCharges,MonthlyCharges,OnlineSecurity,tenure_bucket,TechSupport,PaymentMethod,InternetService,MultipleLines,PaperlessBilling
0,0,7007.6,99.45,0,4,0,1,1,2,1
1,2,5193.2,84.5,0,4,2,0,0,2,0


In [54]:
val_df_dummy.shape

(705, 10)

In [58]:
X_val_full = val_df_dummy.reindex(columns=top_feature_names, fill_value=0)

In [59]:
X_val_full.shape

(705, 10)

In [60]:
val_df['cohort_cluster'] = kmeans_model.predict(X_val_full)
print("Clusters assigned ✅")

Clusters assigned ✅




In [61]:
probas = []

X_val_scaled_cluster_2 = scaler_cluster_2.transform(X_val_full)

for idx, row in X_val_full.iterrows():
    cluster = val_df.loc[idx, 'cohort_cluster']
    row_array = row.values.reshape(1, -1)

    if cluster == 0:
        proba = calibrated_cat.predict_proba(row_array)[0,1]
    else:
        row_scaled = X_val_scaled_cluster_2[idx, :].reshape(1, -1)
        proba = calibrated_lr.predict_proba(row_scaled)[0,1]

    probas.append(proba)

val_df['pred_proba'] = probas

In [62]:
val_df['risk_bucket'] = pd.qcut(val_df['pred_proba'], q=4, labels=['Low', 'Moderate', 'High', 'Very High'])

In [63]:
final_columns = ['customerID', 'cohort_cluster', 'pred_proba', 'risk_bucket']
available_cols = [col for col in final_columns if col in val_df.columns]

print(val_df[available_cols].head())

print("Full val_df pipeline complete!")

   customerID  cohort_cluster  pred_proba risk_bucket
0  0979-PHULV               1    0.649964   Very High
1  8390-FESFV               1    0.122277         Low
2  1346-UFHAX               1    0.739628   Very High
3  1741-WTPON               1    0.384944        High
4  0410-IPFTY               1    0.572599        High
Full val_df pipeline complete!


In [65]:
val_df['risk_bucket'].value_counts()

risk_bucket
Low          177
Moderate     176
High         176
Very High    176
Name: count, dtype: int64