In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,r2_score,precision_score,recall_score,f1_score

In [3]:
data=pd.read_csv("../dataset/churn-bigml-80.csv")
data.head(10)

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
5,AL,118,510,Yes,No,0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6,1.7,0,False
6,MA,121,510,No,Yes,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3,False
7,MO,147,415,Yes,No,0,157.0,79,26.69,103.1,94,8.76,211.8,96,9.53,7.1,6,1.92,0,False
8,WV,141,415,Yes,Yes,37,258.6,84,43.96,222.0,111,18.87,326.4,97,14.69,11.2,5,3.02,0,False
9,RI,74,415,No,No,0,187.7,127,31.91,163.4,148,13.89,196.0,94,8.82,9.1,5,2.46,0,False


In [4]:
data=data.drop(["State","Area code"],axis=1)

In [6]:
label=LabelEncoder()
encoded_internatinal_plan=label.fit_transform(data["International plan"])
encoded_voice_mail_plan=label.fit_transform(data["Voice mail plan"])
data["Churn"]=data["Churn"].astype(int)
data=data.drop(["International plan","Voice mail plan"], axis=1)
encoded_df = pd.DataFrame(
    {
        "International plan": encoded_internatinal_plan,
        "Voice mail plan": encoded_voice_mail_plan
    }
)
data=pd.concat([data,encoded_df],axis=1)

In [7]:
X=data.drop('Churn',axis=1)
y=data["Churn"]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2, random_state=42)

In [8]:
from imblearn.over_sampling import SMOTE
smote=SMOTE()
x_train_smote,y_train_smote=smote.fit_resample(X_train,y_train)

In [9]:
rf_model=RandomForestClassifier(
    n_estimators=200, #number of trees
    random_state=42, #for reproducibility
    max_depth=7,
    min_samples_split=10,
    min_samples_leaf=3,
    max_features='sqrt'
)

rf_model.fit(x_train_smote,y_train_smote)

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,7
,min_samples_split,10
,min_samples_leaf,3
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
def training_scores(y_train,y_pred):
    accuracy=round(accuracy_score(y_train,y_pred),3)
    precision=round(precision_score(y_train,y_pred),3)
    recall=round(recall_score(y_train,y_pred),3)
    f1=round(f1_score(y_train,y_pred),3)
    print(f"Training  scores:\n\tAccuracy={accuracy}\n\tPrecision={ precision}\n\tRecall={recall}\n\tF1_score={f1}")

def validating_scores(y_test_smote,y_pred):
    accuracy=round(accuracy_score(y_test_smote,y_pred),3)
    precision=round(precision_score(y_test_smote,y_pred),3)
    recall=round(recall_score(y_test_smote,y_pred),3)
    f1=round(f1_score(y_test_smote,y_pred),3)
    print(f"Validating  scores:\n\tAccuracy={accuracy}\n\tPrecision={ precision}\n\tRecall={recall}\n\tF1_score={f1}")

In [14]:
rf_pred_train=rf_model.predict(x_train_smote)
rf_test_pred=rf_model.predict(X_test)
training_scores(y_train_smote,rf_pred_train)
validating_scores(y_test,rf_test_pred)

Training  scores:
	Accuracy=0.896
	Precision=0.965
	Recall=0.822
	F1_score=0.888
Validating  scores:
	Accuracy=0.91
	Precision=0.707
	Recall=0.671
	F1_score=0.688


In [17]:
print("Random Forest Accuracy:", accuracy_score(y_test,rf_test_pred))
print("\nRandom Forest Report:\n", classification_report(y_test,rf_test_pred ))

Random Forest Accuracy: 0.9101123595505618

Random Forest Report:
               precision    recall  f1-score   support

           0       0.94      0.95      0.95       455
           1       0.71      0.67      0.69        79

    accuracy                           0.91       534
   macro avg       0.83      0.81      0.82       534
weighted avg       0.91      0.91      0.91       534



In [25]:
# Train Random Forest
rf_model.fit(x_train_smote, y_train_smote)

# Get predictions on training and test sets
rf_train_probs = rf_model.predict_proba(x_train_smote)[:, 1]
rf_test_probs = rf_model.predict_proba(X_test)[:, 1]

# Stack RF output as an extra feature
import numpy as np
x_train_hybrid = np.column_stack((x_train_smote, rf_train_probs))
x_test_hybrid = np.column_stack((X_test, rf_test_probs))

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_hybrid = scaler.fit_transform(x_train_hybrid)
x_test_hybrid = scaler.transform(x_test_hybrid)


# Train ANN on hybrid data
from sklearn.neural_network import MLPClassifier

ann_model = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu',
                          solver='adam', random_state=42, max_iter=500)

ann_model.fit(x_train_hybrid, y_train_smote)

# Evaluate
y_pred = ann_model.predict(x_test_hybrid)

print("Hybridization Accuracy:", accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))


Hybridization Accuracy: 0.8838951310861424
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       455
           1       0.59      0.68      0.64        79

    accuracy                           0.88       534
   macro avg       0.77      0.80      0.78       534
weighted avg       0.89      0.88      0.89       534



Compare with other boosting-based models

In [19]:
!pip install lightgbm catboost 

Collecting lightgbm
  Obtaining dependency information for lightgbm from https://files.pythonhosted.org/packages/5e/23/f8b28ca248bb629b9e08f877dd2965d1994e1674a03d67cd10c5246da248/lightgbm-4.6.0-py3-none-win_amd64.whl.metadata
  Downloading lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Collecting catboost
  Obtaining dependency information for catboost from https://files.pythonhosted.org/packages/bd/9e/feae59f6226f742fa3fa30ae126e0941f443d460e7c0fa9f79cdf3ee488f/catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata
  Downloading catboost-1.2.8-cp312-cp312-win_amd64.whl.metadata (1.5 kB)
Collecting graphviz (from catboost)
  Obtaining dependency information for graphviz from https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl.metadata
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting plotly (from catboost)
  Obtaining dependency information for plotly from https://files.pyth


[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report


In [21]:
# Gradient Boosting
gb_model = GradientBoostingClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42
)
gb_model.fit(x_train_smote, y_train_smote)
gb_pred = gb_model.predict(X_test)

# LightGBM
lgb_model = LGBMClassifier(
    n_estimators=100, learning_rate=0.1, max_depth=5, subsample=0.8,
    colsample_bytree=0.8, random_state=42
)
lgb_model.fit(x_train_smote, y_train_smote)
lgb_pred = lgb_model.predict(X_test)

# CatBoost
cat_model = CatBoostClassifier(
    iterations=100, learning_rate=0.1, depth=5, verbose=0, random_seed=42
)
cat_model.fit(x_train_smote, y_train_smote)
cat_pred = cat_model.predict(X_test)

[LightGBM] [Info] Number of positive: 1823, number of negative: 1823
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000793 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2622
[LightGBM] [Info] Number of data points in the train set: 3646, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [22]:
models = {
    "Random Forest": (rf_model, rf_test_pred),
    "Gradient Boosting": (gb_model, gb_pred),
    "LightGBM": (lgb_model, lgb_pred),
    "CatBoost": (cat_model, cat_pred)
}

for name, (model, pred) in models.items():
    print(f"\n {name} Results")
    print("Accuracy:", accuracy_score(y_test, pred))
    print(classification_report(y_test, pred))



 Random Forest Results
Accuracy: 0.9101123595505618
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       455
           1       0.71      0.67      0.69        79

    accuracy                           0.91       534
   macro avg       0.83      0.81      0.82       534
weighted avg       0.91      0.91      0.91       534


 Gradient Boosting Results
Accuracy: 0.9176029962546817
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       455
           1       0.72      0.72      0.72        79

    accuracy                           0.92       534
   macro avg       0.84      0.84      0.84       534
weighted avg       0.92      0.92      0.92       534


 LightGBM Results
Accuracy: 0.9307116104868914
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       455
           1       0.77      0.76      0.76        79

    accuracy            