In [81]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
!pip install -r requirements.txt

In [82]:
# Load the raw customer-churn sheet from the Excel workbook
raw_df = pd.read_excel('dataset/CustomerChurn.xlsx', sheet_name='WA_Fn-UseC_-Telco-Customer-Chur')

# Build the cleaned frame as one non-mutating chain so that `raw_df` stays
# intact and re-running this cell always yields the same `df`.
df = (
    raw_df
    # Drop the ID columns; they are not used as features
    .drop(columns=['Customer ID', 'LoyaltyID'])
    .assign(**{
        # Coerce 'Total Charges' to numeric; entries that cannot be parsed become NaN
        'Total Charges': lambda frame: pd.to_numeric(frame['Total Charges'], errors='coerce'),
        # Convert 'Senior Citizen' Yes/No to 1/0 (any other value maps to NaN)
        'Senior Citizen': lambda frame: frame['Senior Citizen'].map({'Yes': 1, 'No': 0}),
    })
    # Drop ONLY the rows whose Total Charges is missing. A bare dropna() would
    # also discard rows with a gap in any other column; those gaps are left for
    # the imputers in the modelling pipeline to handle.
    .dropna(subset=['Total Charges'])
)


In [83]:
# Target: the 'Churn' column encoded as 1 (Yes) / 0 (No).
# Features: every remaining column of the cleaned frame.
y = df['Churn'].map({'Yes': 1, 'No': 0})
X = df.drop(columns='Churn')

# Stage 1: keep 80% of the rows for training, hold out 20% (stratified on the target)
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: divide the held-out rows into validation (95%) and test (5%).
# NOTE(review): 5% of a 20% hold-out leaves roughly 1% of all rows for the
# test set, so test metrics will be noisy. Confirm this ratio is intended.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.05, stratify=y_temp, random_state=42
)

# Re-attach each label column to its feature frame (aligned on the row index)
train_set = X_train.assign(Churn=y_train)
val_set = X_val.assign(Churn=y_val)
test_set = X_test.assign(Churn=y_test)

# Persist the three splits as CSV files, without the index column
for _split_frame, _csv_path in (
    (train_set, 'dataset/train_data.csv'),
    (val_set, 'dataset/validation_data.csv'),
    (test_set, 'dataset/test_data.csv'),
):
    _split_frame.to_csv(_csv_path, index=False)

print("Datasets saved successfully!")


Datasets saved successfully!


In [84]:
# Read the three persisted splits back from disk
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        'dataset/train_data.csv',
        'dataset/validation_data.csv',
        'dataset/test_data.csv',
    ),
)

# For each split: features are all columns except 'Churn', labels are 'Churn'
X_train, y_train = train_df.drop(columns='Churn'), train_df['Churn']
X_val, y_val = val_df.drop(columns='Churn'), val_df['Churn']
X_test, y_test = test_df.drop(columns='Churn'), test_df['Churn']


In [85]:
# Numeric columns are listed explicitly; every other training column is
# treated as categorical (original column order is kept).
num_features = ['Tenure', 'Monthly Charges', 'Total Charges']
_numeric_lookup = set(num_features)
cat_features = [name for name in X_train.columns if name not in _numeric_lookup]

# Numeric branch: fill gaps with the column median, then standardise
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])


In [86]:
# End-to-end model: the preprocessing transformer followed by a random forest.
# RandomForestClassifier is already imported in the notebook's import cell,
# so it is not re-imported here.
model = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=300,            # number of trees in the forest
        max_depth=15,                # cap tree depth to limit overfitting
        min_samples_split=10,        # a node needs at least 10 samples to be split
        min_samples_leaf=4,          # every leaf keeps at least 4 samples
        max_features='sqrt',         # consider sqrt(n_features) candidates per split
        bootstrap=True,              # each tree is fit on a sample drawn with replacement
        random_state=42              # fixed seed so the fit is reproducible
    ))
])

# Fit the preprocessing steps and the classifier on the training split
model.fit(X_train, y_train)



In [87]:
# Score the fitted model on the validation split
y_val_pred = model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)

# Headline accuracy, then per-class metrics and the confusion matrix
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Classification Report on Validation Set:", classification_report(y_val, y_val_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")


Validation Accuracy: 0.7874
Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       981
           1       0.63      0.50      0.55       355

    accuracy                           0.79      1336
   macro avg       0.73      0.70      0.71      1336
weighted avg       0.78      0.79      0.78      1336

Confusion Matrix:
[[875 106]
 [178 177]]


In [88]:
# Score the fitted model on the test split
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

# Headline accuracy, then per-class metrics and the confusion matrix
print(f"Test Accuracy: {test_accuracy:.4f}")
print("Classification Report on Test Set:", classification_report(y_test, y_test_pred), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, y_test_pred), sep="\n")


Test Accuracy: 0.8592
Classification Report on Test Set:
              precision    recall  f1-score   support

           0       0.89      0.92      0.91        52
           1       0.76      0.68      0.72        19

    accuracy                           0.86        71
   macro avg       0.83      0.80      0.81        71
weighted avg       0.86      0.86      0.86        71

Confusion Matrix:
[[48  4]
 [ 6 13]]
