In [3]:
# Step 1: Import necessary libraries and load the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
import shap
import joblib

%pip install kaggle

# Unzip the file and load the dataset
import zipfile
import os

# Load the dataset
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Display basic information about the dataset
print("\
Dataset Info:")
print(df.info())

# Display first few rows
print("\
First few rows of the dataset:")
print(df.head())

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-n

In [4]:
# Data Preprocessing
# 1. Handle missing values
print("Missing values in the dataset:")
print(df.isnull().sum())

# Convert TotalCharges to numeric, handling any spaces
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Fill missing values
df['TotalCharges'].fillna(df['MonthlyCharges'], inplace=True)

# 2. Encode categorical variables
categorical_columns = df.select_dtypes(include=['object']).columns
le = LabelEncoder()

# Remove customerID as it's not needed for modeling
df_processed = df.drop('customerID', axis=1)

# Encode categorical variables
for column in categorical_columns:
    if column != 'customerID':
        df_processed[column] = le.fit_transform(df_processed[column])

# 3. Split features and target
X = df_processed.drop('Churn', axis=1)
y = df_processed['Churn']

# 4. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\
Shape of training data:", X_train_scaled.shape)
print("Shape of testing data:", X_test_scaled.shape)

# Save preprocessed data
joblib.dump(scaler, 'scaler.pkl')
print("\
Preprocessing complete and scaler saved.")

Missing values in the dataset:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64
Shape of training data: (5634, 19)
Shape of testing data: (1409, 19)
Preprocessing complete and scaler saved.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['MonthlyCharges'], inplace=True)


In [None]:
# Train and evaluate models
# 1. Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_pred = rf_model.predict(X_test_scaled)
rf_score = roc_auc_score(y_test, rf_model.predict_proba(X_test_scaled)[:, 1])

# 2. XGBoost
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train_scaled, y_train)
xgb_pred = xgb_model.predict(X_test_scaled)
xgb_score = roc_auc_score(y_test, xgb_model.predict_proba(X_test_scaled)[:, 1])

# 3. Logistic Regression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train_scaled, y_train)
lr_pred = lr_model.predict(X_test_scaled)
lr_score = roc_auc_score(y_test, lr_model.predict_proba(X_test_scaled)[:, 1])

# Print model performance
print("Model Performance (ROC-AUC Scores):")
print(f"Random Forest: {rf_score:.4f}")
print(f"XGBoost: {xgb_score:.4f}")
print(f"Logistic Regression: {lr_score:.4f}")

# Save the best model (assuming Random Forest performs best)
joblib.dump(rf_model, 'best_model.pkl')

# Feature Importance Analysis
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': rf_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=feature_importance.head(10))
plt.title('Top 10 Most Important Features')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# SHAP Values for model interpretability
explainer = shap.TreeExplainer(rf_model)
shap_values = explainer.shap_values(X_test_scaled)

# Plot SHAP summary
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test_scaled, feature_names=X.columns, show=False)
plt.tight_layout()
plt.savefig('shap_summary.png')
plt.close()

print("\
Feature importance and SHAP analysis plots have been saved.")

# Print detailed classification report for the best model (Random Forest)
print("\
Random Forest Classification Report:")
print(classification_report(y_test, rf_pred))

Model Performance (ROC-AUC Scores):
Random Forest: 0.8344
XGBoost: 0.8386
Logistic Regression: 0.8614
