#  Capstone project - Telco Customer Churn dataset

# Importing Libraries

In [1]:
import xgboost as xgb
print(xgb.__version__)

2.1.2


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


# Load and Explore the Dataset

In [6]:
# Load dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

# Display first few rows
print(df.head())

# Check for null values and data types
print(df.info())
print(df.isnull().sum())

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

# Data Preprocessing

In [8]:
# Drop irrelevant columns
df = df.drop(['customerID'], axis=1)

# Handle missing values in the TotalCharges column
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.dropna(subset=['TotalCharges'])

# Encoding categorical variables
df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Splitting features and target variable
X = df.drop('Churn', axis=1)
y = df['Churn']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numeric features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Model Training and Evaluation Functions

In [11]:
# Function to train and evaluate models
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    
    return {"Accuracy": accuracy, "Precision": precision, "Recall": recall, "F1 Score": f1, "ROC AUC": auc}

# Dictionary to store results
results = {}


# Model Training

In [13]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
results['Logistic Regression'] = evaluate_model(log_reg)

# Decision Tree
tree = DecisionTreeClassifier(random_state=42)
results['Decision Tree'] = evaluate_model(tree)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
results['Random Forest'] = evaluate_model(rf)

# XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
results['XGBoost'] = evaluate_model(xgb)

# SVM
svm = SVC(probability=True, random_state=42)
results['SVM'] = evaluate_model(svm)

# Naive Bayes
nb = GaussianNB()
results['Naive Bayes'] = evaluate_model(nb)

# K-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
results['KNN'] = evaluate_model(knn)


Parameters: { "use_label_encoder" } are not used.



# Displaying Results

In [15]:
# Convert results dictionary to DataFrame for easy viewing
results_df = pd.DataFrame(results).T
print("Model Evaluation Results:\n", results_df)

Model Evaluation Results:
                      Accuracy  Precision    Recall  F1 Score   ROC AUC
Logistic Regression  0.787491   0.620579  0.516043  0.563504  0.831924
Decision Tree        0.725657   0.485222  0.526738  0.505128  0.663843
Random Forest        0.784648   0.625442  0.473262  0.538813  0.815336
XGBoost              0.763326   0.565916  0.470588  0.513869  0.809691
SVM                  0.781095   0.616197  0.467914  0.531915  0.782421
Naive Bayes          0.657427   0.428760  0.868984  0.574205  0.810223
KNN                  0.751955   0.535211  0.508021  0.521262  0.765145


# Hyperparameter Tuning

In [18]:
from sklearn.model_selection import RandomizedSearchCV

# Dictionary to store best parameters for each model
best_params = {}

# 1. Logistic Regression
log_reg = LogisticRegression(random_state=42, max_iter=1000)
param_grid_log_reg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']  # solvers that support both l1 and l2
}
grid_log_reg = RandomizedSearchCV(log_reg, param_grid_log_reg, cv=5, scoring='roc_auc', n_iter=10, random_state=42)
grid_log_reg.fit(X_train, y_train)
best_params['Logistic Regression'] = grid_log_reg.best_params_

# 2. Random Forest
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}
grid_rf = RandomizedSearchCV(rf, param_grid_rf, cv=5, scoring='roc_auc', n_iter=10, random_state=42)
grid_rf.fit(X_train, y_train)
best_params['Random Forest'] = grid_rf.best_params_

# 3. XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
param_grid_xgb = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
grid_xgb = RandomizedSearchCV(xgb, param_grid_xgb, cv=5, scoring='roc_auc', n_iter=10, random_state=42)
grid_xgb.fit(X_train, y_train)
best_params['XGBoost'] = grid_xgb.best_params_

# Display the best hyperparameters for each model
print("Best Parameters for each model:")
for model_name, params in best_params.items():
    print(f"{model_name}: {params}")

import warnings

# Example of issuing a warning
warnings.warn("This is a warning message.", UserWarning)

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Mouna\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Mouna\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\Mouna\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\Mouna\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameter

Best Parameters for each model:
Logistic Regression: {'solver': 'liblinear', 'penalty': 'l1', 'C': 100}
Random Forest: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30}
XGBoost: {'subsample': 0.8, 'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.8}




In [20]:
# Train models with best parameters and evaluate
tuned_results = {}

# Logistic Regression with best parameters
log_reg_tuned = LogisticRegression(**best_params['Logistic Regression'], random_state=42)
tuned_results['Logistic Regression'] = evaluate_model(log_reg_tuned)

# Random Forest with best parameters
rf_tuned = RandomForestClassifier(**best_params['Random Forest'], random_state=42)
tuned_results['Random Forest'] = evaluate_model(rf_tuned)

# XGBoost with best parameters
xgb_tuned = XGBClassifier(**best_params['XGBoost'], use_label_encoder=False, eval_metric='logloss', random_state=42)
tuned_results['XGBoost'] = evaluate_model(xgb_tuned)

# Convert results to DataFrame for easy viewing
tuned_results_df = pd.DataFrame(tuned_results).T
print("Tuned Model Evaluation Results:\n", tuned_results_df)

Parameters: { "use_label_encoder" } are not used.



Tuned Model Evaluation Results:
                      Accuracy  Precision    Recall  F1 Score   ROC AUC
Logistic Regression  0.788913   0.624595  0.516043  0.565154  0.831701
Random Forest        0.802416   0.679104  0.486631  0.566978  0.831597
XGBoost              0.793888   0.681034  0.422460  0.521452  0.833521
