In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import shap
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
file_path = r"C:\Users\Nathan\Documents\WGU\D603\Task 1\churn_clean.csv"
df = pd.read_csv(file_path)

In [11]:
print(df.isnull().sum())

CaseOrder                  0
Customer_id                0
Interaction                0
UID                        0
City                       0
State                      0
County                     0
Zip                        0
Lat                        0
Lng                        0
Population                 0
Area                       0
TimeZone                   0
Job                        0
Children                   0
Age                        0
Income                     0
Marital                    0
Gender                     0
Churn                      0
Outage_sec_perweek         0
Email                      0
Contacts                   0
Yearly_equip_failure       0
Techie                     0
Contract                   0
Port_modem                 0
Tablet                     0
InternetService         2129
Phone                      0
Multiple                   0
OnlineSecurity             0
OnlineBackup               0
DeviceProtection           0
TechSupport   

In [3]:
df.columns = df.columns.str.strip()

In [4]:
df = df.drop(['CaseOrder', 'Customer_id', 'UID', 'Interaction',
              'City', 'State', 'County', 'Zip', 'Lat', 'Lng', 'Email', 'TimeZone', 'Area'], axis=1)

In [14]:
df.select_dtypes(include=['object', 'category'])

Unnamed: 0,Job,Marital,Gender,Churn,Techie,Contract,Port_modem,Tablet,InternetService,Phone,Multiple,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,PaperlessBilling,PaymentMethod
0,Environmental health practitioner,Widowed,Male,No,No,One year,Yes,Yes,Fiber Optic,Yes,No,Yes,Yes,No,No,No,Yes,Yes,Credit Card (automatic)
1,"Programmer, multimedia",Married,Female,Yes,Yes,Month-to-month,No,Yes,Fiber Optic,Yes,Yes,Yes,No,No,No,Yes,Yes,Yes,Bank Transfer(automatic)
2,Chief Financial Officer,Widowed,Female,No,Yes,Two Year,Yes,No,DSL,Yes,Yes,No,No,No,No,No,Yes,Yes,Credit Card (automatic)
3,Solicitor,Married,Male,No,Yes,Two Year,No,No,DSL,Yes,No,Yes,No,No,No,Yes,No,Yes,Mailed Check
4,Medical illustrator,Separated,Male,Yes,No,Month-to-month,Yes,No,Fiber Optic,No,No,No,No,No,Yes,Yes,No,No,Mailed Check
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Sport and exercise psychologist,Married,Male,No,No,Month-to-month,Yes,Yes,DSL,Yes,Yes,No,Yes,Yes,No,No,No,No,Electronic Check
9996,Consulting civil engineer,Divorced,Male,No,No,Two Year,No,No,Fiber Optic,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,Electronic Check
9997,IT technical support officer,Never Married,Female,No,No,Month-to-month,No,No,Fiber Optic,Yes,Yes,Yes,Yes,No,No,No,No,Yes,Bank Transfer(automatic)
9998,Water engineer,Separated,Male,No,No,Two Year,No,Yes,Fiber Optic,No,Yes,No,No,No,Yes,Yes,Yes,Yes,Credit Card (automatic)


In [5]:
df = pd.get_dummies(df, columns=['Job', 'Marital', 'Gender', 'Churn', 'Techie', 'Contract', 'Port_modem', 
'Tablet', 'InternetService', 'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 
        'StreamingMovies', 'PaperlessBilling', 'PaymentMethod'], drop_first=True)


In [18]:
df.to_csv('churn_cleaned.csv', index=False)

In [6]:
train_val, test = train_test_split(df, test_size=0.2, random_state=42)

train, val = train_test_split(train_val, test_size=0.25, random_state=42)

train.to_csv('train.csv', index=False)
val.to_csv('validation.csv', index=False)
test.to_csv('test.csv', index=False)

print("Training, Validation, and Test CSV files have been successfully exported.")

Training, Validation, and Test CSV files have been successfully exported.


In [7]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

X_train = train.drop('Churn_Yes', axis=1) 
y_train = train['Churn_Yes']

X_val = val.drop('Churn_Yes', axis=1)
y_val = val['Churn_Yes']

model = GradientBoostingClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
y_pred_prob = model.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_pred_prob)
conf_matrix = confusion_matrix(y_val, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {auc}")
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8825
Precision: 0.8114754098360656
Recall: 0.7346938775510204
F1 Score: 0.7711781888997079
AUC-ROC: 0.9505497924389095
Confusion Matrix:
[[1369   92]
 [ 143  396]]


In [9]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score

param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model = GradientBoostingClassifier(random_state=42)

grid = GridSearchCV(model, param_grid, scoring='f1', cv=cv, n_jobs=-1)

grid.fit(X_train, y_train)

print("Best Params :", grid.best_params_)
print("Best F1 score during CV :", grid.best_score_)

Best Params : {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best F1 score during CV : 0.7947797838729395


In [10]:
optimized_model = grid.best_estimator_

y_pred = optimized_model.predict(X_val)
y_pred_prob = optimized_model.predict_proba(X_val)[:, 1]

accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_pred_prob)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"AUC-ROC: {auc}")
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

Accuracy: 0.884
Precision: 0.8063872255489022
Recall: 0.7495361781076066
F1 Score: 0.7769230769230769
AUC-ROC: 0.9489510196462381
Confusion Matrix:
[[1364   97]
 [ 135  404]]
