## Customer Churn Prediction Model

This notebook uses the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from Kaggle to train a machine learning model that predicts customer churn.

In [1]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the pre-split training and validation sets from the local data folder.
# Both frames are loaded untouched; the customerID column is removed from the
# feature frames in a later cell.
train, val = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/training_data.csv", "./data/validation_data.csv")
)

In [3]:
# Summary statistics for the numeric columns of the training set.
# NOTE(review): the output lists only SeniorCitizen, tenure and MonthlyCharges;
# TotalCharges is scaled as a numeric feature later in the notebook, so its
# absence here suggests it was read as text — confirm with train.dtypes.
train.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,5282.0,5282.0,5282.0
mean,0.16471,32.375426,64.893449
std,0.370954,24.595876,30.142527
min,0.0,0.0,18.25
25%,0.0,8.0,35.5625
50%,0.0,29.0,70.35
75%,0.0,56.0,89.9875
max,1.0,72.0,118.75


In [4]:
# Remove unusable rows, then split each frame into features and target.
#
# A field holding a single blank (' ') carries no value: mask such cells to
# NaN and drop every row that contains a NaN in any column.
train = train[train != ' '].dropna()
val = val[val != ' '].dropna()

# TotalCharges does not appear in the numeric summary from train.describe(),
# i.e. it was read as text (presumably because of the blank entries removed
# above). Cast it to a real numeric dtype here instead of relying on the
# scaler's implicit string-to-float coercion; pd.to_numeric raises right away
# if any remaining value is not a valid number.
train = train.assign(TotalCharges=pd.to_numeric(train["TotalCharges"]))
val = val.assign(TotalCharges=pd.to_numeric(val["TotalCharges"]))

# Features: every column except the Churn target.
train_x_orig = train.drop("Churn", axis=1)
train_y = train["Churn"]

val_x_orig = val.drop("Churn", axis=1)
val_y = val["Churn"]

In [5]:
# customerID is a row identifier, not a feature for training, so remove it
# from both feature frames. Reassigning (rather than inplace=True) together
# with errors="ignore" keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
train_x_orig = train_x_orig.drop(columns="customerID", errors="ignore")
val_x_orig = val_x_orig.drop(columns="customerID", errors="ignore")

In [6]:
# Sanity check: neither feature frame may contain a missing value from here on.
for feature_frame in (train_x_orig, val_x_orig):
    assert not feature_frame.isnull().any().any()

In [7]:
# Encode the Churn target of both splits as a binary indicator column.

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

# Learn the label -> indicator mapping from the training target only.
train_y = lb.fit_transform(train_y)

# Re-use the mapping fitted on the training labels. Calling fit_transform here
# would refit the encoder on the validation labels, so the two splits would no
# longer be guaranteed to share one encoding (and `lb` would end up describing
# the validation set instead of the training set).
val_y = lb.transform(val_y)

In [47]:
# imports for preprocessing, dimensionality reduction and the candidate models

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
import xgboost as xgb

In [57]:
# Build and fit the full modelling pipeline:
#   1. one-hot encode the categorical columns and standardize the numeric ones,
#   2. reduce the encoded features with PCA, keeping 95% of the variance,
#   3. classify with gradient-boosted trees.

from sklearn.pipeline import Pipeline

CATEGORICAL_FEATURES = [
    "gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
    "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
    "Contract", "PaperlessBilling", "PaymentMethod",
]
NUMERIC_FEATURES = ["tenure", "MonthlyCharges", "TotalCharges"]

# The model draws a random 80% subsample of rows per boosting stage; a fixed
# seed makes the fitted model and its scores reproducible across re-runs.
RANDOM_STATE = 42

column_transformer = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": a category that never occurred in the
        # training data is encoded as all zeros instead of raising at
        # predict/score time.
        ("onehot", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_FEATURES),
        ("standardize", StandardScaler(), NUMERIC_FEATURES),
    ],
    remainder="passthrough",
    # Always hand a dense array to the PCA step, regardless of how sparse the
    # one-hot block turns out to be.
    sparse_threshold=0,
)

# Alternative final estimator tried in an earlier iteration:
#   RandomForestClassifier(n_estimators=1500, max_samples=0.75, max_features=0.75)
pipe = Pipeline([
    ("column_transformer", column_transformer),
    ("pca", PCA(n_components=0.95)),
    ("gbm", GradientBoostingClassifier(
        learning_rate=0.01,
        n_estimators=1000,
        subsample=0.8,
        max_depth=5,
        random_state=RANDOM_STATE,
    )),
])

# The binarized target is a column vector; flatten it to 1-D for fitting.
pipe.fit(train_x_orig, train_y.ravel())

In [58]:
# Score the fitted pipeline on the data it was trained on.
train_accuracy = pipe.score(train_x_orig, train_y)
train_accuracy

0.9028831562974203

In [59]:
# Score the fitted pipeline on the validation data.
val_accuracy = pipe.score(val_x_orig, val_y)
val_accuracy

0.8252299605781866

In [32]:
# NOTE(review): every statement in this cell is commented out, so running it
# does nothing. The score displayed beneath the cell presumably dates from an
# earlier run in which the code was still active — confirm, then either
# restore the experiment or clear the stale output.
#
# Disabled baseline experiment (numeric features only):
# drop all categorical variables from train_x_orig and val_x_orig into new dataframes train_x_d and val_x_d.
# then use these dataframes to train a decision tree classifier

#train_x_d = train_x_orig[["tenure", "MonthlyCharges", "TotalCharges"]]
#val_x_d = val_x_orig[["tenure", "MonthlyCharges", "TotalCharges"]]

# train decision tree classifier

#from sklearn.tree import DecisionTreeClassifier

#dt = DecisionTreeClassifier()
#dt.fit(train_x_d, train_y)

# score decision tree classifier

#dt.score(train_x_d, train_y)


0.9897572078907435

In [61]:
#dt.score(val_x_d, val_y)

# Persist the fitted pipeline (preprocessing steps and classifier together, as
# one object) to a single pickle file.
# Only load this file from a trusted location: unpickling can execute
# arbitrary code.

import pickle

# The context manager closes the file even if pickling fails part-way through.
with open("pipe.pickle", "wb") as pipe_pickle:
    pickle.dump(pipe, pipe_pickle)