# Customer Churn Prediction

* Dataset: https://www.kaggle.com/datasets/blastchar/telco-customer-churn

## Data Preparation

In [2]:
# import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [4]:
# Load the raw dataset from the local file `data.csv` into a DataFrame.
# NOTE(review): presumably the Telco customer churn CSV linked at the top of
# this notebook — confirm the file matches that source.
data = pd.read_csv("data.csv")

In [6]:
# Preview the first five records, transposed so that each column name becomes
# a row (easier to read for a wide table).
data.head(5).transpose()

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [17]:
# Report how many columns the frame has and list their names.
print("Number of columns::", len(data.columns))
print("Columns::", data.columns.tolist())

Number of columns:: 21
Columns:: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']


In [22]:
# Normalise column names: lowercase every name and replace spaces with '_'.
data.columns = [name.lower().replace(" ", "_") for name in data.columns]

In [18]:
# Normalise the text values of every object-dtype column: lowercase them and
# replace spaces with '_'. Note that this covers all object columns, including
# identifier-like ones such as the customer id.
categorical_cols = list(data.select_dtypes(include='object').columns)

for name in categorical_cols:
    lowered = data[name].str.lower()
    data[name] = lowered.str.replace(" ", "_")

In [27]:
# Re-inspect the first two records (transposed) to confirm that column names
# and string values were normalised.
data.head(2).transpose()

Unnamed: 0,0,1
customerid,7590-vhveg,5575-gnvde
gender,female,male
seniorcitizen,0,0
partner,yes,no
dependents,no,no
tenure,1,34
phoneservice,no,yes
multiplelines,no_phone_service,no
internetservice,dsl,dsl
onlinesecurity,no,yes


In [24]:
# Inspect each column's dtype to spot columns that were not parsed as numbers.
data.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [37]:
# Convert `totalcharges` from object to a numeric dtype. `errors='coerce'`
# turns every entry that cannot be parsed as a number into NaN instead of
# raising, so missing values must be checked for / handled before modelling.
data['totalcharges'] = pd.to_numeric(data['totalcharges'], errors='coerce')

# Encode `churn` as an integer: 1 where the value is 'yes', 0 otherwise.
# The guard keeps this cell idempotent: once the column is numeric, comparing
# it with 'yes' again would silently overwrite every label with 0.
if not pd.api.types.is_numeric_dtype(data['churn']):
    data['churn'] = (data['churn'] == 'yes').astype('int')

## Setting up validation framework
Split the data into train, validation, and test sets (60% / 20% / 20%) using scikit-learn's `train_test_split` function.

In [44]:
from sklearn.model_selection import train_test_split

# Separate the target (`churn`) from the features (every other column).
y = data['churn'].copy()
X = data.drop(columns='churn').copy()

# Stage 1: hold out 20% of all rows as the test set.
X_full, X_test, y_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Stage 2: hold out 25% of the remaining 80% (i.e. 20% of all rows) as the
# validation set, which leaves 60% of all rows for training.
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.25, random_state=42
)

In [48]:
# Report the (rows, columns) shape of each feature split.
for split_name, split_frame in (
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
):
    print(f"{split_name} shape::", split_frame.shape)

X_train shape:: (4225, 20)
X_val shape:: (1409, 20)
X_test shape:: (1409, 20)
