# Customer Churn Prediction with XGBoost

Thanks to Kaggle/BLASTCHAR for the dataset.\
Loaded the dataset:

In [1]:
import pandas as pd

df = pd.read_csv('C:\\Users\\Neriukas\\Desktop\\Code\\ML_Bootcamp\\ML_Projects\\Customer_Churn_Prediction\\Telco_Customer_Churn.csv')

Looking into the data:

In [2]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


We need to map the data into numerical values:

In [3]:
df = df.drop(columns="customerID")

d = {'Yes': 1, 'No': 0}
df['Partner'] = df['Partner'].map(d)
df['Dependents'] = df['Dependents'].map(d)
df['PhoneService'] = df['PhoneService'].map(d)
df['PaperlessBilling'] = df['PaperlessBilling'].map(d)
df['Churn'] = df['Churn'].map(d)

d = {'Male': 0, 'Female': 1}
df['gender'] = df['gender'].map(d)

d = {'Yes': 0, 'No': 1, 'No phone service': 2}
df['MultipleLines'] = df['MultipleLines'].map(d)

d = {'DSL': 0, 'Fiber optic': 1, 'No': 2}
df['InternetService'] = df['InternetService'].map(d)

d = {'Yes': 0, 'No': 1, 'No internet service': 2}
df['OnlineSecurity'] = df['OnlineSecurity'].map(d)
df['OnlineBackup'] = df['OnlineBackup'].map(d)
df['DeviceProtection'] = df['DeviceProtection'].map(d)
df['TechSupport'] = df['TechSupport'].map(d)
df['StreamingTV'] = df['StreamingTV'].map(d)
df['StreamingMovies'] = df['StreamingMovies'].map(d)

d = {'Month-to-month': 0, 'One year': 1, 'Two year': 2}
df['Contract'] = df['Contract'].map(d)

d = {'Electronic check': 0, 'Mailed check': 1, 'Bank transfer (automatic)': 2, 'Credit card': 3}
df['PaymentMethod'] = df['PaymentMethod'].map(d)

df['TotalCharges'].replace(" ", 0, inplace=True)
df['TotalCharges'] = df['TotalCharges'].astype(float)

df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,1,0,1,0,1,0,2,0,1,0,1,1,1,1,0,1,0.0,29.85,29.85,0
1,0,0,0,0,34,1,1,0,0,1,0,1,1,1,1,0,1.0,56.95,1889.5,0
2,0,0,0,0,2,1,1,0,0,0,1,1,1,1,0,1,1.0,53.85,108.15,1
3,0,0,0,0,45,0,2,0,0,1,0,0,1,1,1,0,2.0,42.3,1840.75,0
4,1,0,0,0,2,1,1,1,1,1,1,1,1,1,0,1,0.0,70.7,151.65,1


Splitting dataset into train data and test data:

In [4]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(df, test_size=0.2)

train_labels = train_data[['Churn']].copy()
train_data = train_data.drop(columns="Churn")

test_labels = test_data[['Churn']].copy()
test_data = test_data.drop(columns="Churn")

Now we'll load up XGBoost, and convert our data into the DMatrix format it expects. One for the training data, and one for the test data.

In [5]:
import xgboost as xgb

train = xgb.DMatrix(data=train_data, label=train_labels)
test = xgb.DMatrix(data=test_data, label=test_labels)

Defining our hyperparameters. We're choosing softmax since this is a multiple classification problem.

In [6]:
param = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': 19} 
epochs = 10

Let's train the model.

In [7]:
model = xgb.train(param, train, epochs)

Now we'll use the trained model to predict the customer churn:

In [12]:
predictions = model.predict(test)

Let's measure the accuracy on the test data:

In [13]:
from sklearn.metrics import accuracy_score

accuracy_score(test_labels, predictions)

0.7877927608232789

We should know what features were the most important for the model.

In [10]:
feature_importance = model.get_score(importance_type='gain')
sorted_features = sorted(feature_importance.items(), key=lambda x: x[1], reverse=True)
sorted_features

[('Contract', 95.80201721191406),
 ('InternetService', 18.635704040527344),
 ('tenure', 9.956835746765137),
 ('MonthlyCharges', 7.705359935760498),
 ('StreamingMovies', 3.7488601207733154),
 ('PhoneService', 3.5156948566436768),
 ('OnlineSecurity', 3.081786632537842),
 ('PaymentMethod', 3.0561161041259766),
 ('PaperlessBilling', 2.882417917251587),
 ('SeniorCitizen', 2.799180507659912),
 ('TotalCharges', 2.5629758834838867),
 ('Dependents', 2.230154037475586),
 ('StreamingTV', 2.1570193767547607),
 ('MultipleLines', 2.1305668354034424),
 ('OnlineBackup', 2.0984742641448975),
 ('TechSupport', 1.9024003744125366),
 ('gender', 1.508777141571045),
 ('Partner', 1.1581915616989136),
 ('DeviceProtection', 0.9851886630058289)]

Finally, let's find out how each unique value of the most important feature contributed to the model's predictions:

In [14]:
most_important_feature = sorted_features[0][0]

test_data_with_labels = test_data.copy()
test_data_with_labels['Churn'] = test_labels
test_data_with_labels['Predictions'] = predictions

for i in test_data[most_important_feature].unique():
    segment = test_data_with_labels[test_data_with_labels[most_important_feature] == i]
    accuracy_segment = accuracy_score(segment['Churn'], segment['Predictions'])
    print(f'Value: {i}, Segment Accuracy: {accuracy_segment}')
    print(f'Churn distribution for value {i}:\n{segment["Churn"].value_counts(normalize=True)}')
    print(f'Predictions distribution for value {i}:\n{segment["Predictions"].value_counts(normalize=True)}\n')

Value: 1, Segment Accuracy: 0.9022082018927445
Churn distribution for value 1:
Churn
0    0.902208
1    0.097792
Name: proportion, dtype: float64
Predictions distribution for value 1:
Predictions
0.0    1.0
Name: proportion, dtype: float64

Value: 0, Segment Accuracy: 0.6658064516129032
Churn distribution for value 0:
Churn
0    0.56129
1    0.43871
Name: proportion, dtype: float64
Predictions distribution for value 0:
Predictions
0.0    0.624516
1.0    0.375484
Name: proportion, dtype: float64

Value: 2, Segment Accuracy: 0.9716088328075709
Churn distribution for value 2:
Churn
0    0.971609
1    0.028391
Name: proportion, dtype: float64
Predictions distribution for value 2:
Predictions
0.0    1.0
Name: proportion, dtype: float64



From the output we can clearly see that customers with Month-to-month contract type are most likely to churn.