
# Telecom Customer Churn Prediction

## Packages

In [1]:
import pandas as pd
import numpy as np
import os

# Matplotlib for visualization
from matplotlib import pyplot as plt
# display plots in the notebook
%matplotlib inline

# Seaborn for easier visualization
import seaborn as sns

# # scikit-learn
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.compose import ColumnTransformer, make_column_transformer

# from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Function for creating model pipelines - imblearn
from imblearn.pipeline import make_pipeline as imbl_pipe

# # Over-sampling using SMOTE
from imblearn.over_sampling import SMOTE

# Classification metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

import joblib


## Load Analytical Base Table

In [2]:
# Read the analytical base table from the local Resources folder,
# print its (rows, columns) shape, and preview the first rows.
df = pd.read_csv("./Resources/Analytical_Base_Table.csv")
print(f"Dataframe dimensions: {df.shape}")
df.head()

Dataframe dimensions: (768, 9)


Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,30.5,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,30.5,26.6,0.351,31.0,0
2,8.0,183.0,64.0,23.0,30.5,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,0.471876,33.0,1


In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   preg    768 non-null    float64
 1   plas    768 non-null    float64
 2   pres    768 non-null    float64
 3   skin    768 non-null    float64
 4   test    768 non-null    float64
 5   mass    768 non-null    float64
 6   pedi    768 non-null    float64
 7   age     768 non-null    float64
 8   class   768 non-null    int64  
dtypes: float64(8), int64(1)
memory usage: 54.1 KB


### Separate the dataframe into features (X) and target (y)

In [4]:
# Target vector: the "class" column
y = df["class"]

# Feature matrix: every column except the target
X = df.drop(columns=["class"])

# Show the shapes of X and y to confirm they have the same number of rows
print(X.shape, y.shape)

(768, 8) (768,)


## Create a Train Test Split

In [5]:
# Fixed seed so the split is reproducible from run to run
random_state = 10

# Hold out 30% of the observations as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

# Print the number of observations in X_train, X_test, y_train and y_test
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

537 231 537 231


## Load Saved Models

In [8]:
# Load the five saved models from the local ./models folder.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code -- only load model files that come from a trusted source.
dt_model = joblib.load('./models/challa_decision_tree.sav')
knn_model = joblib.load('./models/challa_knn.sav')
lr_model = joblib.load("./models/challa_logistic_regression.sav")
rf_model = joblib.load('./models/challa_random_forest.sav')
xgb_model = joblib.load('./models/challa_XGBoost_model.sav')


**Dictionary `'models'`**

In [24]:
# Map each short model key to its human-readable name; class_rep_cm prints
# this name as the heading above a model's confusion matrix and report.
models = {
    'dt' : 'Decision Tree',
    'knn' : 'K-nearest Neighbors',
    'lr' : 'Logistic Regression',
    'rf' : 'Random Forest',
    'xgb' : 'XGBoost'
}

**Dictionary `'loaded_models'`**

In [9]:
# Dictionary of all loaded models, keyed by the same short names as `models`
# so a single key selects both the estimator and its display name.
loaded_models = {
    'dt' : dt_model,
    'knn': knn_model,
    'lr' : lr_model,
    'rf' : rf_model,
    'xgb' : xgb_model
}

'target_names' variable will be used later for printing evaluation results.

In [10]:
# Display names for the two classes; used to label the confusion matrix and
# the classification report.
# NOTE(review): assumes class 0 = Non-Diabetic and class 1 = Diabetic -- confirm
# against the encoding of the "class" column.
target_names = ['Non-Diabetic', 'Diabetic']

### Helper Functions

**The function for creating the dataframe with evaluation metrics for each model.**

<pre>input: loaded models dictionary
output: evaluation metrics dataframe</pre>

In [11]:
def evaluation_test(fit_models, X=None, y=None):
    """Build a table of evaluation metrics for every model on the test set.

    Parameters
    ----------
    fit_models : dict
        Maps a model name to a fitted estimator exposing ``predict``.
    X : optional
        Features to predict on. When omitted, the notebook-level ``X_test``
        is used (the original behaviour).
    y : optional
        True labels matching ``X``. When omitted, the notebook-level
        ``y_test`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by ``model``, with the columns
        ``precision``, ``recall``, ``f1_macro`` (all macro-averaged)
        and ``accuracy``.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied; mixing caller data with
        the notebook-level default would silently score mismatched rows.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be passed together or both omitted")
    if X is None:
        # Fall back to the notebook globals so existing calls keep working.
        X, y = X_test, y_test

    rows = []
    for name, model in fit_models.items():
        pred = model.predict(X)
        rows.append([name,
                     precision_score(y, pred, average='macro'),
                     recall_score(y, pred, average='macro'),
                     f1_score(y, pred, average='macro'),
                     accuracy_score(y, pred)])

    eval_df = pd.DataFrame(rows, columns=['model', 'precision', 'recall', 'f1_macro', 'accuracy'])
    # Return a new indexed frame rather than mutating with inplace=True.
    return eval_df.set_index('model')

**The helper function for displaying confusion matrix and classification report.**

<pre>input: loaded models dictionary, models dictionary and a dictionary key for one of the models
output: confusion matrix dataframe and classification report</pre>

In [12]:
def class_rep_cm(fit_models, models, model_id):
    """Show the confusion matrix and classification report for one model.

    Predictions are made on the notebook-level ``X_test`` and compared with
    ``y_test``; both outputs are labelled with ``target_names``.

    Parameters
    ----------
    fit_models : dict
        Maps a model key to a fitted estimator exposing ``predict``.
    models : dict
        Maps the same model key to the name printed as the heading.
    model_id : str
        Key of the model to report on; must exist in both dictionaries.
    """
    # Predict first, then look up the heading text
    y_pred = fit_models[model_id].predict(X_test)
    title = models[model_id]

    # Heading: the model name underlined with '=' characters
    print()
    print('\t', title)
    print('\t', '=' * len(title))

    # Confusion matrix with true labels on the rows and predictions on the columns
    cm_table = (
        pd.DataFrame(confusion_matrix(y_test, y_pred),
                     index=target_names, columns=target_names)
        .rename_axis('True Labels', axis='index')
        .rename_axis('Predicted Labels', axis='columns')
    )
    display(cm_table)

    # Per-class precision / recall / F1 summary
    print()
    print(classification_report(y_test, y_pred, target_names=target_names))


In [13]:
def evaluation_train(fit_models, X=None, y=None):
    """Build a table of evaluation metrics for every model on the training set.

    Parameters
    ----------
    fit_models : dict
        Maps a model name to a fitted estimator exposing ``predict``.
    X : optional
        Features to predict on. When omitted, the notebook-level ``X_train``
        is used (the original behaviour).
    y : optional
        True labels matching ``X``. When omitted, the notebook-level
        ``y_train`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        One row per model, indexed by ``model``, with the columns
        ``precision``, ``recall``, ``f1_macro`` (all macro-averaged)
        and ``accuracy``.

    Raises
    ------
    ValueError
        If only one of ``X`` and ``y`` is supplied; mixing caller data with
        the notebook-level default would silently score mismatched rows.
    """
    if (X is None) != (y is None):
        raise ValueError("X and y must be passed together or both omitted")
    if X is None:
        # Fall back to the notebook globals so existing calls keep working.
        X, y = X_train, y_train

    rows = []
    for name, model in fit_models.items():
        pred = model.predict(X)
        rows.append([name,
                     precision_score(y, pred, average='macro'),
                     recall_score(y, pred, average='macro'),
                     f1_score(y, pred, average='macro'),
                     accuracy_score(y, pred)])

    eval_df = pd.DataFrame(rows, columns=['model', 'precision', 'recall', 'f1_macro', 'accuracy'])
    # Return a new indexed frame rather than mutating with inplace=True.
    return eval_df.set_index('model')

### Display evaluation metrics

In [14]:
# Metrics of every loaded model on the training split.
evaluation_train(loaded_models)

Unnamed: 0_level_0,precision,recall,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dt,0.765367,0.784212,0.771506,0.787709
knn,0.739897,0.766986,0.740149,0.750466
lr,0.734242,0.750877,0.739437,0.757914
rf,0.827849,0.852241,0.835993,0.8473
xgb,0.916278,0.93717,0.924924,0.931099


In [15]:
# Metrics of every loaded model on the held-out test split.
evaluation_test(loaded_models)

Unnamed: 0_level_0,precision,recall,f1_macro,accuracy
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dt,0.732554,0.730364,0.731395,0.748918
knn,0.705763,0.715158,0.708021,0.718615
lr,0.721412,0.729047,0.724014,0.735931
rf,0.749847,0.761255,0.752941,0.761905
xgb,0.73284,0.725814,0.728741,0.748918


In [16]:
# Export the training-set metrics; the model-name column is relabelled
# 'Training_Set' so the file itself says which split it describes.
models_train_comp_df1 = (
    evaluation_train(loaded_models)
    .reset_index()
    .rename(columns={'model': 'Training_Set'})
)
models_train_comp_df1.to_csv('Training_set_performance.csv')

In [17]:
# Export the test-set metrics; the model-name column is relabelled
# 'Testing_Set' so the file itself says which split it describes.
models_test_comp_df1 = (
    evaluation_test(loaded_models)
    .reset_index()
    .rename(columns={'model': 'Testing_Set'})
)
models_test_comp_df1.to_csv('Testing_set_performance.csv')

During cross-validation we tried two scorers, f1_macro and accuracy, and then used the model that had the better recall for true positives ("Exits").

### Display confusion matrix and classification report 

In [19]:
# Show the confusion matrix and classification report for every model
for model_id in models:
    class_rep_cm(loaded_models, models, model_id)

NameError: name 'models' is not defined

**Data Description:**

* The target variable is 'Churn'. It has a value of 1 for churn and 0 for not churn.
* There are a lot of binary variables with 'Yes/No' values.
* There are three continuous variables: tenure, monthly charges, and total charges.
* The shape of the data is (6499,21)

**Data Cleaning:**

* The column 'Total_Charges' had 9 missing values. I have imputed them with the median of Total_Charges.
* In the continuous variables, there are no outliers.

**Key Observations from EDA:**

* `Tenure`: The average tenure of customers with the company is around 32 months.
* `Monthly_Charges`: Average monthly charges is 64.77 USD.
* `Total_Charges`: Average total charges is 2282.94 USD. The distribution is skewed slightly to the right.
* `Senior Citizen`: About 16% of customers are senior citizens.
* `Dependents`: More than 70% of customers don't have dependents.
* `Phone_Services`: More than 90% of customers have phone services enabled.
* `Internet_Service`: 44% of customers use Fibre Optic for internet service. 34% use DSL, while the rest don't have internet services at all. 
* `Contract`: There are 55% customers with month-to-month contracts. Other two types of contract are: One-year and Two-year
* `Payment_Method`: Electronic check is the most used payment method among the four methods of payment.
* `Churn`: The churn rate in the data is about 26%.
* `Churn vs Senior_Citizen`: Among Senior Citizen customers, the churn rate is about 41%. Senior Citizens are more likely to churn compared to others.
* `Churn vs Internet_Service`: Among customers who don't use Internet Service, the churn rate is very low (8%), while the churn rate is highest for Fibre Optic users (42%).
* `Churn vs Contract`: As the length of contract increases, the likelihood of churning decreases. 43% of monthly contract customers are likely to churn, followed by 11% of one-year contracts, while two-year contract customers have the least churn rate of 3%
* `Churn vs Payment_Method`: Customers with Electronic Check payment have a higher churn rate than any other payment method.
* `Churn vs Tenure`: As tenure increases, the customers are less likely to churn. Customers with low tenure have churned the most.
* `Churn vs Charges`: Customers who have churned, have higher monthly charges but lower total charges.
* `Contract vs Internet_Service`: Among the month-to-month contract customers, the most used service for Internet is Fiber Optic. Among the one-year and two-year contract customers, DSL service is more used as compared to Fiber Optic.
* `Contract vs Payment_Method`: Among the month-to-month contract customers, Electronic check method of payment is used extensively. Among the one-year and two-year contract customers, Credit Card and Bank transfer methods of payment are more used as compared to other methods
* `Internet_Service vs Payment_Method`: Customers without internet service use the mailed check payment method the most. Customers with Fibre Optic internet service use the Electronic Check method the most.

* In many other columns, like Online_Security, Online_Backup, Tech_Support, Streaming_Movies, etc. there is a level named 'No internet service'. Moreover, the count for the 'No internet service' level is also the same in all columns. This means that customers with No internet service don't have access to many other services like online security, streaming movies, etc.

### Conclusions


* K-Nearest Neighbors and Random Forest models overfit the data and are not able to generalise well.
* The Logistic Regression, Decision Tree and XGBoost models achieve good accuracy on both the training and test datasets.
* Logistic Regression has given a generalised performance with high recall and high precision.
* Overall, let's see what precision and recall mean for customer churn:

**Precision** - Of all the customers that the algorithm predicts will churn, how many of them do actually churn?

**Recall** – What percentage of customers that end up churning does the algorithm successfully find?

Both precision and recall values are important for customer churn.

High recall and low precision means the model is unnecessarily predicting non-churned customers as churned, adding overhead to the business.



### Recommendations

* The company should attract the customers likely to churn with bonus plans or discounts on recharges.
* The company should improve Fiber optic internet services as its use is somehow increasing the probability of churn
* The company should try to attract the newly joined customers with attractive offers so that they stay longer with the network. And also, try to make long-term contracts with the customers.
* As per observations from the model, customers who use the internet to stream TV and movies have higher chances of churning than those who don't. This may be due to the poor internet connection faced by the customer. The company should improve the internet connectivity to check this.
* Online Security and Tech Support should be provided to as many customers as possible