# Churn Models for Telco Company Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("C:/Users/nejat/OneDrive/Desktop/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.columns # To see all the columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [5]:
# I dropped customer ID. Because this is not gonna have any bearing on our model.
df.drop('customerID', axis=1, inplace =True)

In [6]:
df.info()
# We have 20 columns.
# And We have 7043 rows of data.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [7]:
#The first thing that i would like to do is to 'replace the churn column with 1 and 0.'.
#Because if we are gonna use this in sckit-learn or in an ml model, we need to ensure that everything is numerical.
# inplace =true to remain.
df['Churn'].replace({'Yes':1, 'No':0}, inplace =True)

In [8]:
# change the column to numeric. it was in an object form, if you look the  df.info()
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors= 'coerce')

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


In [10]:
# Then I would like to check if there is any null values.
df.isnull().any().sum()

1

In [11]:
# I dropped all the Na values and said inplace equals true.
df.dropna(axis=0, inplace =True)

In [12]:
# Then I wanted to take a look at the two columns, 'tenure' and 'MonthlyCharges'.
df[['tenure', 'MonthlyCharges']].describe()

Unnamed: 0,tenure,MonthlyCharges
count,7032.0,7032.0
mean,32.421786,64.798208
std,24.54526,30.085974
min,1.0,18.25
25%,9.0,35.5875
50%,29.0,70.35
75%,55.0,89.8625
max,72.0,118.75


In [13]:
# Next, i wanted to see how the numerical features are corrolated.
df.corr()
#Senior citizens function is probably not correct.

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
SeniorCitizen,1.0,0.015683,0.219874,0.102411,0.150541
tenure,0.015683,1.0,0.246862,0.82588,-0.354049
MonthlyCharges,0.219874,0.246862,1.0,0.651065,0.192858
TotalCharges,0.102411,0.82588,0.651065,1.0,-0.199484
Churn,0.150541,-0.354049,0.192858,-0.199484,1.0


In [14]:
#Then I decided to use get_dummies to one hot encode or turn all the categoritcal features into numeric.
df = pd.get_dummies(df)

In [15]:
df.head()
#As you can see, all of our categorical variables (ex. gender; Male/Female)


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,...,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_No,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,0,1,29.85,29.85,0,1,0,0,1,1,...,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,0,0,1,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,0,2,53.85,108.15,1,0,1,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,0,0,1,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,0,2,70.7,151.65,1,1,0,1,0,1,...,0,1,0,0,0,1,0,0,1,0


In [16]:
df.dtypes
# Just to double check that tye types are numerical.
# Actually dropping some of the rows might have improved the model performance, but I did not drop any unfortunately.

SeniorCitizen                                int64
tenure                                       int64
MonthlyCharges                             float64
TotalCharges                               float64
Churn                                        int64
gender_Female                                uint8
gender_Male                                  uint8
Partner_No                                   uint8
Partner_Yes                                  uint8
Dependents_No                                uint8
Dependents_Yes                               uint8
PhoneService_No                              uint8
PhoneService_Yes                             uint8
MultipleLines_No                             uint8
MultipleLines_No phone service               uint8
MultipleLines_Yes                            uint8
InternetService_DSL                          uint8
InternetService_Fiber optic                  uint8
InternetService_No                           uint8
OnlineSecurity_No              

In [17]:
# I created a X an y variable.
# So, X is going to be the features that I want. The ones that are dependent features.
# y is going to be our target variable or churn.
X= df.drop('Churn', axis=1)
y= df['Churn']

In [22]:
#We are trying to predict churn. So, this is a Yes and No. We are trying to predict the class in a sense.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
#Yes and No question this is why RandomForest
from sklearn.ensemble import RandomForestClassifier
#Yes and No question. This is why logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#Yes and No question. This is why XGBoost.
from xgboost import XGBClassifier


In [23]:
#Then I used train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y)

#We also need to split the data between  train test and split. 
#So we can use train_test_split to split the data btween training and the test sets.

In [24]:
# So, what I wanted to do here is , I wanted to create a table where I can see the model performance.
# I built a dataframe to hold the performance and I am going to put the score and the title of the algorithm there.

model_eval = pd.DataFrame(index =['Random Forest'], columns =[' Score'])


In [25]:
# So, I first started creating or instantiating the algorithm.
# So, I brought in random forest. I saved as rf.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
# Now i am able to fit the training set to each one of our algorithms .
# So I fit that first with random forest by calling rf.fit function and then fitting and then using a prediction and then saving as ypred.
ypred = rf.predict(X_test)

In [26]:
log = LogisticRegression()
log.fit(X_train, y_train)
y_pred2 = log.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
xg =XGBClassifier()
xg.fit(X_train,y_train)
y_pred3= xg.predict(X_test)





In [33]:
#I started loading in randomforrest score, logistic regression score and XGBoost Score.
# I saved in the dataframe called model_eval.

model_eval.loc['Random Forest','Score'] = accuracy_score(ypred,y_test)
model_eval.loc['Logistic','Score'] = accuracy_score(y_pred2,y_test)
model_eval.loc['XGBoost','Score'] = accuracy_score(y_pred3,y_test)

In [34]:
model_eval

Unnamed: 0,Score,Score.1
Random Forest,,0.788396
Logistic,,0.812856
XGBoost,,0.80603


In [35]:
#I saved this as model to bring it in powerbi
# it s like searialize version of the model.
import joblib
joblib.dump(log, "model")

['model']

In [36]:
#Then I wanted to see the feature importance and put that in a dataframe or a table.
rf.feature_importances_
# We can see which features are most important for churn.


array([0.01967382, 0.1422916 , 0.13848593, 0.16541125, 0.01796335,
       0.01738502, 0.01496045, 0.01491565, 0.01278914, 0.01281379,
       0.0031233 , 0.0028595 , 0.01343394, 0.00300777, 0.01386138,
       0.01113238, 0.01900402, 0.00180523, 0.03635645, 0.00387525,
       0.01063576, 0.01902613, 0.00395237, 0.01207123, 0.01535515,
       0.00152011, 0.0121989 , 0.02315369, 0.00226065, 0.01098369,
       0.01130205, 0.00377068, 0.0112526 , 0.0119624 , 0.00214768,
       0.01161131, 0.04846887, 0.0137308 , 0.01674047, 0.01519701,
       0.01488281, 0.01202639, 0.01206337, 0.0275451 , 0.01099154])

In [37]:
weights = pd.DataFrame(rf.feature_importances_, index= X.columns.values)

In [38]:
weights
# I want this whole list in my powerbi

Unnamed: 0,0
SeniorCitizen,0.019674
tenure,0.142292
MonthlyCharges,0.138486
TotalCharges,0.165411
gender_Female,0.017963
gender_Male,0.017385
Partner_No,0.01496
Partner_Yes,0.014916
Dependents_No,0.012789
Dependents_Yes,0.012814
