In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
%matplotlib inline

from imblearn.over_sampling import SMOTENC


In [44]:
df = pd.read_csv('/content/drive/MyDrive/ML Projects/Churn-Prediction/Telco-Customer-Churn.csv')

In [45]:
len(df)

7043

## Initial data preparation

In [46]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [47]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [48]:
#drop customerID. no use for our model
df = df.drop('customerID', axis=1)

In [49]:
df.dtypes

Unnamed: 0,0
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object
OnlineBackup,object


In [50]:
#df['Total charges'] is an object class however it should be a number
df['TotalCharges']

Unnamed: 0,TotalCharges
0,29.85
1,1889.5
2,108.15
3,1840.75
4,151.65
...,...
7038,1990.5
7039,7362.9
7040,346.45
7041,306.6


In [51]:
# because there are values that do not convert to number because use the errors='coerce' any non-numeric value encountered during the conversion process will be replaced with NaN (Not a Number)
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')


In [52]:
# there where 11 values where a space was there and was changed to NaN
df['TotalCharges'].isnull().sum()

np.int64(11)

In [53]:
#replace NaN with 0, might be bad for model
df['TotalCharges'] = df['TotalCharges'].fillna(0)

In [54]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

In [55]:
# we want binary 1 or 0 for class. Isolate Yes and replace with number
df.churn = (df.churn == 'yes').astype(int)

In [56]:
df.head().T

Unnamed: 0,0,1,2,3,4
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no
onlinebackup,yes,no,yes,no,no


In [57]:
# we have an imbalance in classes we will have to fix this soon
df['churn'].value_counts()

Unnamed: 0_level_0,count
churn,Unnamed: 1_level_1
0,5174
1,1869


In [58]:
#spliting data into training set for 80 percent of the data and 20 percent for testing
X_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [59]:
X = X_train_full.drop('churn', axis=1)
y = X_train_full['churn']

In [60]:
#spliting again for testing and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=11)
len(X_train),len(X_val),len(df_test)

(4225, 1409, 1409)

In [61]:
#what is the class imbalence for the training set? will need to adjust the imbalance later.
y_train.value_counts()

Unnamed: 0_level_0,count
churn,Unnamed: 1_level_1
0,3070
1,1155


## Exploratory data analysis

In [62]:
# great no missing values
X_train_full.isnull().sum()

Unnamed: 0,0
gender,0
seniorcitizen,0
partner,0
dependents,0
tenure,0
phoneservice,0
multiplelines,0
internetservice,0
onlinesecurity,0
onlinebackup,0


In [63]:
# look at the ratio of classes in train_full (training + validation sets) churn rate of 27%
X_train_full.churn.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
churn,Unnamed: 1_level_1
0,0.730032
1,0.269968


In [64]:
global_mean = X_train_full.churn.mean()
round(global_mean, 3)

np.float64(0.27)

In [65]:
# seperating categorical and numerical classes
categorical = ['gender', 'seniorcitizen', 'partner', 'dependents',
               'phoneservice', 'multiplelines', 'internetservice',
               'onlinesecurity', 'onlinebackup', 'deviceprotection',
               'techsupport', 'streamingtv', 'streamingmovies',
               'contract', 'paperlessbilling', 'paymentmethod']
numerical = ['tenure', 'monthlycharges', 'totalcharges']

In [66]:
# some are binary or multile values
X_train_full[categorical].nunique()

Unnamed: 0,0
gender,2
seniorcitizen,2
partner,2
dependents,2
phoneservice,2
multiplelines,3
internetservice,3
onlinesecurity,3
onlinebackup,3
deviceprotection,3


## Feature importance

In [67]:
#churn rate by gender category, not really that different from global churn rate
female_mean = X_train_full[X_train_full.gender == 'female'].churn.mean()
print('gender == female:', round(female_mean, 3))

male_mean = X_train_full[X_train_full.gender == 'male'].churn.mean()
print('gender == male:  ', round(male_mean, 3))

gender == female: 0.277
gender == male:   0.263


In [68]:
# risk ratio for women
female_mean / global_mean

np.float64(1.0253955354648652)

In [69]:
# risk ratio for men
male_mean / global_mean

np.float64(0.9749802969838747)

In [70]:
# churn rate for partner, no partner is higher
partner_yes = X_train_full[X_train_full.partner == 'yes'].churn.mean()
print('partner == yes:', round(partner_yes, 3))

partner_no = X_train_full[X_train_full.partner == 'no'].churn.mean()
print('partner == no :', round(partner_no, 3))

partner == yes: 0.205
partner == no : 0.33


In [71]:
# lower risk ratio for customers who have a partner
partner_yes / global_mean

np.float64(0.7594724924338315)

In [72]:
partner_no / global_mean

np.float64(1.2216593879412643)

In [73]:
df_group = X_train_full.groupby(by='gender').churn.agg(['mean'])
df_group['diff'] = df_group['mean'] - global_mean
df_group['risk'] = df_group['mean'] / global_mean
df_group

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


In [74]:
from IPython.display import display

In [75]:
global_mean = X_train_full.churn.mean()
global_mean

np.float64(0.26996805111821087)

In [76]:
for col in categorical:
    df_group = X_train_full.groupby(by=col).churn.agg(['mean'])
    df_group['diff'] = df_group['mean'] - global_mean
    df_group['risk'] = df_group['mean'] / global_mean
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.276824,0.006856,1.025396
male,0.263214,-0.006755,0.97498


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.24227,-0.027698,0.897403
1,0.413377,0.143409,1.531208


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.329809,0.059841,1.221659
yes,0.205033,-0.064935,0.759472


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.31376,0.043792,1.162212
yes,0.165666,-0.104302,0.613651


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241316,-0.028652,0.89387
yes,0.273049,0.003081,1.011412


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257407,-0.012561,0.953474
no_phone_service,0.241316,-0.028652,0.89387
yes,0.290742,0.020773,1.076948


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192347,-0.077621,0.712482
fiber_optic,0.425171,0.155203,1.574895
no,0.077805,-0.192163,0.288201


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.420921,0.150953,1.559152
no_internet_service,0.077805,-0.192163,0.288201
yes,0.153226,-0.116742,0.56757


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404323,0.134355,1.497672
no_internet_service,0.077805,-0.192163,0.288201
yes,0.217232,-0.052736,0.80466


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.395875,0.125907,1.466379
no_internet_service,0.077805,-0.192163,0.288201
yes,0.230412,-0.039556,0.85348


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.418914,0.148946,1.551717
no_internet_service,0.077805,-0.192163,0.288201
yes,0.159926,-0.110042,0.59239


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.342832,0.072864,1.269897
no_internet_service,0.077805,-0.192163,0.288201
yes,0.302723,0.032755,1.121328


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.338906,0.068938,1.255358
no_internet_service,0.077805,-0.192163,0.288201
yes,0.307273,0.037305,1.138182


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.431701,0.161733,1.599082
one_year,0.120573,-0.149395,0.446621
two_year,0.028274,-0.241694,0.10473


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172071,-0.097897,0.637375
yes,0.338151,0.068183,1.25256


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168171,-0.101797,0.622928
credit_card_(automatic),0.164339,-0.10563,0.608733
electronic_check,0.45589,0.185922,1.688682
mailed_check,0.19387,-0.076098,0.718121


## Dealing with the class imbalance.

In [77]:
#using SMOTE-NC to adjust the class imbalance
smote_nc = SMOTENC(categorical_features = categorical, random_state=42)
X_train_smote, y_train_smote = smote_nc.fit_resample(X_train, y_train)

In [89]:
#Class ratios are the same now. does it change modeling performance?
y_train_smote.value_counts()

Unnamed: 0_level_0,count
churn,Unnamed: 1_level_1
0,3070
1,3070


For Categorical feature we use mutual information

In [78]:
from sklearn.metrics import mutual_info_score

In [79]:
def calculate_mi(series):
    return mutual_info_score(series, X_train_full.churn)

df_mi = X_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')


display(df_mi.head())
display(df_mi.tail())

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923


Unnamed: 0,MI
partner,0.009968
seniorcitizen,0.00941
multiplelines,0.000857
phoneservice,0.000229
gender,0.000117


For Numerical feature we use correlation

In [80]:
X_train_full[numerical].corrwith(X_train_full.churn).to_frame('correlation')

Unnamed: 0,correlation
tenure,-0.351885
monthlycharges,0.196805
totalcharges,-0.196353


In [81]:
X_train_full.groupby(by='churn')[numerical].mean()

Unnamed: 0_level_0,tenure,monthlycharges,totalcharges
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,37.531972,61.176477,2548.021627
1,18.070348,74.521203,1545.689415


## One-hot encoding

In [82]:
from sklearn.feature_extraction import DictVectorizer

In [83]:
train_dict = X_train[categorical + numerical].to_dict(orient='records')

In [84]:
train_dict[0]

{'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'no',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'one_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'credit_card_(automatic)',
 'tenure': 58,
 'monthlycharges': 105.2,
 'totalcharges': 6225.4}

In [85]:
# take the dictionary and turn it into a vector
dv = DictVectorizer(sparse=False) # do not use a sparse matrix to save space
dv.fit(train_dict)

In [86]:
X_train = dv.transform(train_dict)

In [87]:
X_train.shape

(4225, 45)

In [95]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [99]:
pd.DataFrame(dv.transform(train_dict))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,58.0,6225.40
1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,16.0,1378.25
2,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,71.0,1378.45
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,5.0,318.50
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,47.0,4045.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4220,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,63.0,6705.70
4221,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,142.35
4222,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,28.30
4223,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,23.0,470.60


## Training logistic regression

In [100]:
from sklearn.linear_model import LogisticRegression

In [101]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [109]:
model.coef_[0].round(3)

array([ 0.573, -0.166, -0.538, -0.016, -0.114,  0.076, -0.114, -0.093,
       -0.026, -0.105, -0.353,  0.336, -0.114,  0.001, -0.171,  0.112,
       -0.072,  0.12 , -0.114, -0.137,  0.263, -0.114, -0.28 , -0.207,
        0.076, -0.107, -0.024, -0.056, -0.116,  0.208, -0.167,  0.112,
       -0.243,  0.251, -0.099, -0.114,  0.081, -0.094, -0.114,  0.077,
        0.203, -0.114, -0.22 , -0.065,  0.   ])

In [102]:
val_dict = X_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [110]:
#hard predictions
model.predict(X_train)

array([0, 1, 0, ..., 0, 0, 0])

In [103]:
#soft predictions second element of each list is the probabilty of churning
model.predict_proba(X_val)

array([[0.76799773, 0.23200227],
       [0.75226245, 0.24773755],
       [0.72286754, 0.27713246],
       ...,
       [0.99773199, 0.00226801],
       [0.8774217 , 0.1225783 ],
       [0.99855878, 0.00144122]])

In [104]:
y_pred = model.predict_proba(X_val)[:, 1]

In [105]:
y_pred

array([0.23200227, 0.24773755, 0.27713246, ..., 0.00226801, 0.1225783 ,
       0.00144122])

In [111]:
churn = y_pred > 0.5
churn

array([False, False, False, ..., False, False, False])

In [143]:
y_val

Unnamed: 0,churn
5805,0
3687,1
6900,0
1644,1
6975,0
...,...
2755,0
5354,0
4331,0
1866,0


In [141]:
df_pred = pd.DataFrame()
df_pred['prob'] = y_pred
df_pred['pred'] = churn.astype(int)
df_pred['actual'] = y_val.to_list()

In [142]:
df_pred

Unnamed: 0,prob,pred,actual
0,0.232002,0,0
1,0.247738,0,1
2,0.277132,0,0
3,0.355825,0,1
4,0.053981,0,0
...,...,...,...
1404,0.028544,0,0
1405,0.786681,1,0
1406,0.002268,0,0
1407,0.122578,0,0


In [107]:
(y_val == churn).mean()

np.float64(0.7991483321504613)

## Model interpretation

In [144]:
model.intercept_[0]

np.float64(-0.1309377295167705)

In [146]:
#zip pairs elements in two lists together. dict turns into key:value pairs
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

{'contract=month-to-month': np.float64(0.573),
 'contract=one_year': np.float64(-0.166),
 'contract=two_year': np.float64(-0.538),
 'dependents=no': np.float64(-0.016),
 'dependents=yes': np.float64(-0.114),
 'deviceprotection=no': np.float64(0.076),
 'deviceprotection=no_internet_service': np.float64(-0.114),
 'deviceprotection=yes': np.float64(-0.093),
 'gender=female': np.float64(-0.026),
 'gender=male': np.float64(-0.105),
 'internetservice=dsl': np.float64(-0.353),
 'internetservice=fiber_optic': np.float64(0.336),
 'internetservice=no': np.float64(-0.114),
 'monthlycharges': np.float64(0.001),
 'multiplelines=no': np.float64(-0.171),
 'multiplelines=no_phone_service': np.float64(0.112),
 'multiplelines=yes': np.float64(-0.072),
 'onlinebackup=no': np.float64(0.12),
 'onlinebackup=no_internet_service': np.float64(-0.114),
 'onlinebackup=yes': np.float64(-0.137),
 'onlinesecurity=no': np.float64(0.263),
 'onlinesecurity=no_internet_service': np.float64(-0.114),
 'onlinesecurity=yes

In [149]:
subset = ['contract', 'tenure', 'totalcharges']
train_dict_small = X_train_full[subset].to_dict(orient='records')
dv_small = DictVectorizer(sparse=False)
dv_small.fit(train_dict_small)

X_small_train = dv_small.transform(train_dict_small)

dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'tenure', 'totalcharges'], dtype=object)

In [160]:
model_small = LogisticRegression(solver='liblinear', random_state=1)
model_small.fit(X_small_train, y)

In [161]:
model_small.intercept_[0]

np.float64(-0.6941824333336133)

In [163]:
dict(zip(dv_small.get_feature_names_out(), model_small.coef_[0].round(3)))

{'contract=month-to-month': np.float64(0.921),
 'contract=one_year': np.float64(-0.212),
 'contract=two_year': np.float64(-1.403),
 'tenure': np.float64(-0.094),
 'totalcharges': np.float64(0.001)}

In [169]:
X_val = pd.DataFrame(X_val)

In [170]:
val_dict_small = X_val[subset].to_dict(orient='records')
X_small_val = dv_small.transform(val_dict_small)

KeyError: "None of [Index(['contract', 'tenure', 'totalcharges'], dtype='object')] are in the [columns]"

In [None]:
y_pred_small = model_small.predict_proba(X_small_val)[:, 1]

## Using the model

In [171]:
customer = {
    'customerid': '8879-zkjof',
    'gender': 'female',
    'seniorcitizen': 0,
    'partner': 'no',
    'dependents': 'no',
    'tenure': 41,
    'phoneservice': 'yes',
    'multiplelines': 'no',
    'internetservice': 'dsl',
    'onlinesecurity': 'yes',
    'onlinebackup': 'no',
    'deviceprotection': 'yes',
    'techsupport': 'yes',
    'streamingtv': 'yes',
    'streamingmovies': 'yes',
    'contract': 'one_year',
    'paperlessbilling': 'yes',
    'paymentmethod': 'bank_transfer_(automatic)',
    'monthlycharges': 79.85,
    'totalcharges': 3320.75,
}

In [172]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

np.float64(0.05899309891013804)

In [173]:
print(list(X_test[0]))

[np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(1.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(79.85), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(1.0), np.float64(1.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(0.0), np.float64(0.0), np.float64(1.0), np.float64(41.0), np.float64(3320.75)]


In [174]:
customer = {
    'gender': 'female',
    'seniorcitizen': 1,
    'partner': 'no',
    'dependents': 'no',
    'phoneservice': 'yes',
    'multiplelines': 'yes',
    'internetservice': 'fiber_optic',
    'onlinesecurity': 'no',
    'onlinebackup': 'no',
    'deviceprotection': 'no',
    'techsupport': 'no',
    'streamingtv': 'yes',
    'streamingmovies': 'no',
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    'tenure': 1,
    'monthlycharges': 85.7,
    'totalcharges': 85.7
}

In [175]:
X_test = dv.transform([customer])
model.predict_proba(X_test)[0, 1]

np.float64(0.827329098964807)