In [92]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

In [93]:
# Read the raw Telco customer-churn export into a DataFrame.
df = pd.read_csv(
    "telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv"
)
# Show it transposed (one column per customer) so more fields are visible at a time.
df.transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7033,7034,7035,7036,7037,7038,7039,7040,7041,7042
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU,9305-CDSKC,1452-KIOVK,6713-OKOMC,7892-POOKP,6388-TABGU,...,9767-FFLEM,0639-TSIQW,8456-QDAVC,7750-EYXWZ,2569-WGERO,6840-RESVB,2234-XADUH,4801-JZAZL,8361-LTMKD,3186-AJIEK
gender,Female,Male,Male,Male,Female,Female,Male,Female,Female,Male,...,Male,Female,Male,Female,Female,Male,Female,Female,Male,Male
SeniorCitizen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
Partner,Yes,No,No,No,No,No,No,No,Yes,No,...,No,No,No,No,No,Yes,Yes,Yes,Yes,No
Dependents,No,No,No,No,No,No,Yes,No,No,Yes,...,No,No,No,No,No,Yes,Yes,Yes,No,No
tenure,1,34,2,45,2,8,22,10,28,62,...,38,67,19,12,72,24,72,11,4,66
PhoneService,No,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Yes,...,Yes,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Yes
MultipleLines,No phone service,No,No,No phone service,No,Yes,Yes,No phone service,Yes,No,...,No,Yes,No,No phone service,No,Yes,Yes,No phone service,Yes,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic,Fiber optic,Fiber optic,DSL,Fiber optic,DSL,...,Fiber optic,Fiber optic,Fiber optic,DSL,No,DSL,Fiber optic,DSL,Fiber optic,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No,No,No,Yes,No,Yes,...,No,Yes,No,No,No internet service,Yes,No,Yes,No,Yes


In [94]:
# Inspect the dtype pandas inferred for every column.
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

The TotalCharges column has dtype object, so it must contain some non-numeric values. We can use pandas' `to_numeric` function to convert it to numbers, passing errors='coerce' so that non-numeric values such as whitespace are converted to NaN.

In [95]:
# Parse TotalCharges as numbers; anything that cannot be parsed becomes NaN.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
# List the customers whose TotalCharges value could not be parsed.
df.loc[total_charges.isna(), ['customerID', 'TotalCharges']]

Unnamed: 0,customerID,TotalCharges
488,4472-LVYGI,
753,3115-CZMZD,
936,5709-LVOEQ,
1082,4367-NUYAO,
1340,1371-DWPAZ,
3331,7644-OMVMY,
3826,3213-VVOLG,
4380,2520-SGTTA,
5218,2923-ARZLG,
6670,4075-WKNIU,


Replacing NaN values with zero

In [96]:
# Coerce TotalCharges to float and fill the unparseable entries with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
# Confirm the resulting dtype and that no missing values remain.
(df['TotalCharges'].dtype, df['TotalCharges'].isna().sum())

(dtype('float64'), 0)

Making the column names with same naming convention

In [97]:
# Lower-case every column label and replace spaces with underscores.
df.columns = [column.lower().replace(' ', '_') for column in df.columns]
df.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

Making the values in column with same naming convention

In [98]:
# Names of all object-dtype (string) columns.
string_columns = [column for column, dtype in df.dtypes.items() if dtype == "object"]
# Apply the same convention to the values: lower-case, spaces -> underscores.
for col in string_columns:
    lowered = df[col].str.lower()
    df[col] = lowered.str.replace(' ', '_')

Converting our target variable, which is categorical, to int. First we map `yes` to boolean `True` and `no` to boolean `False`, and then cast the column to int, which turns `True` into `1` and `False` into `0`.

In [99]:
# True where the customer churned ('yes'), False otherwise.
df['churn'] = df['churn'].eq('yes')
df['churn']  # the column is now boolean

0       False
1       False
2        True
3       False
4        True
        ...  
7038    False
7039    False
7040    False
7041     True
7042    False
Name: churn, Length: 7043, dtype: bool

In [100]:
# Cast the boolean flags to integers (True -> 1, False -> 0).
df['churn'] = df['churn'].astype(int)
df['churn']  # the column is now integer

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: churn, Length: 7043, dtype: int32

In [101]:
# Transposed view of the cleaned frame: column names and string values are now lower-case with underscores.
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7033,7034,7035,7036,7037,7038,7039,7040,7041,7042
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu,9305-cdskc,1452-kiovk,6713-okomc,7892-pookp,6388-tabgu,...,9767-fflem,0639-tsiqw,8456-qdavc,7750-eyxwz,2569-wgero,6840-resvb,2234-xaduh,4801-jzazl,8361-ltmkd,3186-ajiek
gender,female,male,male,male,female,female,male,female,female,male,...,male,female,male,female,female,male,female,female,male,male
seniorcitizen,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
partner,yes,no,no,no,no,no,no,no,yes,no,...,no,no,no,no,no,yes,yes,yes,yes,no
dependents,no,no,no,no,no,no,yes,no,no,yes,...,no,no,no,no,no,yes,yes,yes,no,no
tenure,1,34,2,45,2,8,22,10,28,62,...,38,67,19,12,72,24,72,11,4,66
phoneservice,no,yes,yes,no,yes,yes,yes,no,yes,yes,...,yes,yes,yes,no,yes,yes,yes,no,yes,yes
multiplelines,no_phone_service,no,no,no_phone_service,no,yes,yes,no_phone_service,yes,no,...,no,yes,no,no_phone_service,no,yes,yes,no_phone_service,yes,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic,fiber_optic,fiber_optic,dsl,fiber_optic,dsl,...,fiber_optic,fiber_optic,fiber_optic,dsl,no,dsl,fiber_optic,dsl,fiber_optic,fiber_optic
onlinesecurity,no,yes,yes,yes,no,no,no,yes,no,yes,...,no,yes,no,no,no_internet_service,yes,no,yes,no,yes


Now we will be splitting into the train and test dataset. We will use train_test_split from sklearn.model_selection

In [102]:
from sklearn.model_selection import train_test_split

# train_test_split shuffles the rows first and then splits them;
# here 20% of the customers are held out as the test set.
df_train_full, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

We will also need a validation dataset, so we split `df_train_full` into two parts using the same function.

In [103]:
# Carve the validation set (30%) out of the remaining training data.
df_train, df_val = train_test_split(
    df_train_full,
    test_size=0.3,
    random_state=1,
)

Preparing y matrix for train and validation dataset

In [104]:
# pop() returns the target column and removes it from the frame in place,
# so the feature frames can no longer leak the label into the model.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values

## Exploratory data analysis

In [105]:
# Count the missing values in every column of the full training set.
df_train_full.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [106]:
# Calculating the overall churn rate (mean of the 0/1 target), rounded to 3 decimals.
# The dataset is imbalanced: churned customers are the minority class.
global_mean = df_train_full['churn'].mean().round(3)

In [107]:
# Column dtypes of the training features (the churn column was removed from df_train earlier).
df_train.dtypes

customerid           object
gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
dtype: object

In [108]:
# Features treated as categorical.
categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]
# Features treated as numerical.
numerical = [
    'tenure',
    'monthlycharges',
    'totalcharges',
]

In [109]:
# Checking for the unique values for categorical variables
# Number of distinct values in each categorical column.
df_train_full.loc[:, categorical].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

Now we have to select the features that affect our target variable (churn). We will calculate the churn rate for each value of a feature and subtract it from the global churn rate; if the difference is large, the feature is important for predicting churn.

In [110]:
# Gender: churn rate within each gender, shown next to the global rate.
female_mean = df_train_full.loc[df_train_full['gender'] == 'female', 'churn'].mean().round(3)
male_mean = df_train_full.loc[df_train_full['gender'] == 'male', 'churn'].mean().round(3)
(female_mean, male_mean, global_mean)

(0.277, 0.263, 0.27)

We can see that the difference is not large thus we can ignore the feature = 'gender'

Similarly performing for feature = 'partner'

In [111]:
# Partner: churn rate for customers with and without a partner, next to the global rate.
partner_yes = df_train_full.loc[df_train_full['partner'] == 'yes', 'churn'].mean().round(3)
partner_no = df_train_full.loc[df_train_full['partner'] == 'no', 'churn'].mean().round(3)
(partner_yes, partner_no, global_mean)

(0.205, 0.33, 0.27)

The difference between the global_mean and the churn means of with or without partner is considerable. Thus it is an important feature. Also, the clients with no partner are more likely to churn.

We can repeat this for all the categorical columns with a loop. For each value we also compute the risk ratio: the group's churn rate divided by the global churn rate.

In [112]:
for col in categorical:
    # Churn rate for each value of the feature, as a one-column frame named 'mean'.
    df_group = df_train_full.groupby(col)['churn'].mean().round(3).to_frame(name='mean')
    df_group = df_group.assign(
        diff=global_mean - df_group['mean'],   # global rate minus the group's rate
        risk=df_group['mean'] / global_mean,   # group's rate relative to the global rate
    )
    # display() is called explicitly: a bare expression inside a loop is not rendered.
    display(df_group)

Unnamed: 0_level_0,mean,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.277,-0.007,1.025926
male,0.263,0.007,0.974074


Unnamed: 0_level_0,mean,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.242,0.028,0.896296
1,0.413,-0.143,1.52963


Unnamed: 0_level_0,mean,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.33,-0.06,1.222222
yes,0.205,0.065,0.759259


Unnamed: 0_level_0,mean,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.314,-0.044,1.162963
yes,0.166,0.104,0.614815


Unnamed: 0_level_0,mean,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.241,0.029,0.892593
yes,0.273,-0.003,1.011111


Unnamed: 0_level_0,mean,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.257,0.013,0.951852
no_phone_service,0.241,0.029,0.892593
yes,0.291,-0.021,1.077778


Unnamed: 0_level_0,mean,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
dsl,0.192,0.078,0.711111
fiber_optic,0.425,-0.155,1.574074
no,0.078,0.192,0.288889


Unnamed: 0_level_0,mean,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.421,-0.151,1.559259
no_internet_service,0.078,0.192,0.288889
yes,0.153,0.117,0.566667


Unnamed: 0_level_0,mean,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.404,-0.134,1.496296
no_internet_service,0.078,0.192,0.288889
yes,0.217,0.053,0.803704


Unnamed: 0_level_0,mean,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.396,-0.126,1.466667
no_internet_service,0.078,0.192,0.288889
yes,0.23,0.04,0.851852


Unnamed: 0_level_0,mean,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.419,-0.149,1.551852
no_internet_service,0.078,0.192,0.288889
yes,0.16,0.11,0.592593


Unnamed: 0_level_0,mean,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.343,-0.073,1.27037
no_internet_service,0.078,0.192,0.288889
yes,0.303,-0.033,1.122222


Unnamed: 0_level_0,mean,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.339,-0.069,1.255556
no_internet_service,0.078,0.192,0.288889
yes,0.307,-0.037,1.137037


Unnamed: 0_level_0,mean,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
month-to-month,0.432,-0.162,1.6
one_year,0.121,0.149,0.448148
two_year,0.028,0.242,0.103704


Unnamed: 0_level_0,mean,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
no,0.172,0.098,0.637037
yes,0.338,-0.068,1.251852


Unnamed: 0_level_0,mean,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bank_transfer_(automatic),0.168,0.102,0.622222
credit_card_(automatic),0.164,0.106,0.607407
electronic_check,0.456,-0.186,1.688889
mailed_check,0.194,0.076,0.718519


If a variable and the target variable are mutually dependent, we can consider it an important feature; otherwise it is independent of the target and we can remove it from our dataset.
We will use `mutual_info_score` from `sklearn.metrics`.

We use the `apply` method to apply the `calculate_mi` function defined in the cell below to each column of the df_train_full dataframe. Because we first select only the categorical variables, it is applied only to them. `calculate_mi` takes a single parameter, `series`: a column from the dataframe on which we invoked `apply()`.

In [113]:
from sklearn.metrics import mutual_info_score


def calculate_mi(series):
    """Return the mutual information between one feature column and the churn target.

    Parameters
    ----------
    series : a column of `df_train_full`, supplied by `DataFrame.apply`.

    Returns
    -------
    The value of `mutual_info_score(series, df_train_full.churn)`.
    """
    return mutual_info_score(series, df_train_full.churn)


# Score every categorical column and rank them, highest mutual information first.
df_mi = df_train_full[categorical].apply(calculate_mi)
df_mi = df_mi.sort_values(ascending=False).to_frame(name='MI')
df_mi

Unnamed: 0,MI
contract,0.09832
onlinesecurity,0.063085
techsupport,0.061032
internetservice,0.055868
onlinebackup,0.046923
deviceprotection,0.043453
paymentmethod,0.04321
streamingtv,0.031853
streamingmovies,0.031581
paperlessbilling,0.017589


For numerical features we will use correlation

In [114]:
# Correlation of each numerical feature with the churn target.
df_corr = (
    df_train_full[numerical]
    .corrwith(df_train_full['churn'])
    .to_frame(name='correlation')
)
df_corr

Unnamed: 0,correlation
tenure,-0.351885
monthlycharges,0.196805
totalcharges,-0.196353


Now we need to turn the categorical features to numeric values. We will be using one hot encoding (DictVectorizer). This first converts dataframe into dictionary and then the dictionary is vectorized to matrix which is further used to train model.

In [115]:
# One dict per customer (column name -> value).
# 'records' is the documented orient name; the 'rows' alias used before triggers a
# deprecation warning and is not accepted by pandas 2.x.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
train_dict[0]

  train_dict = df_train[categorical+numerical].to_dict(orient='rows')


{'gender': 'male',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'no',
 'multiplelines': 'no_phone_service',
 'internetservice': 'dsl',
 'onlinesecurity': 'yes',
 'onlinebackup': 'no',
 'deviceprotection': 'no',
 'techsupport': 'yes',
 'streamingtv': 'no',
 'streamingmovies': 'no',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'tenure': 7,
 'monthlycharges': 34.5,
 'totalcharges': 279.25}

Passing this dictionary to DictVectorizer which will convert it into matrix.

In [116]:
from sklearn.feature_extraction import DictVectorizer

# sparse=False: produce a plain NumPy array rather than a sparse matrix.
dv = DictVectorizer(sparse=False)
# fit() learns the feature layout: categorical (string) values are one-hot encoded,
# numerical values are left intact.
dv.fit(train_dict)

DictVectorizer(sparse=False)

In [117]:
# Transforming the dictionary to create matrix
X_train = dv.transform(train_dict)
X_train[0]

array([  1.  ,   0.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  ,   0.  ,
         0.  ,   1.  ,   1.  ,   0.  ,   0.  ,  34.5 ,   0.  ,   1.  ,
         0.  ,   1.  ,   0.  ,   0.  ,   0.  ,   0.  ,   1.  ,   0.  ,
         1.  ,   0.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,   1.  ,
         0.  ,   0.  ,   1.  ,   0.  ,   0.  ,   1.  ,   0.  ,   0.  ,
         0.  ,   0.  ,   1.  ,   7.  , 279.25])

In [118]:
# Column names of the encoded matrix, in column order.
# get_feature_names() is deprecated (and removed in newer scikit-learn releases);
# get_feature_names_out() is its replacement. list() keeps the plain-list display.
list(dv.get_feature_names_out())



['contract=month-to-month',
 'contract=one_year',
 'contract=two_year',
 'dependents=no',
 'dependents=yes',
 'deviceprotection=no',
 'deviceprotection=no_internet_service',
 'deviceprotection=yes',
 'gender=female',
 'gender=male',
 'internetservice=dsl',
 'internetservice=fiber_optic',
 'internetservice=no',
 'monthlycharges',
 'multiplelines=no',
 'multiplelines=no_phone_service',
 'multiplelines=yes',
 'onlinebackup=no',
 'onlinebackup=no_internet_service',
 'onlinebackup=yes',
 'onlinesecurity=no',
 'onlinesecurity=no_internet_service',
 'onlinesecurity=yes',
 'paperlessbilling=no',
 'paperlessbilling=yes',
 'partner=no',
 'partner=yes',
 'paymentmethod=bank_transfer_(automatic)',
 'paymentmethod=credit_card_(automatic)',
 'paymentmethod=electronic_check',
 'paymentmethod=mailed_check',
 'phoneservice=no',
 'phoneservice=yes',
 'seniorcitizen',
 'streamingmovies=no',
 'streamingmovies=no_internet_service',
 'streamingmovies=yes',
 'streamingtv=no',
 'streamingtv=no_internet_servic

Now our training matrix is ready, so we will be starting the training of model with sklearn

Logistic regression is a probability-based model, so its output must lie between 0 and 1. This is achieved with the sigmoid function, which maps the linear score into that range. Applying this function to the output is the main difference between linear and logistic regression.

In [119]:
from sklearn.linear_model import LogisticRegression

# Fixed random_state keeps the fit reproducible across runs.
model = LogisticRegression(random_state=1, solver='liblinear')
model.fit(X_train, y_train)

LogisticRegression(random_state=1, solver='liblinear')

The model is ready, checking it on the validation dataset

In [120]:
# One dict per validation customer. 'records' replaces the deprecated 'rows' alias,
# which triggers a warning here and is not accepted by pandas 2.x.
val_dict = df_val[categorical + numerical].to_dict(orient='records')
# Reuse the vectorizer fitted on the training data so the columns line up with X_train.
X_val = dv.transform(val_dict)

  val_dict = df_val[categorical+numerical].to_dict(orient='rows')


Now to predict the values we will be using predict_proba

In [121]:
# predict_proba returns two probabilities per validation customer;
# each row sums to 1 (see the output below).
y_pred = model.predict_proba(X_val)
y_pred

array([[0.99229883, 0.00770117],
       [0.78855282, 0.21144718],
       [0.77767238, 0.22232762],
       ...,
       [0.39611959, 0.60388041],
       [0.28253079, 0.71746921],
       [0.8100772 , 0.1899228 ]])

Now we know that if one value is p then the other is 1-p therefore removing one column from the matrix (i.e. not churned)

In [122]:
# Keep only the second column: the predicted probability that the customer churns.
y_pred = model.predict_proba(X_val)[:,1]
y_pred 

array([0.00770117, 0.21144718, 0.22232762, ..., 0.60388041, 0.71746921,
       0.1899228 ])

These are soft predictions, but we need hard predictions so that we can decide whether or not to send the promotional message to the user. Thus we select a threshold (0.5): if the predicted probability is at or above it, we predict that the user will churn.

In [123]:
# Hard predictions: True when the churn probability reaches the 0.5 threshold.
churn = np.greater_equal(y_pred, 0.5)
churn

array([False, False, False, ...,  True,  True, False])

In [124]:
# Accuracy: fraction of validation customers whose hard prediction matches the label.
np.mean(y_val == churn)

0.8095801301005322

This tells us that our model predicts correctly in about 81% of the validation cases

In [125]:
# Learned parameters: the bias term (w0) and the vector of feature weights (w).
model.intercept_[0], model.coef_[0]  # w0, w

(-0.11151016608475146,
 array([ 6.02661201e-01, -2.05642119e-02, -6.93607155e-01, -2.32435457e-02,
        -8.82666204e-02,  4.25299065e-02, -9.49701947e-02, -5.90698779e-02,
        -3.73772110e-02, -7.41329551e-02, -3.72244113e-01,  3.55704142e-01,
        -9.49701947e-02,  2.00459958e-03, -2.70270833e-01,  1.48595047e-01,
         1.01656194e-02,  6.42515029e-02, -9.49701947e-02, -8.07914743e-02,
         2.19463021e-01, -9.49701947e-02, -2.36002993e-01, -2.47669684e-01,
         1.36159518e-01, -1.24440678e-01,  1.29305122e-02, -1.10286439e-01,
        -2.90141952e-02,  8.82570015e-02, -6.04665329e-02,  1.48595047e-01,
        -2.60105213e-01,  1.86782367e-01, -1.01034662e-01, -9.49701947e-02,
         8.44946909e-02, -5.52626249e-02, -9.49701947e-02,  3.87226535e-02,
         1.71299322e-01, -9.49701947e-02, -1.87839294e-01, -6.66255365e-02,
         3.75844467e-04]))

Mapping the feature name with the values of w

In [126]:
# Pair every encoded feature name with its learned weight (rounded to 3 decimals).
# get_feature_names_out() replaces the deprecated/removed get_feature_names().
dict(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))



{'contract=month-to-month': 0.603,
 'contract=one_year': -0.021,
 'contract=two_year': -0.694,
 'dependents=no': -0.023,
 'dependents=yes': -0.088,
 'deviceprotection=no': 0.043,
 'deviceprotection=no_internet_service': -0.095,
 'deviceprotection=yes': -0.059,
 'gender=female': -0.037,
 'gender=male': -0.074,
 'internetservice=dsl': -0.372,
 'internetservice=fiber_optic': 0.356,
 'internetservice=no': -0.095,
 'monthlycharges': 0.002,
 'multiplelines=no': -0.27,
 'multiplelines=no_phone_service': 0.149,
 'multiplelines=yes': 0.01,
 'onlinebackup=no': 0.064,
 'onlinebackup=no_internet_service': -0.095,
 'onlinebackup=yes': -0.081,
 'onlinesecurity=no': 0.219,
 'onlinesecurity=no_internet_service': -0.095,
 'onlinesecurity=yes': -0.236,
 'paperlessbilling=no': -0.248,
 'paperlessbilling=yes': 0.136,
 'partner=no': -0.124,
 'partner=yes': 0.013,
 'paymentmethod=bank_transfer_(automatic)': -0.11,
 'paymentmethod=credit_card_(automatic)': -0.029,
 'paymentmethod=electronic_check': 0.088,
 '

Using the model to make a prediction

In [127]:
# A single customer record to score with the trained model.
customer = dict(
    customerid='8879-zkjof',
    gender='female',
    seniorcitizen=0,
    partner='no',
    dependents='no',
    tenure=41,
    phoneservice='yes',
    multiplelines='no',
    internetservice='dsl',
    onlinesecurity='yes',
    onlinebackup='no',
    deviceprotection='yes',
    techsupport='yes',
    streamingtv='yes',
    streamingmovies='yes',
    contract='one_year',
    paperlessbilling='yes',
    paymentmethod='bank_transfer_(automatic)',
    monthlycharges=79.85,
    totalcharges=3320.75,
)

In [128]:
# Encode the single customer with the already-fitted vectorizer; transform() expects a
# list of dicts, hence the one-element list.
# NOTE(review): 'customerid' was not among the features dv was fitted on, so it is
# presumably ignored here -- the output has the same 45 columns as X_train.
X_test = dv.transform([customer])
X_test

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 7.98500e+01, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        1.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 4.10000e+01, 3.32075e+03]])

In [129]:
# Row 0 is our only customer; column 1 is the probability of churning.
model.predict_proba(X_test)[0,1]  # predicted churn probability for this customer is ~6%

0.05663530368469988

In [130]:
# A second customer record to score with the trained model.
customer2 = dict(
    gender='female',
    seniorcitizen=1,
    partner='no',
    dependents='no',
    phoneservice='yes',
    multiplelines='yes',
    internetservice='fiber_optic',
    onlinesecurity='no',
    onlinebackup='no',
    deviceprotection='no',
    techsupport='no',
    streamingtv='yes',
    streamingmovies='no',
    contract='month-to-month',
    paperlessbilling='yes',
    paymentmethod='electronic_check',
    tenure=1,
    monthlycharges=85.7,
    totalcharges=85.7,
)

In [131]:
# Encode the second customer with the fitted vectorizer and read off the churn probability
# (row 0 = this customer, column 1 = probability of churning).
X_test2 = dv.transform([customer2])
model.predict_proba(X_test2)[0,1]  # predicted churn probability for this customer is ~80%

0.8014948171592655