# Logistic Regression

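Logistic regression is essentially linear regression followed by a sigmoid function: the linear score $z = w_0 + w^T x$ is passed through $\sigma(z) = \frac{1}{1 + e^{-z}}$.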

The sigmoid function squashes the output of the linear regression into the range (0, 1), so it acts like a filter that lets us separate the results into two groups: 0 or 1.

The output can be hard or soft. A hard result is a class label, 0 or 1. A soft result is a probability between 0 and 1.
A soft result can be converted to a hard one by applying a threshold. For example, with a threshold of 0.5, every value < 0.5 is treated as 0, and every other value as 1.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
# URL of the Telco customer churn CSV (mlbookcamp-code repository, chapter 3)
data = (
    'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/'
    'chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'
)

In [4]:
# Read the CSV located at `data` into a DataFrame
df = pd.read_csv(filepath_or_buffer=data)

# Preview the first ten rows
df.head(n=10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [5]:
# Normalize column names: lowercase, spaces replaced by underscores
# (regex=False: ' ' is a literal character, not a pattern)
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Collect the text columns. `is_string_dtype` applied to the dtype accepts both
# the legacy `object` dtype and pandas' dedicated string dtypes; comparing the
# dtype against 'object' misses the latter, which would silently skip the value
# normalization below (and break the later `churn == 'yes'` comparison).
categorical_columns = [
    c for c in df.columns if pd.api.types.is_string_dtype(df[c].dtype)
]

# Normalize the text values the same way as the column names
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_', regex=False)

In [6]:
# Remove the customerid column; it is an identifier and is not used as a feature
df = df.drop(columns=['customerid'])

In [7]:
# Parse totalcharges as numbers: entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced by 0
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

In [8]:
# Target variable: churn, encoded as 1 for 'yes' and 0 otherwise.
# The guard keeps this cell idempotent: once the column is numeric, comparing
# it with 'yes' again would silently turn every label into 0 on a re-run.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [9]:
# Three-way split: 60% train / 20% validation / 20% test

# Step 1: hold out 20% of all rows as the test set
df_full_train, df_test = train_test_split(df, random_state=1, test_size=0.2)

# Step 2: carve the validation set out of the remaining 80%.
# 20% of the full data equals 0.20 / 0.80 = 0.25 of df_full_train,
# which leaves 60% of the full data for training.
df_train, df_val = train_test_split(df_full_train, random_state=1, test_size=0.25)

In [10]:
# Give every split a fresh 0..n-1 index, discarding the shuffled original one
df_train, df_val, df_test = (
    split.reset_index(drop=True) for split in (df_train, df_val, df_test)
)

In [11]:
# Pull the churn target out of every split: pop() returns the column and
# removes it from the frame, so the feature frames no longer contain the label
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

Try logistic regression with a small subset of columns

In [12]:
# Start with a reduced feature set: three categorical columns and one numeric
categorical_columns = ['gender', 'internetservice', 'onlinesecurity']
numeric_columns = ['tenure']

small_columns = categorical_columns + numeric_columns
df_train_small = df_train[small_columns]
df_val_small = df_val[small_columns]

In [13]:
# One {column: value} dict per row, the input format used by DictVectorizer below
train_small_dict = df_train_small.to_dict(orient='records')
val_small_dict = df_val_small.to_dict(orient='records')

# Number of records in the training and validation sets
for records in (train_small_dict, val_small_dict):
    print(len(records))

4225
1409


In [14]:
# Learn the feature mapping from the training records only;
# sparse=False makes transform() return a dense array
dv = DictVectorizer(sparse=False)
dv.fit(X=train_small_dict)

In [15]:
# Apply the fitted vectorizer: categorical (string) fields become one-hot columns
train_small_enc = dv.transform(train_small_dict)
val_small_enc = dv.transform(val_small_dict)

print(train_small_enc)

# Both encoded matrices end up with 9 columns
for encoded in (train_small_enc, val_small_enc):
    print("shape : ", encoded.shape)

[[ 1.  0.  0. ...  0.  1. 72.]
 [ 0.  1.  0. ...  0.  0. 10.]
 [ 1.  0.  0. ...  0.  0.  5.]
 ...
 [ 0.  1.  0. ...  0.  0.  2.]
 [ 1.  0.  0. ...  1.  0. 27.]
 [ 1.  0.  0. ...  0.  0.  9.]]
shape :  (4225, 9)
shape :  (1409, 9)


In [16]:
# Fit on the encoded training matrix, then predict hard 0/1 labels for validation
model_lr = LogisticRegression().fit(train_small_enc, y_train)
pred = model_lr.predict(val_small_enc)

# Actual labels followed by predicted labels
print(y_val)
print(pred)

# Accuracy: share of validation rows where the prediction equals the label
accuracy = (y_val == pred).astype(int).mean()
print(accuracy)

[0 0 0 ... 0 1 1]
[0 0 0 ... 0 1 1]
0.7955997161107168


Model interpretation

In [17]:
# Learned weights, one per encoded feature (first row of the coefficient matrix)
w = model_lr.coef_[0, :]
print(w)

[-0.06533444 -0.10303857 -0.40004796  0.84900828 -0.61733334  0.52177766
 -0.61733334 -0.07281734 -0.04317372]


In [18]:
# Names of the encoded features produced by the vectorizer; categorical values
# appear as `column=value`, numeric columns keep their plain name
dv.get_feature_names_out()

array(['gender=female', 'gender=male', 'internetservice=dsl',
       'internetservice=fiber_optic', 'internetservice=no',
       'onlinesecurity=no', 'onlinesecurity=no_internet_service',
       'onlinesecurity=yes', 'tenure'], dtype=object)

In [19]:
# Pair each encoded feature name with its weight, rounded to 3 decimals
{name: weight for name, weight in zip(dv.get_feature_names_out(), w.round(3))}

{'gender=female': -0.065,
 'gender=male': -0.103,
 'internetservice=dsl': -0.4,
 'internetservice=fiber_optic': 0.849,
 'internetservice=no': -0.617,
 'onlinesecurity=no': 0.522,
 'onlinesecurity=no_internet_service': -0.617,
 'onlinesecurity=yes': -0.073,
 'tenure': -0.043}

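From the weights w, we can interpret:
* Both gender weights are small and negative, so gender has little effect on churn; the weight for female (-0.065) is slightly higher than for male (-0.103), so female customers are marginally more likely to churn than male customers.
* Customers with internet service = fiber optic have a high probability of churning (largest positive weight, 0.849).
* Customers without online security have a high probability of churning (positive weight, 0.522).
* Customers with a high tenure have a lower probability of churning (negative weight, -0.043 per unit of tenure).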

Try using all columns

In [20]:
# All categorical features of the dataset
categorical_columns = [
    'gender', 'partner', 'dependents',
    'phoneservice', 'multiplelines',
    'internetservice', 'onlinesecurity', 'onlinebackup',
    'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies',
    'contract', 'paperlessbilling', 'paymentmethod',
]

# All numeric features of the dataset
numeric_columns = ['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']


In [21]:
# Preprocessing: one-hot encode the categorical columns and standardize the
# numeric ones. handle_unknown='ignore' lets the encoder accept categories at
# predict time that were not present when it was fitted.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
numeric_scaler = StandardScaler()

preprocessor = ColumnTransformer([
    ('cat', categorical_encoder, categorical_columns),
    ('num', numeric_scaler, numeric_columns),
])

# Full pipeline: preprocessing first, logistic regression second
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

model.fit(df_train, y_train)


In [22]:
# Hard 0/1 predictions for the validation set
pred_val = model.predict(df_val)

# Validation accuracy: fraction of predictions that match the labels
np.mean(pred_val == y_val)

0.8048261178140526

In [23]:
# Soft predictions: one row per validation record, one probability column per class.
# NOTE(review): column order should follow model.classes_, so with 0/1 labels the
# second column would be the churn probability — confirm against model.classes_.
model.predict_proba(df_val)

array([[0.99298507, 0.00701493],
       [0.79721074, 0.20278926],
       [0.78498971, 0.21501029],
       ...,
       [0.84770499, 0.15229501],
       [0.21100775, 0.78899225],
       [0.18678502, 0.81321498]])