# One Hot Encoder

In [33]:
import pandas as pd
import numpy as np
from sklearn.metrics import mutual_info_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# URL of the Telco customer churn CSV (raw file on GitHub); passed to pd.read_csv in the next cell
data = 'https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [3]:
# read the CSV located at `data` into a DataFrame
df = pd.read_csv(data)

# preview the first ten rows
df.head(n=10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
5,9305-CDSKC,Female,0,No,No,8,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,99.65,820.5,Yes
6,1452-KIOVK,Male,0,No,Yes,22,Yes,Yes,Fiber optic,No,...,No,No,Yes,No,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,No
7,6713-OKOMC,Female,0,No,No,10,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,No,Mailed check,29.75,301.9,No
8,7892-POOKP,Female,0,Yes,No,28,Yes,Yes,Fiber optic,No,...,Yes,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes
9,6388-TABGU,Male,0,No,Yes,62,Yes,No,DSL,Yes,...,No,No,No,No,One year,No,Bank transfer (automatic),56.15,3487.95,No


In [4]:
# normalise the header: lowercase every column name and turn spaces into underscores
df.columns = df.columns.map(lambda name: name.lower().replace(' ', '_'))

# every column stored with object dtype is treated as categorical here
categorical_columns = [
    column for column, dtype in df.dtypes.items() if dtype == 'object'
]

# apply the same normalisation (lowercase, spaces -> underscores) to the values
# of each of those columns
for column in categorical_columns:
    df[column] = df[column].str.lower().str.replace(' ', '_')

In [5]:
# count missing (NaN) values in each column
df.isna().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [6]:
# target value is churn column
# change churn to number, 1 if yes, 0 if no
#
# The conversion is guarded so that this cell is idempotent: once churn is
# already numeric, comparing it to 'yes' again would evaluate to False for
# every row and silently overwrite all labels with 0 on a re-run.
if not pd.api.types.is_numeric_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [7]:
# split data into train, val and test 60%, 20%, 20%

# step 1: hold out 20% of all rows as the test set
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

# step 2: split the remaining 80% (df_full_train) into train and validation.
# validation must be 20% of the whole data set, i.e. 20% / 80% = 0.25 of df_full_train;
# the rest (60% of the whole data set) is the training set.
# random_state is fixed in both calls so the split is reproducible.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [8]:
# report how many rows the full frame and each split contain
for caption, frame in (
    ("Total rows: ", df),
    ("Training rows: ", df_train),
    ("Validation rows: ", df_val),
    ("Test rows: ", df_test),
):
    print(caption, len(frame))

Total rows:  7043
Training rows:  4225
Validation rows:  1409
Test rows:  1409


In [9]:
# peek at the first five training rows (the original row labels are still shuffled here)
df_train.head(5)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
3897,8015-ihcgw,female,0,yes,yes,72,yes,yes,fiber_optic,yes,...,yes,yes,yes,yes,two_year,yes,electronic_check,115.5,8425.15,0
1980,1960-uycnn,male,0,no,no,10,yes,yes,fiber_optic,no,...,yes,no,no,yes,month-to-month,yes,electronic_check,95.25,1021.55,0
6302,9250-wypll,female,0,no,no,5,yes,yes,fiber_optic,no,...,no,no,no,no,month-to-month,no,electronic_check,75.55,413.65,1
727,6786-obwqr,female,0,yes,yes,5,yes,no,fiber_optic,no,...,no,no,yes,no,month-to-month,yes,electronic_check,80.85,356.1,0
5104,1328-euzhc,female,0,yes,no,18,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,20.1,370.5,0


In [10]:
# give every split a fresh 0..n-1 index, discarding the shuffled original row labels
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [11]:
# pull the label out of each split: pop() returns the churn column and removes it
# from the frame in one step, so the feature frames no longer contain the target
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

In [12]:
# peek at the first five training rows again, now without the churn column
df_train.head(5)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
0,8015-ihcgw,female,0,yes,yes,72,yes,yes,fiber_optic,yes,yes,yes,yes,yes,yes,two_year,yes,electronic_check,115.5,8425.15
1,1960-uycnn,male,0,no,no,10,yes,yes,fiber_optic,no,yes,yes,no,no,yes,month-to-month,yes,electronic_check,95.25,1021.55
2,9250-wypll,female,0,no,no,5,yes,yes,fiber_optic,no,no,no,no,no,no,month-to-month,no,electronic_check,75.55,413.65
3,6786-obwqr,female,0,yes,yes,5,yes,no,fiber_optic,no,no,no,no,yes,no,month-to-month,yes,electronic_check,80.85,356.1
4,1328-euzhc,female,0,yes,no,18,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,20.1,370.5


In [13]:
# check target values: the array of churn labels (0 / 1) extracted from the training split
y_train

array([0, 0, 1, ..., 1, 0, 1])

In [14]:
# columns treated as categorical features (order matters for later column selection)
categorical_columns = [
    # customer attributes
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone / internet services and add-ons
    'phoneservice', 'multiplelines', 'internetservice',
    'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
    'streamingtv', 'streamingmovies',
    # contract and billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

In [15]:
# number of distinct values per categorical column in the training split
df_train.loc[:, categorical_columns].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [36]:
# work on a tiny sample: four feature columns of the first 20 training rows
# (copied so the sample is independent of df_train) plus the matching 20 labels
dfsmall = (
    df_train
    .loc[:, ['tenure', 'gender', 'internetservice', 'onlinesecurity']]
    .iloc[:20]
    .copy()
)
ysmall = y_train[:20]

In [17]:
# display the small sample frame
dfsmall

Unnamed: 0,tenure,gender,internetservice,onlinesecurity
0,72,female,fiber_optic,yes
1,10,male,fiber_optic,no
2,5,female,fiber_optic,no
3,5,female,fiber_optic,no
4,18,female,no,no_internet_service
5,4,male,dsl,no
6,1,male,fiber_optic,no
7,1,female,fiber_optic,no
8,72,female,no,no_internet_service
9,6,female,fiber_optic,no


In [18]:
# feature lists for the small sample only; this rebinding replaces the longer
# categorical_columns list defined earlier in the notebook
numeric_columns = ['tenure']
categorical_columns = ['gender', 'internetservice', 'onlinesecurity']

### Ohe using DictVectorizer

In [19]:
# DictVectorizer consumes a list of dictionaries rather than a DataFrame.
# DataFrame.to_dict with orient 'records' produces exactly that: one
# {column: value} dictionary per row, e.g.
# [
#   { 'tenure': 1, 'internetservice': 'dsl', 'onlinesecurity': 'no' },
#   { 'tenure': 2, 'internetservice': 'fiber_optic', 'onlinesecurity': 'yes' },
#   ...
# ]
small_dict = dfsmall[[*categorical_columns, *numeric_columns]].to_dict('records')

# one dictionary per row of dfsmall
print(len(small_dict))



20


In [20]:
# display the list of per-row dictionaries
small_dict

[{'gender': 'female',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'yes',
  'tenure': 72},
 {'gender': 'male',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'tenure': 10},
 {'gender': 'female',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'tenure': 5},
 {'gender': 'female',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'tenure': 5},
 {'gender': 'female',
  'internetservice': 'no',
  'onlinesecurity': 'no_internet_service',
  'tenure': 18},
 {'gender': 'male',
  'internetservice': 'dsl',
  'onlinesecurity': 'no',
  'tenure': 4},
 {'gender': 'male',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'tenure': 1},
 {'gender': 'female',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'tenure': 1},
 {'gender': 'female',
  'internetservice': 'no',
  'onlinesecurity': 'no_internet_service',
  'tenure': 72},
 {'gender': 'female',
  'internetservice': 'fiber_optic',
  'onlinesecurity': 'no',
  'tenure

In [21]:
# Create the DictVectorizer and fit it on the records in one expression
# (fit returns the fitted vectorizer itself).
# - string-valued fields are one-hot encoded: a field with k distinct values
#   becomes k new columns
# - numeric fields are left as they are
# sparse=False makes transform return a dense array
dv = DictVectorizer(sparse=False).fit(small_dict)
dv

In [22]:
# one-hot encode the categorical fields of every record with the fitted vectorizer
small_enc = dv.transform(small_dict)

print(small_enc)

# gender has 2 unique values: male, female
# internetservice has 3 unique values: dsl, fiber_optic, no
# onlinesecurity has 3 unique values: no, yes, no_internet_service
# tenure is a numeric column, so it stays a single column (not one-hot encoded)

# the final number of columns is: 2 + 3 + 3 + 1 = 9
print("shape : ", small_enc.shape)

[[ 1.  0.  0.  1.  0.  0.  0.  1. 72.]
 [ 0.  1.  0.  1.  0.  1.  0.  0. 10.]
 [ 1.  0.  0.  1.  0.  1.  0.  0.  5.]
 [ 1.  0.  0.  1.  0.  1.  0.  0.  5.]
 [ 1.  0.  0.  0.  1.  0.  1.  0. 18.]
 [ 0.  1.  1.  0.  0.  1.  0.  0.  4.]
 [ 0.  1.  0.  1.  0.  1.  0.  0.  1.]
 [ 1.  0.  0.  1.  0.  1.  0.  0.  1.]
 [ 1.  0.  0.  0.  1.  0.  1.  0. 72.]
 [ 1.  0.  0.  1.  0.  1.  0.  0.  6.]
 [ 1.  0.  0.  1.  0.  0.  0.  1. 72.]
 [ 0.  1.  0.  1.  0.  1.  0.  0. 17.]
 [ 1.  0.  1.  0.  0.  0.  0.  1. 66.]
 [ 1.  0.  0.  1.  0.  1.  0.  0.  2.]
 [ 1.  0.  0.  0.  1.  0.  1.  0.  4.]
 [ 0.  1.  1.  0.  0.  1.  0.  0.  3.]
 [ 1.  0.  0.  0.  1.  0.  1.  0. 71.]
 [ 1.  0.  0.  1.  0.  1.  0.  0. 32.]
 [ 0.  1.  0.  1.  0.  1.  0.  0. 53.]
 [ 0.  1.  0.  1.  0.  1.  0.  0. 56.]]
shape :  (20, 9)


In [23]:
# feature names (column names) are generated automatically by DictVectorizer
# format for categorical fields: columnname=value; numeric fields keep their own name
# check the feature names
feature_names = dv.get_feature_names_out()

feature_names

array(['gender=female', 'gender=male', 'internetservice=dsl',
       'internetservice=fiber_optic', 'internetservice=no',
       'onlinesecurity=no', 'onlinesecurity=no_internet_service',
       'onlinesecurity=yes', 'tenure'], dtype=object)

In [24]:
# wrap the encoded matrix in a DataFrame, labelling the columns with the generated feature names
df_ohe = pd.DataFrame(data=small_enc, columns=feature_names)

df_ohe

Unnamed: 0,gender=female,gender=male,internetservice=dsl,internetservice=fiber_optic,internetservice=no,onlinesecurity=no,onlinesecurity=no_internet_service,onlinesecurity=yes,tenure
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,72.0
1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,10.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0
4,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,18.0
5,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,4.0
6,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
7,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
8,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,72.0
9,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,6.0


### Ohe using OneHotEncoder

In [25]:
# Unlike DictVectorizer, OneHotEncoder does not need the data converted to
# dictionaries: it accepts a DataFrame of categorical columns directly.

# step 1: fit the encoder on the single-column frame (fit returns the encoder itself)
oh_internetservice = OneHotEncoder().fit(dfsmall[['internetservice']])

# step 2: transform the same column; toarray() gives a dense array for printing
internetservice_ohe = oh_internetservice.transform(dfsmall[['internetservice']])
print(internetservice_ohe.toarray())

# column names generated by OneHotEncoder
# format : columnname_value
print(oh_internetservice.get_feature_names_out())

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]
['internetservice_dsl' 'internetservice_fiber_optic' 'internetservice_no']


In [26]:
# Instead of calling fit and transform as two separate steps, fit_transform does both in one call
oh_gender = OneHotEncoder()
gender_ohe = oh_gender.fit_transform(dfsmall[['gender']])
# toarray() converts the encoded result to a dense array for printing
print(gender_ohe.toarray())
# generated column names, format: columnname_value
print(oh_gender.get_feature_names_out())

[[1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [0. 1.]]
['gender_female' 'gender_male']


In [27]:
# one-hot encode several categorical columns with a single encoder

# fit on all categorical columns at once (fit returns the encoder itself) ...
oh = OneHotEncoder().fit(dfsmall[categorical_columns])

# ... then encode them; the encoded columns appear in the order of categorical_columns
ohe = oh.transform(dfsmall[categorical_columns])
print(ohe.toarray())
print(oh.get_feature_names_out())

[[1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0.]
 [0. 1. 1. 0. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0.]
 [1. 0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]
 [1. 0. 1. 0. 0. 0. 0. 1.]
 [1. 0. 0. 1. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0.]
 [0. 1. 1. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 1. 0. 1. 0.]
 [1. 0. 0. 1. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0. 1. 0. 0.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]
['gender_female' 'gender_male' 'internetservice_dsl'
 'internetservice_fiber_optic' 'internetservice_no' 'onlinesecurity_no'
 'onlinesecurity_no_internet_service' 'onlinesecurity_yes']


In [30]:
# turn the encoded matrix into a DataFrame labelled with the encoder's feature names
df_ohe = pd.DataFrame(ohe.toarray(), columns=oh.get_feature_names_out())

# place the numeric columns and the one-hot columns side by side;
# both indexes are reset first so the rows line up by position
new_df = pd.concat(
    [
        dfsmall[numeric_columns].reset_index(drop=True),
        df_ohe.reset_index(drop=True),
    ],
    axis='columns',
)

new_df


Unnamed: 0,tenure,gender_female,gender_male,internetservice_dsl,internetservice_fiber_optic,internetservice_no,onlinesecurity_no,onlinesecurity_no_internet_service,onlinesecurity_yes
0,72,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,10,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
2,5,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,5,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,18,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5,4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
6,1,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
7,1,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8,72,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
9,6,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


Note: data transformations (encoders, scalers) should be fitted on the training data only. Do not fit them on the validation or test data; this prevents data leakage.

To handle a category that does not appear in the training data, set the parameter handle_unknown="ignore":

OneHotEncoder(handle_unknown="ignore")

An unknown category is then encoded as all zeros.

### Ohe using ColumnTransformer, Pipeline

In [32]:
# display the small sample frame again before feeding it to the pipeline
dfsmall

Unnamed: 0,tenure,gender,internetservice,onlinesecurity
0,72,female,fiber_optic,yes
1,10,male,fiber_optic,no
2,5,female,fiber_optic,no
3,5,female,fiber_optic,no
4,18,female,no,no_internet_service
5,4,male,dsl,no
6,1,male,fiber_optic,no
7,1,female,fiber_optic,no
8,72,female,no,no_internet_service
9,6,female,fiber_optic,no


In [38]:
# per-column preprocessing: one-hot encode the categorical columns (categories
# unseen during fit are ignored) and standardise the numeric columns
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ('num', StandardScaler(), numeric_columns),
])

# chain preprocessing and the classifier so both are fitted and applied together
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression()),
])

# fit on the small sample, then predict on the very same rows
# (fit returns the fitted pipeline, so the two calls can be chained);
# these are training-set predictions, not an estimate of generalisation
pred = model.fit(dfsmall, ysmall).predict(dfsmall)

# true labels first, predicted labels second
print(ysmall)
print(pred)


[0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0]
