In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

import helper as h

from importlib import reload

In [4]:
# Load the data splits from the project-local `helper` module (imported as `h`).
# The call is unpacked into six objects: the train / val / test feature sets
# and the corresponding y_* targets.
# NOTE(review): what `explore=False` changes is defined in helper.py — confirm there.
train, val, test, y_train, y_val, y_test = h.split_telco_data(explore=False)

In [5]:
# Row counts of the train / validation / test splits, in that order.
tuple(len(split) for split in (train, val, test))

(4225, 1409, 1409)

In [6]:
# Preview the first two rows of the training frame.
train.head(2)

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges
0,male,0,no,no,3,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,month-to-month,no,mailed_check,19.85,64.55
1,female,0,yes,yes,55,yes,no,fiber_optic,no,yes,yes,yes,yes,yes,one_year,yes,bank_transfer_(automatic),103.7,5656.75


In [8]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4225 entries, 0 to 4224
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            4225 non-null   object 
 1   seniorcitizen     4225 non-null   int64  
 2   partner           4225 non-null   object 
 3   dependents        4225 non-null   object 
 4   tenure            4225 non-null   int64  
 5   phoneservice      4225 non-null   object 
 6   multiplelines     4225 non-null   object 
 7   internetservice   4225 non-null   object 
 8   onlinesecurity    4225 non-null   object 
 9   onlinebackup      4225 non-null   object 
 10  deviceprotection  4225 non-null   object 
 11  techsupport       4225 non-null   object 
 12  streamingtv       4225 non-null   object 
 13  streamingmovies   4225 non-null   object 
 14  contract          4225 non-null   object 
 15  paperlessbilling  4225 non-null   object 
 16  paymentmethod     4225 non-null   object 


In [7]:
# First two entries of the training target array.
y_train[:2]

array([1, 0], dtype=uint8)

In [10]:
# Remove the columns that are not used as model features.
# Rebinding `train` (instead of `inplace=True`) avoids mutating the frame that
# earlier cells already displayed, and `errors='ignore'` makes the cell
# idempotent: re-running it after the columns are gone no longer raises KeyError.
train = train.drop(
    columns=['paymentmethod', 'multiplelines', 'gender', 'phoneservice', 'totalcharges'],
    errors='ignore',
)

Encoding with `DictVectorizer`

In [12]:
# Step 1. Turn the rows into a list of dictionaries (showing only the first record).
train.head(1).to_dict(orient='records')

[{'seniorcitizen': 0,
  'partner': 'no',
  'dependents': 'no',
  'tenure': 3,
  'internetservice': 'no',
  'onlinesecurity': 'no_internet_service',
  'onlinebackup': 'no_internet_service',
  'deviceprotection': 'no_internet_service',
  'techsupport': 'no_internet_service',
  'streamingtv': 'no_internet_service',
  'streamingmovies': 'no_internet_service',
  'contract': 'month-to-month',
  'paperlessbilling': 'no',
  'monthlycharges': 19.85}]

In [13]:
# One {column: value} dict per training row; this list is what gets passed
# to the DictVectorizer in the next step.
train_dict = train.to_dict(orient='records')

In [15]:
# Step 2. Create a dense-output DictVectorizer, learn the feature mapping
# from the training records, then encode those same records.
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train_dv = dv.transform(train_dict)

# Show the first encoded row.
X_train_dv[:1]

array([[ 1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,
         0.  ,  1.  , 19.85,  0.  ,  1.  ,  0.  ,  0.  ,  1.  ,  0.  ,
         1.  ,  0.  ,  1.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.  ,  0.  ,
         1.  ,  0.  ,  0.  ,  1.  ,  0.  ,  3.  ]])

In [16]:
# (rows, encoded features) of the DictVectorizer output.
X_train_dv.shape

(4225, 33)

In [17]:
# Names of the encoded columns, in the same order as the columns of X_train_dv.
# String-valued fields appear as `field=value`; numeric fields keep their name.
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'internetservice=dsl',
       'internetservice=fiber_optic', 'internetservice=no',
       'monthlycharges', 'onlinebackup=no',
       'onlinebackup=no_internet_service', 'onlinebackup=yes',
       'onlinesecurity=no', 'onlinesecurity=no_internet_service',
       'onlinesecurity=yes', 'paperlessbilling=no',
       'paperlessbilling=yes', 'partner=no', 'partner=yes',
       'seniorcitizen', 'streamingmovies=no',
       'streamingmovies=no_internet_service', 'streamingmovies=yes',
       'streamingtv=no', 'streamingtv=no_internet_service',
       'streamingtv=yes', 'techsupport=no',
       'techsupport=no_internet_service', 'techsupport=yes', 'tenure'],
      dtype=object)

Encoding with `OneHotEncoder`

In [32]:
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

In [28]:
# Column-name lists supplied by the project-local `helper` module; they are
# used below to select the columns to one-hot encode vs. pass through as-is.
# NOTE(review): what `explore=False` changes is defined in helper.py — confirm there.
categorical = h.get_categorical(explore=False)
numerical = h.get_numerical(explore=False)

In [30]:
# Full feature list: categorical column names followed by the numerical ones.
[*categorical, *numerical]

['seniorcitizen',
 'partner',
 'dependents',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'tenure',
 'monthlycharges']

In [34]:
# Dense one-hot encoder that drops the first category of every feature.
# NOTE(review): with `drop='first'` and `handle_unknown='ignore'` together, a
# category unseen during fit is presumably encoded as all zeros — the same
# encoding as the dropped baseline category. Confirm against the scikit-learn
# OneHotEncoder docs before transforming the validation / test sets.
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)

In [44]:
# Fit the encoder on the categorical columns and place the untouched numerical
# columns to the right of the one-hot block (column-wise stack).
X_train_ohe = np.hstack((
    ohe.fit_transform(train[categorical]),
    train[numerical].to_numpy(),
))

In [45]:
# (rows, features) of the OneHotEncoder-based matrix: one-hot columns plus the numerical columns.
X_train_ohe.shape

(4225, 22)

Create validation and test sets with `DictVectorizer` and `OneHotEncoder`