## One Hot Encoding

- Replaces each categorical variable with a set of boolean (0/1) variables, one per category 

- Each variable is called a dummy variable 

- For gender, the dummy variables could be, for example, Male, Female and Non-Binary 

## Number of Dummies 

- By default, pandas and sklearn produce K dummy variables, where K is the number of unique labels in the variable 

- When K=2, drop one dummy variable 

- When K!=2, drop one dummy variable if the remaining K-1 dummies still provide complete information (a row where all K-1 dummies are 0 identifies the dropped category)

- Always use K-1 dummies for linear regression models: the model **looks** at all the variables at once while fitting to the train set, so a full set of K dummies is perfectly collinear with the intercept (the "dummy variable trap") 






In [306]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [307]:
from google.colab import drive

# Mount Google Drive into the Colab VM, then load the training CSV from it.
drive.mount('/content/gdrive')
# Absolute path under the mount point, so the read does not depend on the
# notebook's current working directory.
data = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/FeatureEngineering/train.csv")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [308]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [309]:
# Inspect the column labels before choosing which ones to keep.
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [310]:
# Keep the target (Survived) and the candidate features; PassengerId and Name
# are left out.
keep_cols = [
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp',
    'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked',
]
data = data[keep_cols]

In [311]:
# Collect every object-dtype column, then report how many distinct values
# each one holds.
categoricals = [name for name in data.columns if data[name].dtypes == 'O']
for col in categoricals:
    print('{} categories : {} '.format(col, len(data[col].unique())))

Sex categories : 2 
Ticket categories : 681 
Cabin categories : 148 
Embarked categories : 4 


In [312]:
# Get the categorical variables with FEWER than n categories, i.e. the
# low-cardinality ones that are practical to one-hot encode.
n = 8
cats = []
for col in data.columns:
    if data[col].dtypes == 'O':
        # unique() includes NaN, so a column with missing values reports one
        # extra "category" (Embarked shows 4 for the labels C, Q, S).
        n_labels = len(data[col].unique())
        if n_labels < n:
            print('{} categories : {} '.format(col, n_labels))
            cats.append(col)

Sex categories : 2 
Embarked categories : 4 


In [313]:
# Percentage of all rows that fall under each label of the low-cardinality
# categorical variables. Rows whose label is missing belong to no group, so a
# column with NaNs will not sum to 100.
for col in cats:
    if data[col].dtypes == 'O':  # if the variable is categorical
        # The np.float alias was removed in NumPy 1.24; the builtin float is
        # the exact replacement.
        print(100 * data.groupby(col)[col].count() / float(len(data)))
        print()

Sex
female    35.241302
male      64.758698
Name: Sex, dtype: float64

Embarked
C    18.855219
Q     8.641975
S    72.278339
Name: Embarked, dtype: float64



In [315]:
# Full one-hot encoding of Sex: one indicator column per label (K = 2).
sex_onehot = pd.get_dummies(data['Sex'])
sex_onehot.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [318]:
# Place the full set of Sex indicators to the right of the original columns.
sex_onehot = pd.get_dummies(data['Sex'])
pd.concat([data, sex_onehot], axis='columns').head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,female,male
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,1
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,1,0
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1,0
3,1,1,female,35.0,1,0,113803,53.1,C123,S,1,0
4,0,3,male,35.0,0,0,373450,8.05,,S,0,1


In [319]:
# K-1 encoding of Sex: drop_first=True removes the first label's column,
# leaving a single `male` indicator.
sex_k_minus_1 = pd.get_dummies(data['Sex'], drop_first=True)
sex_k_minus_1.head()

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1


In [321]:
# Append the single K-1 Sex indicator to the right of the original columns.
sex_k_minus_1 = pd.get_dummies(data['Sex'], drop_first=True)
pd.concat([data, sex_k_minus_1], axis='columns').head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S,1
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,1,1,female,35.0,1,0,113803,53.1,C123,S,0
4,0,3,male,35.0,0,0,373450,8.05,,S,1


In [322]:
# Full one-hot encoding of Embarked: one indicator column per port label.
embarked_onehot = pd.get_dummies(data['Embarked'])
embarked_onehot.head()

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [323]:
# Place the full set of Embarked indicators to the right of the original columns.
embarked_onehot = pd.get_dummies(data['Embarked'])
pd.concat([data, embarked_onehot], axis='columns').head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,C,Q,S
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,0,1
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,1,0,0
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,0,1
3,1,1,female,35.0,1,0,113803,53.1,C123,S,0,0,1
4,0,3,male,35.0,0,0,373450,8.05,,S,0,0,1


In [324]:
# K-1 encoding of Embarked: drop_first=True removes the first label's column
# (C), so a row with Q = 0 and S = 0 stands for C.
embarked_k_minus_1 = pd.get_dummies(data['Embarked'], drop_first=True)
embarked_k_minus_1.head()

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


In [325]:
# Append the K-1 Embarked indicators to the right of the original columns.
embarked_k_minus_1 = pd.get_dummies(data['Embarked'], drop_first=True)
pd.concat([data, embarked_k_minus_1], axis='columns').head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Q,S
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S,0,1
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,0,0
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,1
3,1,1,female,35.0,1,0,113803,53.1,C123,S,0,1
4,0,3,male,35.0,0,0,373450,8.05,,S,0,1


## Doing this in Sklearn 

In [326]:
from sklearn.preprocessing import LabelEncoder

# Encode on a copy so the original frame keeps its string labels.
data_t = data.copy()

# LabelEncoder replaces each label with an integer code; for the two-label
# Sex column that yields a single 0/1 column.
obj = LabelEncoder()
obj.fit(data['Sex'])
data_t['Sex'] = obj.transform(data['Sex'])


In [327]:
# Preview the label-encoded copy: Sex now holds integer codes.
data_t.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,A/5 21171,7.25,,S
1,1,1,0,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,0,35.0,1,0,113803,53.1,C123,S
4,0,3,1,35.0,0,0,373450,8.05,,S


In [338]:
# Snapshot the un-encoded frame so later cells can restore it after
# overwriting `data`.
data_save = data.copy(deep=True)

In [339]:
# Check the column order before encoding (Sex is at position 2).
data.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked'],
      dtype='object')

In [340]:
from sklearn.preprocessing import OneHotEncoder

# Start from the saved snapshot so re-running this cell always encodes the
# original frame rather than an already-encoded one.
data = data_save.copy()

# handle_unknown='ignore': a label not seen during fit is encoded as all
# zeros at transform time instead of raising an error.
onehotencoder = OneHotEncoder(handle_unknown = 'ignore')

# Select the column by name (it is column 2) and keep it 2-D, as the encoder
# expects; the sparse result is densified for use in a DataFrame.
temp = onehotencoder.fit_transform(data[['Sex']]).toarray()

# Give the encoded block the frame's own index so the column-wise concat
# aligns row by row even if `data` does not carry a default RangeIndex.
# The new columns keep their default integer labels (0, 1, ...).
encoded = pd.DataFrame(temp, index=data.index)
data = pd.concat([encoded, data.drop(columns=['Sex'])], axis=1)

# For a held-out set, reuse the fitted encoder (onehotencoder.transform, never
# fit_transform) and apply the same drop + concat steps.

In [341]:
# Preview the frame with the one-hot columns (labelled 0 and 1) prepended.
data.head(n=5)

Unnamed: 0,0,1,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0.0,1.0,0,3,22.0,1,0,A/5 21171,7.25,,S
1,1.0,0.0,1,1,38.0,1,0,PC 17599,71.2833,C85,C
2,1.0,0.0,1,3,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1.0,0.0,1,1,35.0,1,0,113803,53.1,C123,S
4,0.0,1.0,0,3,35.0,0,0,373450,8.05,,S


In [343]:
from sklearn.preprocessing import OneHotEncoder

# Restore the un-encoded frame so this cell can be re-run safely.
data = data_save.copy()

# drop='first': omit the first category's column, giving K-1 dummies.
onehotencoder = OneHotEncoder(drop = 'first')

# Select the column by name (it is column 2) and keep it 2-D, as the encoder
# expects; the sparse result is densified for use in a DataFrame.
temp = onehotencoder.fit_transform(data[['Sex']]).toarray()

# Give the encoded block the frame's own index so the column-wise concat
# aligns row by row even if `data` does not carry a default RangeIndex.
# The new column keeps its default integer label (0).
encoded = pd.DataFrame(temp, index=data.index)
data = pd.concat([encoded, data.drop(columns=['Sex'])], axis=1)

# For a held-out set, reuse the fitted encoder (onehotencoder.transform, never
# fit_transform) and apply the same drop + concat steps.

In [344]:
# Preview the frame with the single K-1 indicator column (labelled 0) prepended.
data.head(n=5)

Unnamed: 0,0,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0,3,22.0,1,0,A/5 21171,7.25,,S
1,0.0,1,1,38.0,1,0,PC 17599,71.2833,C85,C
2,0.0,1,3,26.0,0,0,STON/O2. 3101282,7.925,,S
3,0.0,1,1,35.0,1,0,113803,53.1,C123,S
4,1.0,0,3,35.0,0,0,373450,8.05,,S
