# One-hot Encoding
---
One-hot encoding transforms (binarizes) a categorical variable: each category becomes its own column, which holds 1 for the rows that belong to that category and 0 otherwise.

In [13]:
import pandas as pd

# Load the employee dataset (the path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv('../datasets/hum-res.csv')
df.head()

Unnamed: 0,Employee Name,Employee Number,State,Zip,DOB,Age,Sex,MaritalDesc,CitizenDesc,Hispanic/Latino,...,Date of Hire,Date of Termination,Reason For Term,Employment Status,Department,Position,Pay Rate,Manager Name,Employee Source,Performance Score
0,"Brown, Mia",1103024000.0,MA,1450.0,11/24/1985,32.0,Female,Married,US Citizen,No,...,10/27/2008,,N/A - still employed,Active,Admin Offices,Accountant I,28.5,Brandon R. LeBlanc,Diversity Job Fair,Fully Meets
1,"LaRotonda, William",1106027000.0,MA,1460.0,4/26/1984,33.0,Male,Divorced,US Citizen,No,...,1/6/2014,,N/A - still employed,Active,Admin Offices,Accountant I,23.0,Brandon R. LeBlanc,Website Banner Ads,Fully Meets
2,"Steans, Tyrone",1302053000.0,MA,2703.0,9/1/1986,31.0,Male,Single,US Citizen,No,...,9/29/2014,,N/A - still employed,Active,Admin Offices,Accountant I,29.0,Brandon R. LeBlanc,Internet Search,Fully Meets
3,"Howard, Estelle",1211051000.0,MA,2170.0,9/16/1985,32.0,Female,Married,US Citizen,No,...,2/16/2015,4/15/2015,N/A - still employed,Active,Admin Offices,Administrative Assistant,21.5,Brandon R. LeBlanc,Pay Per Click - Google,N/A- too early to review
4,"Singh, Nan",1307060000.0,MA,2330.0,5/19/1988,29.0,Female,Single,US Citizen,No,...,5/1/2015,,N/A - still employed,Active,Admin Offices,Administrative Assistant,16.56,Brandon R. LeBlanc,Website Banner Ads,N/A- too early to review


## Transforming a category

In [14]:
# Distinct values of the column we are about to encode. Note that the result
# includes NaN, i.e. some rows have no marital status recorded.
df['MaritalDesc'].unique()

array(['Married', 'Divorced', 'Single', 'Separated', 'widowed', nan],
      dtype=object)

In [15]:
# Build one indicator column per category of 'MaritalDesc'. The NaN value seen
# in the unique() output gets no column of its own (see the column list below).
dumm = pd.get_dummies(df['MaritalDesc'])
dumm

Unnamed: 0,Divorced,Married,Separated,Single,widowed
0,0,1,0,0,0
1,1,0,0,0,0
2,0,0,0,1,0
3,0,1,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
297,0,1,0,0,0
298,0,0,0,1,0
299,0,0,0,1,0
300,0,0,0,1,0


### Concatenating the dummy (one-hot encoded) columns

In [16]:
# Place the indicator columns in front of the original columns, then remove
# the 'MaritalDesc' source column, which the indicators now replace.
df_new = pd.concat([dumm, df], axis='columns').drop(columns=['MaritalDesc'])
df_new.head()

Unnamed: 0,Divorced,Married,Separated,Single,widowed,Employee Name,Employee Number,State,Zip,DOB,...,Date of Hire,Date of Termination,Reason For Term,Employment Status,Department,Position,Pay Rate,Manager Name,Employee Source,Performance Score
0,0,1,0,0,0,"Brown, Mia",1103024000.0,MA,1450.0,11/24/1985,...,10/27/2008,,N/A - still employed,Active,Admin Offices,Accountant I,28.5,Brandon R. LeBlanc,Diversity Job Fair,Fully Meets
1,1,0,0,0,0,"LaRotonda, William",1106027000.0,MA,1460.0,4/26/1984,...,1/6/2014,,N/A - still employed,Active,Admin Offices,Accountant I,23.0,Brandon R. LeBlanc,Website Banner Ads,Fully Meets
2,0,0,0,1,0,"Steans, Tyrone",1302053000.0,MA,2703.0,9/1/1986,...,9/29/2014,,N/A - still employed,Active,Admin Offices,Accountant I,29.0,Brandon R. LeBlanc,Internet Search,Fully Meets
3,0,1,0,0,0,"Howard, Estelle",1211051000.0,MA,2170.0,9/16/1985,...,2/16/2015,4/15/2015,N/A - still employed,Active,Admin Offices,Administrative Assistant,21.5,Brandon R. LeBlanc,Pay Per Click - Google,N/A- too early to review
4,0,0,0,1,0,"Singh, Nan",1307060000.0,MA,2330.0,5/19/1988,...,5/1/2015,,N/A - still employed,Active,Admin Offices,Administrative Assistant,16.56,Brandon R. LeBlanc,Website Banner Ads,N/A- too early to review


In [17]:
def one_hot_encode(dataframe: pd.DataFrame, feature: str) -> pd.DataFrame:
    """Replace a categorical column with its one-hot encoded indicator columns.

    The indicator columns (one per distinct value of ``feature``) are placed
    in front of the remaining columns, and the original ``feature`` column is
    removed. The input frame is not modified; a new DataFrame is returned.

    Missing values get no indicator column of their own (``pd.get_dummies``
    default ``dummy_na=False``).

    Parameters
    ----------
    dataframe : pd.DataFrame
        Frame containing the column to encode.
    feature : str
        Name of the categorical column to encode.

    Returns
    -------
    pd.DataFrame
        Indicator columns followed by every column of ``dataframe`` except
        ``feature``.

    Raises
    ------
    KeyError
        If ``feature`` is not a column of ``dataframe``.
    """
    if feature not in dataframe.columns:
        raise KeyError(f"column {feature!r} not found in dataframe")

    dumm = pd.get_dummies(dataframe[feature])
    # Drop the source column *before* concatenating: dropping by label
    # afterwards would also remove an indicator column whose category value
    # happens to equal the feature name.
    remaining = dataframe.drop(columns=[feature])
    return pd.concat([dumm, remaining], axis=1)

In [18]:
# Column list before 'Sex' is encoded, for comparison with the next cell.
df_new.columns

Index(['Divorced', 'Married', 'Separated', 'Single', 'widowed',
       'Employee Name', 'Employee Number', 'State', 'Zip', 'DOB', 'Age', 'Sex',
       'CitizenDesc', 'Hispanic/Latino', 'RaceDesc', 'Date of Hire',
       'Date of Termination', 'Reason For Term', 'Employment Status',
       'Department', 'Position', 'Pay Rate', 'Manager Name', 'Employee Source',
       'Performance Score'],
      dtype='object')

In [19]:
# Encode 'Sex' with the helper. This rebinds df_new, so the cell cannot be
# re-run on its own: a second run raises KeyError because 'Sex' is gone.
# NOTE(review): the result has both 'Male' and 'male' columns, so the raw
# values are inconsistently cased -- consider normalizing before encoding.
df_new = one_hot_encode(dataframe=df_new, feature='Sex')
df_new.columns

Index(['Female', 'Male', 'male', 'Divorced', 'Married', 'Separated', 'Single',
       'widowed', 'Employee Name', 'Employee Number', 'State', 'Zip', 'DOB',
       'Age', 'CitizenDesc', 'Hispanic/Latino', 'RaceDesc', 'Date of Hire',
       'Date of Termination', 'Reason For Term', 'Employment Status',
       'Department', 'Position', 'Pay Rate', 'Manager Name', 'Employee Source',
       'Performance Score'],
      dtype='object')

In [20]:
# Preview the frame after both 'MaritalDesc' and 'Sex' have been encoded.
df_new.head()

Unnamed: 0,Female,Male,male,Divorced,Married,Separated,Single,widowed,Employee Name,Employee Number,...,Date of Hire,Date of Termination,Reason For Term,Employment Status,Department,Position,Pay Rate,Manager Name,Employee Source,Performance Score
0,1,0,0,0,1,0,0,0,"Brown, Mia",1103024000.0,...,10/27/2008,,N/A - still employed,Active,Admin Offices,Accountant I,28.5,Brandon R. LeBlanc,Diversity Job Fair,Fully Meets
1,0,1,0,1,0,0,0,0,"LaRotonda, William",1106027000.0,...,1/6/2014,,N/A - still employed,Active,Admin Offices,Accountant I,23.0,Brandon R. LeBlanc,Website Banner Ads,Fully Meets
2,0,1,0,0,0,0,1,0,"Steans, Tyrone",1302053000.0,...,9/29/2014,,N/A - still employed,Active,Admin Offices,Accountant I,29.0,Brandon R. LeBlanc,Internet Search,Fully Meets
3,1,0,0,0,1,0,0,0,"Howard, Estelle",1211051000.0,...,2/16/2015,4/15/2015,N/A - still employed,Active,Admin Offices,Administrative Assistant,21.5,Brandon R. LeBlanc,Pay Per Click - Google,N/A- too early to review
4,1,0,0,0,0,0,1,0,"Singh, Nan",1307060000.0,...,5/1/2015,,N/A - still employed,Active,Admin Offices,Administrative Assistant,16.56,Brandon R. LeBlanc,Website Banner Ads,N/A- too early to review
