In [1]:
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import numpy as np

# One hot encoder with built in majors

In [2]:
enc= OneHotEncoder()

In [7]:
# if estimators finds any unknown categories, it will generate error
majors = [['Engineering'],
         ['Math'],
         ['Chemistry']]

In [4]:
enc.fit(majors)

OneHotEncoder()

In [5]:
enc.transform(majors).toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [6]:
enc.categories_

[array(['Chemistry', 'Engineering', 'Math'], dtype=object)]

In [17]:
majors = [['Media Studies'],
         ['Math'],
         ['Stats']]

In [18]:
enc.transform(majors).toarray()

ValueError: Found unknown categories ['Media Studies', 'Stats'] in column 0 during transform

In [19]:
#this ignores the unknown categories
enc_unk=OneHotEncoder(handle_unknown='ignore')

enc_unk.fit(majors)

OneHotEncoder(handle_unknown='ignore')

In [21]:
enc_unk.transform(majors).toarray()

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

# One hot encoding for our given files

In [40]:
data = pd.read_csv('datasets/exams.csv')

In [41]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,associate's degree,standard,none,55,61,58
1,female,group B,high school,free/reduced,none,59,70,66
2,female,group D,some high school,free/reduced,none,17,18,26
3,male,group A,associate's degree,free/reduced,none,63,61,58
4,female,group E,some high school,standard,none,70,80,79


In [42]:
data['parental level of education'].unique()

array(["associate's degree", 'high school', 'some high school',
       "bachelor's degree", "master's degree", 'some college'],
      dtype=object)

In [43]:
parental_ohe = OneHotEncoder()

In [44]:
parental_ohe_transformed = parental_ohe.fit_transform(data[['parental level of education']])

parental_ohe_transformed

<100x6 sparse matrix of type '<class 'numpy.float64'>'
	with 100 stored elements in Compressed Sparse Row format>

In [45]:
parental_ohe_transformed.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0.

In [46]:
data_df = pd.DataFrame(parental_ohe_transformed.toarray(), columns = parental_ohe.categories_, dtype=np.int)

In [47]:
data_df

Unnamed: 0,associate's degree,bachelor's degree,high school,master's degree,some college,some high school
0,1,0,0,0,0,0
1,0,0,1,0,0,0
2,0,0,0,0,0,1
3,1,0,0,0,0,0
4,0,0,0,0,0,1
...,...,...,...,...,...,...
95,1,0,0,0,0,0
96,0,0,0,0,1,0
97,0,1,0,0,0,0
98,1,0,0,0,0,0


As this is a very long procedure, pandas have given us a better function which performs the same activity

In [49]:
data_dummy = pd.get_dummies(data['parental level of education'])

data_dummy.head()

Unnamed: 0,associate's degree,bachelor's degree,high school,master's degree,some college,some high school
0,1,0,0,0,0,0
1,0,0,1,0,0,0
2,0,0,0,0,0,1
3,1,0,0,0,0,0
4,0,0,0,0,0,1


In [50]:
data = pd.concat([data, data_dummy], axis=1)

In [51]:
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,associate's degree,bachelor's degree,high school,master's degree,some college,some high school
0,female,group B,associate's degree,standard,none,55,61,58,1,0,0,0,0,0
1,female,group B,high school,free/reduced,none,59,70,66,0,0,1,0,0,0
2,female,group D,some high school,free/reduced,none,17,18,26,0,0,0,0,0,1
3,male,group A,associate's degree,free/reduced,none,63,61,58,1,0,0,0,0,0
4,female,group E,some high school,standard,none,70,80,79,0,0,0,0,0,1
