# Convert Categorical Variables

In [1]:
"""
HOW TO encode a categorical feature as one-hot numeric columns.

Every category gets its own binary column (or only k-1 of them when the
first level is dropped). Two routes are shown: pandas.get_dummies and
scikit-learn's OneHotEncoder, whose transform output is converted to a
dense array with .toarray().
"""
import pandas as pd

# Load the example data; a comma is already read_csv's default separator.
df = pd.read_csv("data_example.csv")
df

Unnamed: 0,type_of_food,some_other_attribute
0,fruit,0.3
1,vegetable,0.4
2,fruit,0.2
3,meat,0.3
4,fruit,0.6
5,vegetable,0.7
6,fruit,0.3
7,meat,0.4
8,vegetable,0.2
9,vegetable,0.3


In [7]:
# Turn the categorical column into dummy/indicator columns.
# drop_first=True keeps k-1 dummies out of the k category levels by removing
# the first level.
dummy = pd.get_dummies(df["type_of_food"], drop_first=True)

In [8]:
# Show the indicator columns; the first level ("fruit") has no column of its
# own because drop_first=True removed it.
dummy

Unnamed: 0,meat,vegetable
0,0,0
1,0,1
2,0,0
3,1,0
4,0,0
5,0,1
6,0,0
7,1,0
8,0,1
9,0,1


In [10]:
# Append the indicator columns to the right of the original frame
# (column-wise concatenation, rows matched on the index).
new_df = pd.concat(objs=[df, dummy], axis="columns")

In [5]:
# Display the combined frame: the original columns followed by the dummy columns.
new_df

Unnamed: 0,type_of_food,some_other_attribute,meat,vegetable
0,fruit,0.3,0,0
1,vegetable,0.4,0,1
2,fruit,0.2,0,0
3,meat,0.3,1,0
4,fruit,0.6,0,0
5,vegetable,0.7,0,1
6,fruit,0.3,0,0
7,meat,0.4,1,0
8,vegetable,0.2,0,1
9,vegetable,0.3,0,1


In [11]:
# Show the frame without the original categorical column.
# The label is passed through the `columns` keyword: the positional axis
# argument (`drop("type_of_food", 1)`) is deprecated and rejected by pandas 2.x.
# The result is only displayed, not assigned, so new_df itself still contains
# "type_of_food".
new_df.drop(columns="type_of_food")


Unnamed: 0,some_other_attribute,meat,vegetable
0,0.3,0,0
1,0.4,0,1
2,0.2,0,0
3,0.3,1,0
4,0.6,0,0
5,0.7,0,1
6,0.3,0,0
7,0.4,1,0
8,0.2,0,1
9,0.3,0,1


## scikit-learn OneHotEncoder

In [12]:
from sklearn.preprocessing import OneHotEncoder

In [13]:
# handle_unknown='ignore': per the scikit-learn documentation, a category that
# was not seen during fit is encoded as all zeros at transform time instead of
# raising an error.
encoder = OneHotEncoder(handle_unknown='ignore')


In [14]:
# Double brackets select a one-column DataFrame (2-D) rather than a Series;
# this is the same selection that is handed to the encoder below.
df[["type_of_food"]]

Unnamed: 0,type_of_food
0,fruit
1,vegetable
2,fruit
3,meat
4,fruit
5,vegetable
6,fruit
7,meat
8,vegetable
9,vegetable


In [16]:
# Learn the category levels of the column; they are available afterwards as
# encoder.categories_.
encoder.fit(df[["type_of_food"]])


In [17]:
# A list holding one array of learned levels per input feature (a single
# feature here); the same order is used for the one-hot columns below.
encoder.categories_

[array(['fruit', 'meat', 'vegetable'], dtype=object)]

In [18]:
# Encode the column and convert the transform() result to a dense NumPy array.
# NOTE: this rebinds `dummy`, which earlier held the pd.get_dummies DataFrame.
dummy = encoder.transform(df[["type_of_food"]]).toarray()

In [19]:
# Dense one-hot array: one row per sample, one column per learned category.
dummy

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.]])

In [20]:
# encoder.categories_ is a list with one array per encoded feature. Take the
# single feature's array so the columns form a flat Index; passing the whole
# list makes pandas build a one-level MultiIndex instead of plain labels.
df_categorical = pd.DataFrame(dummy, columns=encoder.categories_[0])

In [21]:
# Display the one-hot array as a DataFrame labelled with the category names.
df_categorical

Unnamed: 0,fruit,meat,vegetable
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,1.0,0.0,0.0
5,0.0,0.0,1.0
6,1.0,0.0,0.0
7,0.0,1.0,0.0
8,0.0,0.0,1.0
9,0.0,0.0,1.0


In [26]:
# Two hand-written one-hot rows to decode; column positions follow
# encoder.categories_ (fruit, meat, vegetable).
encoded=[[0, 1, 0], [1, 0, 0]]

In [27]:
# Map each one-hot row back to its category label.
print(encoder.inverse_transform(encoded))


[['meat']
 ['fruit']]
