## Encoding Categorical Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the customer dataset used for the ordinal and label encoding examples.
df = pd.read_csv("./DATA/customer.csv")
# Random 5-row preview; no random_state is passed, so the rows shown differ on each run.
df.sample(5)

Unnamed: 0,age,gender,review,education,purchased
41,23,Male,Good,PG,Yes
38,45,Female,Good,School,No
17,22,Female,Poor,UG,Yes
25,57,Female,Good,School,No
44,77,Female,Average,UG,No


In [3]:
# Keep the columns from position 2 onward, i.e. drop the first two columns (age, gender).
df_ordinal = df.iloc[:, 2:]
# Confirm which columns remain.
df_ordinal.columns

Index(['review', 'education', 'purchased'], dtype='object')

In [4]:
from sklearn.model_selection import train_test_split

In [14]:
# Features are the first two columns (review, education); the target is the last column.
# A fixed random_state makes the 90/10 split reproducible under Restart & Run All;
# without it every run draws a different split, so the encoded outputs shown further
# down could never be regenerated.
X_train, X_test, y_train, y_test = train_test_split(
    df_ordinal.iloc[:, 0:2],
    df_ordinal.iloc[:, -1],
    test_size=0.1,
    random_state=42,
)

X_train.columns

Index(['review', 'education'], dtype='object')

In [15]:
from sklearn.preprocessing import OrdinalEncoder

# Pass the categories in their intended rank order, one list per input column
# (review first, then education), so the lowest level gets the smallest code:
#   review:    Poor -> 0, Average -> 1, Good -> 2
#   education: School -> 0, UG -> 1, PG -> 2
ord_encoder = OrdinalEncoder(categories=[["Poor", "Average", "Good"], ["School", "UG", "PG"]])

In [16]:
# Fit on the training split only, so X_test plays no part in fitting the encoder.
ord_encoder.fit(X_train)

In [17]:
# Apply the encoder fitted on X_train to both splits so they share one mapping.
X_train_transform = ord_encoder.transform(X_train)
X_test_transform = ord_encoder.transform(X_test)

In [23]:
# Sanity check: (rows, columns) of the encoded training features.
X_train_transform.shape

(45, 2)

In [25]:
# Peek at the first 10 encoded rows; each row holds the codes for the two input columns.
X_train_transform[:10, :]

array([[0., 2.],
       [0., 2.],
       [0., 2.],
       [2., 2.],
       [2., 2.],
       [1., 1.],
       [1., 0.],
       [0., 2.],
       [1., 1.],
       [0., 2.]])

## Label Encoding

In [27]:
# Use label encoding only for the target column (y), not for input features.

from sklearn.preprocessing import LabelEncoder

labe_encoder = LabelEncoder()

# Learn the set of distinct labels from the training target only.
labe_encoder.fit(y_train)

In [28]:
# Store the encoded targets under new names instead of overwriting y_train / y_test.
# Overwriting made this cell non-idempotent: on a second run the encoder would receive
# the already-encoded integers, which are not among the labels it was fitted on, and
# transform() would raise. Keeping the raw labels also leaves them available for inspection.
y_train_encoded = labe_encoder.transform(y_train)
y_test_encoded = labe_encoder.transform(y_test)

y_train_encoded[:10]

array([1, 1, 0, 0, 0, 1, 0, 0, 0, 1])

## One Hot Encoding

In [29]:
# Load the cars dataset used for the one-hot encoding examples.
df_new = pd.read_csv("./DATA/cars.csv")

In [31]:
# Random 5-row preview; no random_state is passed, so the rows shown differ on each run.
df_new.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
117,Maruti,40000,Petrol,First Owner,595000
4330,Maruti,5621,Petrol,First Owner,650000
3190,Maruti,23700,Petrol,First Owner,470000
3484,Maruti,35000,Petrol,First Owner,190000
4237,Tata,110000,Diesel,First Owner,503000


In [33]:
# Number of distinct brands; one-hot encoding this column would add one column per brand.
df_new["brand"].nunique()

32

In [34]:
# Number of distinct fuel types.
df_new["fuel"].nunique()

4

## OHE using Pandas

In [36]:
# One-hot encode only fuel and owner. brand is left untouched here because it has
# too many categories in our dataset (32 distinct values).
pd.get_dummies(df_new, columns=["fuel", "owner"])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [38]:
# dtype=np.int64 makes the indicator columns 0/1 integers instead of the default booleans.
pd.get_dummies(df_new, columns=["fuel", "owner"], dtype=np.int64)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,0,1,0,0,1,0,0,0,0
1,Skoda,120000,370000,0,1,0,0,0,0,1,0,0
2,Honda,140000,158000,0,0,0,1,0,0,0,0,1
3,Hyundai,127000,225000,0,1,0,0,1,0,0,0,0
4,Maruti,120000,130000,0,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,0,1,1,0,0,0,0
8124,Hyundai,119000,135000,0,1,0,0,0,1,0,0,0
8125,Maruti,120000,382000,0,1,0,0,1,0,0,0,0
8126,Tata,25000,290000,0,1,0,0,1,0,0,0,0


In [40]:
# brand is again left out because it has too many categories in our dataset.
# drop_first=True drops the first level of each encoded column (here fuel_CNG and
# owner_First Owner) to avoid the dummy variable trap: the dropped level is already
# implied when all remaining indicators of that column are 0.
pd.get_dummies(df_new, columns=["fuel", "owner"], dtype=np.int64, drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,1,0,0,0,0,0,0
1,Skoda,120000,370000,1,0,0,0,1,0,0
2,Honda,140000,158000,0,0,1,0,0,0,1
3,Hyundai,127000,225000,1,0,0,0,0,0,0
4,Maruti,120000,130000,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,0,0,1,0,0,0,0
8124,Hyundai,119000,135000,1,0,0,1,0,0,0
8125,Maruti,120000,382000,1,0,0,0,0,0,0
8126,Tata,25000,290000,1,0,0,0,0,0,0


## OHE using scikit-learn

In [52]:
from sklearn.model_selection import train_test_split

# Features are the first four columns (brand, km_driven, fuel, owner); the target is
# the last column (selling_price).
# A fixed random_state makes the 90/10 split reproducible under Restart & Run All;
# without it every run draws a different split.
X_train, X_test, y_train, y_test = train_test_split(
    df_new.iloc[:, 0:4],
    df_new.iloc[:, -1],
    test_size=0.1,
    random_state=42,
)

In [53]:
# Random 5-row preview of the training features; rows differ on each run.
X_train.sample(5)

Unnamed: 0,brand,km_driven,fuel,owner
7030,Hyundai,15000,Petrol,First Owner
7536,Maruti,84000,Petrol,Second Owner
461,Toyota,250000,Diesel,First Owner
4149,Maruti,110000,Petrol,First Owner
620,Maruti,30000,Petrol,First Owner


In [54]:
from sklearn.preprocessing import OneHotEncoder

In [57]:
# drop="first" omits the first category of each feature, which avoids the dummy
# variable trap (one indicator per feature would otherwise be redundant).
ohe = OneHotEncoder(drop="first")

In [58]:
# Learn the fuel / owner categories from the training split and encode it in one step.
# .toarray() converts the encoder's sparse result into a dense NumPy array.
X_train_new = ohe.fit_transform(X_train[["fuel", "owner"]]).toarray()
# Expect one column per category minus the dropped first level of each feature.
X_train_new.shape

(7315, 7)

In [59]:
# Reuse the encoder fitted on the training split (transform, not fit_transform) so the
# test columns line up with the training columns.
X_test_new = ohe.transform(X_test[["fuel", "owner"]]).toarray()
X_test_new.shape

(813, 7)

In [60]:
# Join the untouched brand / km_driven columns with the one-hot columns side by side
# (column-wise) and check the combined shape.
np.concatenate([X_train[["brand", "km_driven"]].to_numpy(), X_train_new], axis=1).shape

(7315, 9)

## OHE for top categories

In [69]:
# Number of rows per brand, most frequent first; show the 15 most frequent brands.
counts = df_new["brand"].value_counts()
counts.head(15)

brand
Maruti        2448
Hyundai       1415
Mahindra       772
Tata           734
Toyota         488
Honda          467
Ford           397
Chevrolet      230
Renault        228
Volkswagen     186
BMW            120
Skoda          105
Nissan          81
Jaguar          71
Volvo           67
Name: count, dtype: int64

In [65]:
# Brands that appear at most `threshold` times are considered rare and will be grouped.
threshold = 100

repl = counts.loc[counts.le(threshold)].index

In [66]:
# Inspect the brand labels selected for grouping.
repl

Index(['Nissan', 'Jaguar', 'Volvo', 'Datsun', 'Mercedes-Benz', 'Fiat', 'Audi',
       'Lexus', 'Jeep', 'Mitsubishi', 'Force', 'Land', 'Isuzu', 'Kia',
       'Ambassador', 'Daewoo', 'MG', 'Ashok', 'Opel', 'Peugeot'],
      dtype='object', name='brand')

In [68]:
# Collapse every brand listed in `repl` into a single "uncommon" bucket, then one-hot
# encode the result so only the frequent brands get their own column.
pd.get_dummies(df_new["brand"].mask(df_new["brand"].isin(repl), "uncommon"))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
