Source Link: https://www.youtube.com/watch?v=U5oCv3JKWKA&list=WL&index=347

One Hot Encoder

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the car listings from cars.csv into a DataFrame.
df = pd.read_csv('cars.csv')
# Bare last expression: Jupyter renders the frame as the cell output.
df

Unnamed: 0,Brand,KMs Driven,Fuel,Owner,Price
0,Toyota,1,Diesel,First Owner,2100000
1,Suzuki,100000,Petrol,Second Owner,380000
2,Suzuki,12345,CNG,Third Owner,340000
3,Suzuki,94000,Petrol,Fourth and Above Owner,535000
4,Toyota,100000,Petrol,Test Drive Car,1430000
...,...,...,...,...,...
95,Suzuki,85000,CNG,First Owner,440000
96,Suzuki,52000,CNG,Second Owner,400000
97,Suzuki,1,CNG,Third Owner,305000
98,Suzuki,48000,Petrol,Fourth and Above Owner,450000


In [3]:
# List the distinct values found in the 'Owner' column.
df['Owner'].unique()

array(['First Owner', 'Second Owner', 'Third Owner',
       'Fourth and Above Owner', 'Test Drive Car'], dtype=object)

In [4]:
# Count how many rows fall into each 'Owner' category.
df['Owner'].value_counts()

Owner
First Owner               20
Second Owner              20
Third Owner               20
Fourth and Above Owner    20
Test Drive Car            20
Name: count, dtype: int64

In [5]:
# Inspect the dtype of every column.
df.dtypes

Brand         object
KMs Driven     int64
Fuel          object
Owner         object
Price          int64
dtype: object

In [6]:
# (number of rows, number of columns) of the dataset.
df.shape

(100, 5)

In [7]:
# Element-wise missing-value mask: True wherever a value is missing.
df.isnull()

Unnamed: 0,Brand,KMs Driven,Fuel,Owner,Price
0,False,False,False,False,False
1,False,False,False,False,False
2,False,False,False,False,False
3,False,False,False,False,False
4,False,False,False,False,False
...,...,...,...,...,...
95,False,False,False,False,False
96,False,False,False,False,False
97,False,False,False,False,False
98,False,False,False,False,False


In [8]:
# Number of missing values in each column.
df.isnull().sum()

Brand         0
KMs Driven    0
Fuel          0
Owner         0
Price         0
dtype: int64

In [9]:
# Total number of missing values across the whole frame.
df.isnull().sum().sum()

0

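Note: Avoid 'pd.get_dummies' in ML pipelines. It does not remember the categories seen during training, so the train and test sets can end up with different columns. Use scikit-learn's 'OneHotEncoder' (fit on the training data) instead.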

pd.get_dummies(df, columns=['Fuel', 'Owner']) # One-hot encode 'Fuel' and 'Owner' with pandas

pd.get_dummies(df, columns = ['Fuel', 'Owner'], drop_first=True) # drop_first=True drops the first category's dummy column for each of 'Fuel' and 'Owner'

In [10]:
# Preview the first five rows.
df.head()

Unnamed: 0,Brand,KMs Driven,Fuel,Owner,Price
0,Toyota,1,Diesel,First Owner,2100000
1,Suzuki,100000,Petrol,Second Owner,380000
2,Suzuki,12345,CNG,Third Owner,340000
3,Suzuki,94000,Petrol,Fourth and Above Owner,535000
4,Toyota,100000,Petrol,Test Drive Car,1430000


Train and Test for One Hot Encoder

In [3]:
from sklearn.model_selection import train_test_split

# Select features and target by name rather than by position, so the split
# stays correct even if columns are reordered or new ones are added to the CSV.
features = df[['Brand', 'KMs Driven', 'Fuel', 'Owner']]
target = df['Price']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=50)

In [13]:
# Preview the first five rows of the training features.
X_train.head()

Unnamed: 0,Brand,KMs Driven,Fuel,Owner
44,Suzuki,73000,CNG,Test Drive Car
72,Toyota,130000,Petrol,Third Owner
59,Daihatsu,100,CNG,Test Drive Car
91,Suzuki,83000,Petrol,Second Owner
98,Suzuki,48000,Petrol,Fourth and Above Owner


Apply One Hot Encoder

In [4]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' removes the first category of each feature, so a feature with
# k categories yields k-1 columns.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in 1.4; False makes transform() return a dense
# NumPy array instead of a SciPy sparse matrix.
ohe = OneHotEncoder(drop='first', sparse_output=False, dtype=np.int32)

In [5]:
# Fit the encoder on the training split's 'Fuel' and 'Owner' columns and encode them in one step.
X_train_ohe = ohe.fit_transform(X_train[['Fuel', 'Owner']])
X_train_ohe



array([[0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0],
       [0,

In [9]:
# Encode the test split with the already-fitted encoder (transform only, no re-fit).
X_test_ohe = ohe.transform(X_test[['Fuel', 'Owner']])
X_test_ohe

array([[0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 1, 0]])

Append 'Brand' and 'KMs Driven' with 'X_train_ohe'

In [45]:
# Place the untouched 'Brand' and 'KMs Driven' columns side by side with the encoded columns.
passthrough_values = X_train[['Brand', 'KMs Driven']].values
np.hstack((passthrough_values, X_train_ohe))

array([['Suzuki', 73000, 0, 0, 0, 0, 0, 0, 1, 0],
       ['Toyota', 130000, 0, 0, 0, 1, 0, 0, 0, 1],
       ['Daihatsu', 100, 0, 0, 0, 0, 0, 0, 1, 0],
       ['Suzuki', 83000, 0, 0, 0, 1, 0, 1, 0, 0],
       ['Suzuki', 48000, 0, 0, 0, 1, 1, 0, 0, 0],
       ['Honda', 170000, 0, 0, 0, 0, 0, 0, 1, 0],
       ['KIA', 123456, 0, 0, 0, 0, 0, 1, 0, 0],
       ['Toyota', 78000, 0, 0, 0, 1, 0, 0, 0, 1],
       ['Daihatsu', 71000, 0, 0, 0, 0, 1, 0, 0, 0],
       ['Toyota', 100000, 0, 0, 0, 0, 0, 0, 0, 1],
       ['Suzuki', 85000, 0, 0, 0, 0, 0, 0, 0, 0],
       ['Suzuki', 90000, 0, 0, 0, 1, 0, 1, 0, 0],
       ['Toyota', 130000, 0, 0, 0, 1, 0, 1, 0, 0],
       ['Daihatsu', 88000, 0, 0, 0, 1, 0, 0, 1, 0],
       ['Other Brands', 38000, 1, 0, 0, 0, 0, 0, 0, 0],
       ['Toyota', 69000, 0, 0, 0, 1, 0, 0, 0, 1],
       ['Suzuki', 98000, 0, 0, 0, 0, 0, 0, 0, 1],
       ['Honda', 19000, 0, 1, 0, 0, 1, 0, 0, 0],
       ['Suzuki', 70, 0, 0, 0, 0, 0, 0, 0, 1],
       ['Suzuki', 1, 0, 0, 0, 0, 0, 0, 0, 1

Group low-frequency brands (count at or below a threshold) into a single category.   Note: the new category, and its dummy column, will be named 'uncommon'.

In [48]:
# Number of rows per brand; value_counts() lists brands from most to least frequent.
counts = df['Brand'].value_counts()
counts

Brand
Suzuki          48
Toyota          26
Honda           11
Daihatsu         7
Mitsubishi       2
Other Brands     2
KIA              1
Nissan           1
BMW              1
Mazda            1
Name: count, dtype: int64

In [59]:
# Frequency cut-off: brands that occur this many times or fewer are treated as rare.
threshold = 10

# Number of distinct brands. Kept as the cell's last expression so Jupyter
# actually displays it (a bare expression earlier in the cell is discarded).
df['Brand'].nunique()

In [60]:
# Brands whose frequency is at or below the threshold; these are the labels
# that will be replaced, despite the variable's name.
is_rare = counts <= threshold
replace_with = counts[is_rare].index
replace_with

Index(['Daihatsu', 'Mitsubishi', 'Other Brands', 'KIA', 'Nissan', 'BMW',
       'Mazda'],
      dtype='object', name='Brand')

In [62]:
# Collapse every rare brand into the single label 'uncommon', then one-hot encode.
brand_grouped = df['Brand'].replace(replace_with, 'uncommon')
# Fixed seed so the 50-row preview is identical on every re-run of the notebook.
pd.get_dummies(brand_grouped).sample(50, random_state=50)

Unnamed: 0,Honda,Suzuki,Toyota,uncommon
33,False,True,False,False
39,False,False,True,False
9,True,False,False,False
55,False,False,True,False
65,False,True,False,False
57,True,False,False,False
29,False,False,False,True
67,False,False,True,False
48,False,False,True,False
53,False,True,False,False
