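One-Hot Encoding - for nominal categories, i.e. categories with no inherent order (e.g. fuel type)  
Ordinal Encoding - for categories with a natural order (e.g. Poor < Average < Good)  
Label Encoding - for the target column only; input features should use one of the two encoders above  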

In [118]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder

In [119]:
# Read the customer dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../datasets/customer.csv')
# Preview one randomly drawn row. No seed is set, so the row differs between runs.
df.sample(n=1)

Unnamed: 0,age,gender,review,education,purchased
49,25,Female,Good,UG,No


In [120]:
# Keep only the two categorical features and the target.
# Selecting by column name (instead of slicing off the first two positions) keeps
# this cell idempotent: re-running it yields the same frame rather than stripping
# two more columns on every execution.
df = df[['review', 'education', 'purchased']]
# Preview one randomly drawn row (unseeded, so it differs between runs).
df.sample()

Unnamed: 0,review,education,purchased
28,Poor,School,No


In [121]:
# Hold out 25% of the rows for testing.
# Features are the first two columns (review, education); the target is the last one.
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :2],
    df.iloc[:, -1],
    test_size=0.25,
    random_state=99,  # fixed seed keeps the split reproducible
)
# Sanity-check the resulting split sizes.
(x_train.shape, x_test.shape)

((37, 2), (13, 2))

In [122]:
# One category list per input column, each written from lowest to highest rank,
# so the integer codes follow that order (first entry -> 0, second -> 1, ...).
oe = OrdinalEncoder(
    categories=[
        ['Poor', 'Average', 'Good'],  # review
        ['School', 'UG', 'PG'],       # education
    ]
)
# Fit on the training split only; the fitted encoder is then reused for the test split.
oe.fit(x_train)

0,1,2
,categories,"[['Poor', 'Average', ...], ['School', 'UG', ...]]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,


In [123]:
# Encode both splits with the encoder fitted on the training data.
# The results go into new names so the raw x_train / x_test stay intact and this
# cell can be re-run safely: transforming in place would feed already-encoded
# numbers back into an encoder fitted on string categories, which fails.
x_train_enc = oe.transform(x_train)
x_test_enc = oe.transform(x_test)

In [124]:
# LabelEncoder is used for the target column; fit it on the training labels only.
le = LabelEncoder()
le.fit(y_train)

In [125]:
# Inspect the classes learned during fit; a label's position in this array is its integer code.
le.classes_

array(['No', 'Yes'], dtype=object)

In [126]:
# Encode the target of both splits with the encoder fitted on the training labels.
# Writing to new names keeps the raw y_train / y_test intact so the cell can be
# re-run: transforming in place would pass integer codes back to an encoder that
# only knows the original string labels, which fails on the second run.
y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

In [127]:
# Switch to the used-cars dataset for the one-hot encoding examples.
# Note that this rebinds `df`, replacing the customer frame loaded earlier.
df = pd.read_csv('../datasets/cars.csv')
# Preview three randomly drawn rows (unseeded, so they differ between runs).
df.sample(n=3)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
6820,Toyota,23000,Diesel,First Owner,1750000
4164,Mahindra,110000,Diesel,First Owner,310000
5796,Volkswagen,120000,Petrol,Second Owner,300000


In [128]:
# Count distinct brands to gauge how many columns one-hot encoding 'brand' would add.
df['brand'].nunique()

32

In [129]:
# Frequency of each fuel type, most common first.
df['fuel'].value_counts()

fuel
Diesel    4402
Petrol    3631
CNG         57
LPG         38
Name: count, dtype: int64

In [130]:
# Frequency of each owner category, most common first.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

In [131]:
# One-hot encode 'fuel' and 'owner' with pandas: each category becomes its own
# indicator column while the remaining columns pass through unchanged.
# The result is only displayed here, not stored.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


In [132]:
# Same encoding, but drop_first=True omits the first category of each encoded
# column (here fuel_CNG and owner_First Owner), leaving k-1 indicators per feature.
# The dropped category is implied when all remaining indicators are False.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [133]:
# Hold out 25% of the cars for testing.
# Features are the first four columns (brand, km_driven, fuel, owner);
# the target is the last column (selling_price).
x_train, x_test, y_train, y_test = train_test_split(
    df.iloc[:, :4],
    df.iloc[:, -1],
    test_size=0.25,
    random_state=99,  # fixed seed keeps the split reproducible
)

In [134]:
# drop='first' omits the first category of each feature from the encoded output.
ohe = OneHotEncoder(drop='first')
# Learn the categories from the training split only.
ohe.fit(x_train[['fuel', 'owner']])
# Encode both splits with the fitted encoder; toarray() densifies the encoder's
# output so it can be stacked with the other columns later.
x_train_new = ohe.transform(x_train[['fuel', 'owner']]).toarray()
x_test_new = ohe.transform(x_test[['fuel', 'owner']]).toarray()

In [135]:
# Place the untouched columns next to the one-hot columns and check the combined
# shape (rows, 2 passthrough columns + encoded columns).
np.hstack([x_train[['brand', 'km_driven']].to_numpy(), x_train_new]).shape

(6096, 9)

In [138]:
# Number of rows per brand, used to find rarely occurring brands.
counts = df['brand'].value_counts()
# Brands with this many rows or fewer are treated as rare in the cells that follow.
threshold = 100

In [139]:
# Names of the brands whose row count is at or below the threshold.
repl = counts.loc[counts.le(threshold)].index

In [140]:
# Collapse every rare brand into a single 'uncommon' label before one-hot encoding,
# so the rare brands share one indicator column instead of getting one each.
# Then preview five random rows (unseeded, so they differ between runs).
(
    pd.get_dummies(df['brand'].replace(to_replace=repl, value='uncommon'))
    .sample(n=5)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
8052,False,False,False,True,False,False,False,False,False,False,False,False,False
3317,False,False,False,False,False,False,True,False,False,False,False,False,False
2232,False,False,False,False,False,False,True,False,False,False,False,False,False
2317,False,False,False,False,True,False,False,False,False,False,False,False,False
3327,False,False,False,False,False,False,True,False,False,False,False,False,False
