In [176]:
import pandas as pd
import numpy as np

In [177]:
# Read cars.csv into a DataFrame; the path is relative, so the file must sit
# in the notebook's working directory.
df = pd.read_csv('cars.csv')

In [178]:
# Preview the first rows (head() shows five rows by default).
df.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [179]:
# Count how many rows fall into each `owner` category, most frequent first.
df['owner'].value_counts()

owner
First Owner             5289
Second Owner            2105
Third Owner              555
Fourth & Above Owner     174
Test Drive Car             5
Name: count, dtype: int64

# 1. One Hot Encoding Using Pandas

In [180]:
# One indicator column per category of `fuel` and `owner`; all K levels are
# kept (drop_first=False is the pandas default, spelled out here for contrast
# with the K-1 variant shown in the next section).
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=False)

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


<h3>K-1 OneHotEncoding</h3>

In [181]:
# K-1 encoding: the first level of each encoded column is dropped, so that
# level is represented by all of its sibling indicators being False.
pd.get_dummies(
    data=df,
    columns=['fuel', 'owner'],
    drop_first=True,
)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


# 2. One Hot Encoding Using Sklearn

In [182]:
from sklearn.model_selection import train_test_split

# Features are selected by name instead of by position so the split does not
# silently pick up the wrong columns if the column order of `df` changes.
# random_state pins the shuffle: without it every "Restart & Run All" yields
# a different train/test partition and different outputs in the cells below.
X_train, X_test, y_train, y_test = train_test_split(
    df[['brand', 'km_driven', 'fuel', 'owner']],
    df['selling_price'],
    random_state=42,
)

In [183]:
# Peek at the training features; the row labels are the original `df` index
# values, which are no longer in order after the shuffled split.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
8007,Hyundai,90000,Diesel,Second Owner
4479,Hyundai,43000,Petrol,Second Owner
2277,Maruti,112880,Diesel,First Owner
5865,Maruti,70000,Petrol,Third Owner
6800,Mahindra,18000,Diesel,First Owner


In [184]:
from sklearn.preprocessing import OneHotEncoder

In [185]:
ohe = OneHotEncoder(
    drop='first',         # keep K-1 indicator columns per feature
    sparse_output=False,  # return a dense ndarray rather than a sparse matrix
    dtype=np.int32,       # emit the indicators as 32-bit integers
)

**Note:** `drop='first'` drops the first category of each feature to avoid multicollinearity (the dummy variable trap): with K categories, K-1 indicator columns already determine the K-th.

**Note:** `sparse_output=False` makes the encoder return a dense NumPy array instead of a sparse matrix, which is easier to inspect and to combine with the other columns in later steps.

In [186]:
# Learn the category levels from the training split only, then apply the
# same fitted mapping to the test split so both share identical columns.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

In [187]:
# (rows, encoded columns). With drop='first', each encoded feature contributes
# one column fewer than the number of categories seen during fit.
X_train_new.shape

(6096, 7)

In [188]:
# The encoder was created with sparse_output=False, so a dense ndarray is
# expected here rather than a sparse matrix.
type(X_train_new)

numpy.ndarray

In [189]:
# OneHotEncoder was built with sparse_output=False, so `transform` already
# returned a dense NumPy array and no conversion step is needed here.
# State that invariant explicitly instead of assigning the variable to itself.
assert isinstance(X_train_new, np.ndarray)

In [190]:
# Re-inspect the container type of the encoded training features.
type(X_train_new)

numpy.ndarray

In [191]:
# Place the untouched columns and the encoded columns side by side
# (column-wise) and check the shape of the combined array.
np.concatenate(
    [X_train[['brand', 'km_driven']].to_numpy(), X_train_new],
    axis=1,
).shape

(6096, 9)

<h4> &rarr; OneHotEncoding with top Categories</h4>

In [192]:
# Number of rows per brand; used below to decide which brands count as rare.
counts = df['brand'].value_counts()

In [193]:
# Brands with at most this many rows are treated as rare and are pooled
# into a single bucket further down.
threshold = 100

# Number of distinct brands. Kept as the last expression so the notebook
# actually displays it; previously the value was computed and then discarded
# because an assignment followed it in the same cell.
df['brand'].nunique()

In [194]:
# Brand labels whose row count does not exceed the threshold.
to_replace = counts.loc[counts.le(threshold)].index

In [202]:
# Collapse every rare brand into the single label 'others', then one-hot
# encode the resulting brand column.
pd.get_dummies(df['brand'].mask(df['brand'].isin(to_replace), 'others'))

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,others
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
