# One-Hot-Encoding

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the cars dataset from disk, then preview five randomly drawn rows.
df = pd.read_csv(filepath_or_buffer='cars.csv')
df.sample(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
8045,Renault,20171,Diesel,First Owner,750000
5413,Maruti,60000,Petrol,Third Owner,125000
6167,Lexus,20000,Petrol,First Owner,5150000
4137,Maruti,15000,Petrol,First Owner,282000
5355,Jeep,87500,Diesel,First Owner,1550000


In [3]:
# Summarise the frame: row count, per-column non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8128 entries, 0 to 8127
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   brand          8128 non-null   object
 1   km_driven      8128 non-null   int64 
 2   fuel           8128 non-null   object
 3   owner          8128 non-null   object
 4   selling_price  8128 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 317.6+ KB


# Columns Filter
> `brand` is *nominal data* ✅ <br>
> `km_driven` is *numerical* ❌ <br>
> `fuel` is *nominal data* ✅ <br>
> `owner` is *nominal data* ✅ <br>
> `selling_price` is *numerical* ❌ <br>

In [10]:
# Number of distinct values in each column; for a nominal column this is
# how many indicator columns a full one-hot encoding would create.
df.nunique()

Unnamed: 0,0
brand,32
km_driven,921
fuel,4
owner,5
selling_price,677


In [26]:
# How many rows fall into each fuel category, most frequent first.
df['fuel'].value_counts()

Unnamed: 0_level_0,count
fuel,Unnamed: 1_level_1
Diesel,4402
Petrol,3631
CNG,57
LPG,38


In [27]:
# How many rows fall into each owner category, most frequent first.
df['owner'].value_counts()

Unnamed: 0_level_0,count
owner,Unnamed: 1_level_1
First Owner,5289
Second Owner,2105
Third Owner,555
Fourth & Above Owner,174
Test Drive Car,5


# One_Hot_Encoding using Pandas

In [13]:
# Row count per brand, most frequent first.
df['brand'].value_counts()

Unnamed: 0_level_0,count
brand,Unnamed: 1_level_1
Maruti,2448
Hyundai,1415
Mahindra,772
Tata,734
Toyota,488
Honda,467
Ford,397
Chevrolet,230
Renault,228
Volkswagen,186


In [16]:
# Replace 'fuel' and 'owner' with one boolean indicator column per category;
# the remaining columns pass through unchanged.
pd.get_dummies(data=df, columns=['fuel', 'owner'])

Unnamed: 0,brand,km_driven,selling_price,fuel_CNG,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_First Owner,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,False,True,False,False,True,False,False,False,False
1,Skoda,120000,370000,False,True,False,False,False,False,True,False,False
2,Honda,140000,158000,False,False,False,True,False,False,False,False,True
3,Hyundai,127000,225000,False,True,False,False,True,False,False,False,False
4,Maruti,120000,130000,False,False,False,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,False,True,True,False,False,False,False
8124,Hyundai,119000,135000,False,True,False,False,False,True,False,False,False
8125,Maruti,120000,382000,False,True,False,False,True,False,False,False,False
8126,Tata,25000,290000,False,True,False,False,True,False,False,False,False


### Shape calculation
```
total columns    -> 5
fuel categories  -> 4
owner categories -> 5

fuel:  1 column is replaced by 4 dummy columns -> 5 - 1 + 4 = 8 columns
owner: 1 column is replaced by 5 dummy columns -> 8 - 1 + 5 = 12 columns
```

In [28]:
# Confirm the column count after encoding 'fuel' and 'owner' in full.
pd.get_dummies(data=df, columns=['fuel', 'owner']).shape

(8128, 12)



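## K-1 OHE
> Drop the first dummy column of each encoded feature; it is redundant because it is implied whenever all the other dummies of that feature are 0. <br>
> For `fuel`, the dropped column is `fuel_CNG`. <br>
> For `owner`, the dropped column is `owner_First Owner`.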

In [17]:
# K-1 encoding: same as above, but the first category of each encoded
# column is dropped so every feature yields one fewer indicator.
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True)

Unnamed: 0,brand,km_driven,selling_price,fuel_Diesel,fuel_LPG,fuel_Petrol,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,Maruti,145500,450000,True,False,False,False,False,False,False
1,Skoda,120000,370000,True,False,False,False,True,False,False
2,Honda,140000,158000,False,False,True,False,False,False,True
3,Hyundai,127000,225000,True,False,False,False,False,False,False
4,Maruti,120000,130000,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
8123,Hyundai,110000,320000,False,False,True,False,False,False,False
8124,Hyundai,119000,135000,True,False,False,True,False,False,False
8125,Maruti,120000,382000,True,False,False,False,False,False,False
8126,Tata,25000,290000,True,False,False,False,False,False,False


In [29]:
# Confirm the column count after K-1 encoding (one fewer per encoded feature).
pd.get_dummies(data=df, columns=['fuel', 'owner'], drop_first=True).shape

(8128, 10)

# OHE using sklearn

## Train_Test Split

In [7]:
# Features are every column except the target; the target is selling_price.
# Selecting by label (rather than by position) keeps the split correct even
# if columns are added to or reordered in the CSV.
X = df.drop(columns='selling_price')
y = df['selling_price']

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# shuffle deterministic, so the split (and every preview derived from it)
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [9]:
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,brand,km_driven,fuel,owner
7383,Hyundai,60000,Petrol,First Owner
3046,Maruti,80000,Petrol,Fourth & Above Owner
1559,Tata,15000,Petrol,First Owner
1445,Maruti,43978,Diesel,Second Owner
1371,Hyundai,25471,Diesel,First Owner


In [19]:
# Preview the first target values of the held-out test split.
y_test.head()

Unnamed: 0,selling_price
6635,575000
1292,600000
234,300000
1903,660000
397,340000


## fit and transform

In [30]:
from sklearn.preprocessing import OneHotEncoder

# drop='first' removes one indicator per feature (K-1 encoding);
# sparse_output=False returns a dense ndarray; dtype=np.int32 stores 0/1 ints.
# handle_unknown='ignore' makes transform() emit all zeros for a category
# that was never seen during fit instead of raising. Note that with
# drop='first' an all-zero row is also how the dropped category is encoded.
ohe = OneHotEncoder(
    drop='first',
    sparse_output=False,
    dtype=np.int32,
    handle_unknown='ignore',
)

# Learn the category vocabulary from the training split only, so nothing
# about the test rows leaks into preprocessing; then reuse the fitted
# encoder, unchanged, on the test split.
X_train_new = ohe.fit_transform(X_train[['fuel', 'owner']])
X_test_new = ohe.transform(X_test[['fuel', 'owner']])

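### Shape Calculation
```
encoded columns  -> 2 (fuel and owner)
fuel categories  -> 4, minus the dropped first category -> 3 columns
owner categories -> 5, minus the dropped first category -> 4 columns

total columns --> 3 + 4 = 7
```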

In [31]:
# (rows, encoded columns) of the one-hot encoded training array.
X_train_new.shape

(6502, 7)

In [32]:
# (rows, encoded columns) of the one-hot encoded test array.
X_test_new.shape

(1626, 7)

## Merge the main dataframe with these columns

### Convert dataframe to numpy array for merge

In [36]:
# Pull the two columns that were NOT one-hot encoded out of the training
# frame as a NumPy array. Despite its name, ohe_array holds no encoded
# values; it is the left-hand side of the merge performed next.
ohe_array = X_train.loc[:, ['brand', 'km_driven']].to_numpy()
ohe_array

array([['Hyundai', 60000],
       ['Maruti', 80000],
       ['Tata', 15000],
       ...,
       ['Ford', 50000],
       ['Hyundai', 10000],
       ['Honda', 120000]], dtype=object)

### Merge two numpy array

In [37]:
# Append the one-hot columns to the right of the brand/km_driven columns.
# Both arrays come from the same training rows in the same order, so a
# plain column-wise stack lines them up. Because ohe_array is an object
# array (it holds brand strings), the stacked result is object-typed too.
np.hstack((ohe_array, X_train_new))

array([['Hyundai', 60000, 0, ..., 0, 0, 0],
       ['Maruti', 80000, 0, ..., 0, 0, 0],
       ['Tata', 15000, 0, ..., 0, 0, 0],
       ...,
       ['Ford', 50000, 1, ..., 0, 0, 0],
       ['Hyundai', 10000, 0, ..., 0, 0, 0],
       ['Honda', 120000, 1, ..., 0, 0, 0]], dtype=object)

# OHE on Brand (Most Frequent Data)

In [41]:
# Row count per brand; used below to decide which brands are rare.
counts = df['brand'].value_counts()

In [40]:
# Number of distinct brands in the dataset.
df['brand'].nunique()

32

In [46]:
# Brands that occur in at most 100 rows are treated as rare; keep their
# names so they can be replaced by a single bucket in the next step.
rpls = counts.loc[counts.le(100)].index
len(rpls)

20

In [47]:
# Collapse every rare brand into one 'uncommon' label, then one-hot encode
# the result: one indicator per frequent brand plus one for 'uncommon'.
(
    df['brand']
    .replace(to_replace=rpls, value='uncommon')
    .pipe(pd.get_dummies)
)

Unnamed: 0,BMW,Chevrolet,Ford,Honda,Hyundai,Mahindra,Maruti,Renault,Skoda,Tata,Toyota,Volkswagen,uncommon
0,False,False,False,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8123,False,False,False,False,True,False,False,False,False,False,False,False,False
8124,False,False,False,False,True,False,False,False,False,False,False,False,False
8125,False,False,False,False,False,False,True,False,False,False,False,False,False
8126,False,False,False,False,False,False,False,False,False,True,False,False,False
