<a href="https://colab.research.google.com/github/nramelia2/DATA-SCIENTIST/blob/main/7%20ONE%20HOT%20ENCODING/one_hot_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Prepare Data

In [None]:
#Import library
import random
import pandas as pd
import numpy as np

In [None]:
# Load the raw data file; it has no header row, so columns get integer labels
data = pd.read_csv(filepath_or_buffer='car.data', header=None)

In [None]:
# Build the list of feature names C1..C26 (range(1, 27) stops at 26)

varnames = ['C{}'.format(idx) for idx in range(1, 27)]

# The raw data has no feature titles, so use the generated names
# as the column labels
data.columns = varnames

# Turn the '?' placeholder into a proper missing value (NaN)
data = data.replace(to_replace='?', value=np.nan)

In [None]:
# Cast columns C10-C13, C19-C21 and C26 to float, one column at a time
for float_col in ('C10', 'C11', 'C12', 'C13', 'C19', 'C20', 'C21', 'C26'):
    data[float_col] = data[float_col].astype('float')

In [None]:
# Map the text values of C4 to a binary code: 'gas' -> 1, 'diesel' -> 0.
# Any value not listed in the mapping becomes NaN.
fuel_codes = {'gas': 1, 'diesel': 0}
data['C4'] = data['C4'].map(fuel_codes)

In [None]:
# Split the column names into two lists in a single pass:
# object-dtype columns are treated as categorical, all others as numeric
cat_cols, num_cols = [], []
for col in data.columns:
    if data[col].dtypes == 'O':
        cat_cols.append(col)
    else:
        num_cols.append(col)

In [None]:
# Impute the missing values:
#   - numeric columns are filled with 0
#   - categorical columns are filled with the label 'Missing'
numeric_filled = data[num_cols].fillna(0)
categorical_filled = data[cat_cols].fillna('Missing')
data[num_cols] = numeric_filled
data[cat_cols] = categorical_filled

# Write the transformed data to a CSV file (without the row index)
data.to_csv(path_or_buf='car.csv', index=False)


Data Labeling Using the One-Hot Encoding Technique

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Load the transformed data written to 'car.csv' and preview the first 5 rows
data = pd.read_csv(filepath_or_buffer='car.csv')
data.head(n=5)

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,3,Missing,alfa-romero,1,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,3,Missing,alfa-romero,1,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,1,Missing,alfa-romero,1,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,2,164,audi,1,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,2,164,audi,1,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [None]:
# Split the data into training and testing sets, holding out 30% for testing;
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['C4']),  # features: every column except C4
    data['C4'],                 # target: column C4
    test_size=0.3,
    random_state=0,
)

In [None]:
# Check which distinct values occur in column C2 of the training set
X_train.loc[:, 'C2'].unique()

array(['85', '115', '150', '128', '125', '98', '91', '103', '113', '94',
       '104', '83', '102', '161', '122', '89', 'Missing', '197', '148',
       '95', '129', '74', '192', '87', '168', '164', '119', '93', '188',
       '77', '118', '256', '81', '106', '158', '154', '65', '108', '110',
       '107', '194', '101', '137', '134', '90', '145', '142', '153', '78'],
      dtype=object)

In [None]:
# One-hot encode column C2 of the training set; drop_first=True leaves out
# the first category, giving k-1 dummy columns
tmp = pd.get_dummies(data=X_train['C2'], drop_first=True)

# Show the first 5 rows of the encoded result
tmp.head(n=5)

Unnamed: 0,102,103,104,106,107,108,110,113,115,118,...,85,87,89,90,91,93,94,95,98,Missing
40,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
60,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
56,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
86,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# To encode several categorical columns in one go, first collect the names
# of the columns to encode in a list
vars_categorical = ['C' + str(idx) for idx in (3, 5, 6, 7, 8, 9, 15, 16, 18)]

In [None]:
# Encode the training and testing datasets on the previously defined features/columns.
# The pandas get_dummies() function captures the variable names in the new column labels.
X_train_enc = pd.get_dummies(X_train[vars_categorical], drop_first=True)

# get_dummies() only knows the categories present in the frame it receives, so
# encoding the test set on its own (with drop_first=True) can produce different
# columns, and a different dropped baseline, than the training set.
# Instead, encode every test category and align the result to the training
# layout: categories seen only in training become all-zero columns, while the
# training baseline and categories seen only in testing are dropped.
X_test_enc = (
    pd.get_dummies(X_test[vars_categorical])
    .reindex(columns=X_train_enc.columns, fill_value=0)
    # keep the same dummy dtype as the training frame for the added columns
    .astype(X_train_enc.dtypes.to_dict())
)

In [None]:
# Preview the first 5 rows of the encoded training set
X_train_enc.head(n=5)

Unnamed: 0,C3_audi,C3_bmw,C3_chevrolet,C3_dodge,C3_honda,C3_isuzu,C3_jaguar,C3_mazda,C3_mercedes-benz,C3_mitsubishi,...,C16_six,C16_twelve,C16_two,C18_2bbl,C18_4bbl,C18_idi,C18_mfi,C18_mpfi,C18_spdi,C18_spfi
40,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
60,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
56,0,0,0,0,0,0,0,1,0,0,...,0,0,1,0,1,0,0,0,0,0
101,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
86,0,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


One-Hot Encoding with Feature-engine (scikit-learn-compatible)

In [None]:
pip install feature_engine

Note: you may need to restart the kernel to use updated packages.


In [None]:
pip install feature-engine==0.3.0

Note: you may need to restart the kernel to use updated packages.


In [None]:
from feature_engine.categorical_encoders import OneHotCategoricalEncoder

In [None]:
# Configure the encoder to produce k-1 dummy variables per categorical
# variable (drop_last=True); no limit on top categories is set
# (top_categories=None)
ohe_enc = OneHotCategoricalEncoder(
    drop_last=True,
    top_categories=None,
)

In [None]:
# Fit the encoder on the training set only.
# No `variables` argument was given when the encoder was created; the recorded
# output of this cell lists C2, C22 and C23 among the encoded variables,
# presumably because they are still text (object) columns at this point.
# NOTE(review): confirm that these numeric-looking columns should really be
# one-hot encoded.
ohe_enc.fit(X_train)

OneHotCategoricalEncoder(drop_last=True,
                         variables=['C2', 'C3', 'C5', 'C6', 'C7', 'C8', 'C9',
                                    'C15', 'C16', 'C18', 'C22', 'C23'])

In [None]:
# Apply the fitted encoder to the training set first, then to the testing set
X_train_enc, X_test_enc = (
    ohe_enc.transform(subset) for subset in (X_train, X_test)
)

# Preview the first 5 rows of the original (un-encoded) training set for comparison
X_train.head(n=5)

Unnamed: 0,C1,C2,C3,C5,C6,C7,C8,C9,C10,C11,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
40,0,85,honda,std,four,sedan,fwd,front,96.5,175.4,...,110,1bbl,3.15,3.58,9.0,86,5800,27,33,10295.0
60,0,115,mazda,std,four,sedan,fwd,front,98.8,177.8,...,122,2bbl,3.39,3.39,8.6,84,4800,26,32,8495.0
56,3,150,mazda,std,two,hatchback,rwd,front,95.3,169.0,...,70,4bbl,0.0,0.0,9.4,101,6000,17,23,11845.0
101,0,128,nissan,std,four,sedan,fwd,front,100.4,181.7,...,181,mpfi,3.43,3.27,9.0,152,5200,17,22,13499.0
86,1,125,mitsubishi,std,four,sedan,fwd,front,96.3,172.4,...,122,2bbl,3.35,3.46,8.5,88,5000,25,32,8189.0


In [None]:
# Preview the first 5 rows of the training set after encoding
X_train_enc.head(n=5)

Unnamed: 0,C1,C10,C11,C12,C13,C14,C17,C19,C20,C21,...,C23_5500,C23_5250,C23_5900,C23_4500,C23_5100,C23_4350,C23_4750,C23_6600,C23_4200,C23_5300
40,0,96.5,175.4,62.5,54.1,2372,110,3.15,3.58,9.0,...,0,0,0,0,0,0,0,0,0,0
60,0,98.8,177.8,66.5,55.5,2410,122,3.39,3.39,8.6,...,0,0,0,0,0,0,0,0,0,0
56,3,95.3,169.0,65.7,49.6,2380,70,0.0,0.0,9.4,...,0,0,0,0,0,0,0,0,0,0
101,0,100.4,181.7,66.5,55.1,3095,181,3.43,3.27,9.0,...,0,0,0,0,0,0,0,0,0,0
86,1,96.3,172.4,65.4,51.6,2405,122,3.35,3.46,8.5,...,0,0,0,0,0,0,0,0,0,0
