In [25]:
# importing packages

import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

from collections import Counter

import pickle

In [26]:
# load the raw table; the file carries no header row, so names are attached here

column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']
df = (
    pd.read_csv('../data/car.data', header=None)
    .set_axis(column_names, axis=1)  # raises if the column count does not match
)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [27]:
# count missing values per column (all zeros means nothing to impute)

missing_per_column = df.isna().sum()
missing_per_column

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [28]:
# hold out 20% of the rows as the test set
# stratifying on 'buying' keeps its class proportions equal in both splits;
# the fixed random_state makes the split reproducible

train_set, test_set = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['buying'],
)

In [29]:
# fit the encoders on the train split only, then encode it
# features: every column except 'buying' (one-hot); target: 'buying' (integer labels)

X_train = train_set.drop(columns=['buying'])
y_train = train_set['buying'].reset_index(drop=True)

# NOTE(review): newer scikit-learn renamed `sparse` to `sparse_output` -- confirm
# against the pinned version before upgrading
enc = OneHotEncoder(sparse=False)
X_train_encoded = enc.fit_transform(X_train)
X_train_transformed = pd.DataFrame(
    X_train_encoded,
    columns=enc.get_feature_names_out(X_train.columns.values),
)

yenc = LabelEncoder()
y_train_encoded = yenc.fit_transform(y_train)
y_train_transformed = pd.DataFrame(y_train_encoded, columns=['buying'])

In [30]:
# transform test data with the encoders that were fitted on the train data
# (nothing is fitted in this cell, so no information from the test split leaks in)

X_test = test_set.drop(['buying'], axis = 1)
y_test = test_set['buying'].reset_index(drop=True)

# one-hot encode the features with the already-fitted encoder
X_test_transformed = enc.transform(X_test)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=enc.get_feature_names_out(X_train.columns.values))

# use transform, not fit_transform: refitting here would rebuild the label mapping
# from the test labels and leave `yenc` fitted on test data instead of train data;
# transform raises ValueError if the test split contains a label unseen in training
y_test_transformed = yenc.transform(y_test)
y_test_transformed = pd.DataFrame(y_test_transformed, columns=['buying'])

In [31]:
# report how many train rows fall into each target class, with percentages

counter = Counter(y_train)
n_train = len(y_train)
for label, count in counter.items():
    share = count / n_train * 100
    print('Class=%s, n=%d (%.3f%%)' % (label, count, share))

# note: the classes are close to evenly represented, so no resampling is needed

Class=low, n=346 (25.036%)
Class=vhigh, n=346 (25.036%)
Class=med, n=345 (24.964%)
Class=high, n=345 (24.964%)


In [32]:
# save the fitted encoders to disk (feature one-hot encoder and target label encoder)
# `with` closes each file handle as soon as the dump finishes, so the pickles are
# fully flushed and no descriptors are left open in the kernel

filename = '../model/oneHotEncTransformer.sav'
with open(filename, 'wb') as f:
    pickle.dump(enc, f)

filename = '../model/colEncTransformer.sav'
with open(filename, 'wb') as f:
    pickle.dump(yenc, f)

In [33]:
# display the train target: the raw 'buying' labels, re-indexed from 0 after the split
y_train

0         low
1       vhigh
2         low
3         med
4         med
        ...  
1377      low
1378      low
1379    vhigh
1380     high
1381      med
Name: buying, Length: 1382, dtype: object

In [34]:
# write the encoded splits to disk as CSV, without the index column

csv_outputs = {
    '../data/X_train.csv': X_train_transformed,
    '../data/X_test.csv': X_test_transformed,
    '../data/y_train.csv': y_train_transformed,
    '../data/y_test.csv': y_test_transformed,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)