In [1]:
import pandas as pd 
import numpy as np
import os

In [2]:
df = pd.read_csv('./car+evaluation/car.csv')

In [3]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class values
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   buying        1728 non-null   object
 1   maint         1728 non-null   object
 2   doors         1728 non-null   object
 3   persons       1728 non-null   object
 4   lug_boot      1728 non-null   object
 5   safety        1728 non-null   object
 6   class values  1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


---------

# **Transforming Doors Columns**

In [5]:
df['doors'].value_counts()

doors
2        432
3        432
4        432
5more    432
Name: count, dtype: int64

In [6]:
df['doors'] = df['doors'].str.split('m').str[0].astype(int)

In [7]:
df['doors'].value_counts()

doors
2    432
3    432
4    432
5    432
Name: count, dtype: int64

---------

## **Transforming Person Columns**

In [8]:
df['persons'].value_counts()

persons
2       576
4       576
more    576
Name: count, dtype: int64

In [9]:
df['persons'].mask(df['persons'] == 'more', '6', inplace=True)

In [10]:
df['persons'].value_counts()

persons
2    576
4    576
6    576
Name: count, dtype: int64

In [11]:
df['persons'] = df['persons'].astype(int)

------

# **Applying Ordinal Encoding**

In [12]:
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class values
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [13]:
from sklearn.preprocessing import OrdinalEncoder

ordinal = OrdinalEncoder()

In [14]:
encode = ordinal.fit_transform(df[['buying','maint','lug_boot','safety','class values']])

In [15]:
encode

array([[3., 3., 2., 1., 2.],
       [3., 3., 2., 2., 2.],
       [3., 3., 2., 0., 2.],
       ...,
       [1., 1., 0., 1., 2.],
       [1., 1., 0., 2., 1.],
       [1., 1., 0., 0., 3.]])

In [16]:
encode_df =  pd.DataFrame(encode,columns=ordinal.get_feature_names_out())

In [17]:
encode_df

Unnamed: 0,buying,maint,lug_boot,safety,class values
0,3.0,3.0,2.0,1.0,2.0
1,3.0,3.0,2.0,2.0,2.0
2,3.0,3.0,2.0,0.0,2.0
3,3.0,3.0,1.0,1.0,2.0
4,3.0,3.0,1.0,2.0,2.0
...,...,...,...,...,...
1723,1.0,1.0,1.0,2.0,1.0
1724,1.0,1.0,1.0,0.0,3.0
1725,1.0,1.0,0.0,1.0,2.0
1726,1.0,1.0,0.0,2.0,1.0


In [18]:
df = df.drop(['buying','maint','lug_boot','safety','class values'],axis=1)

In [19]:
df = pd.concat([df,encode_df],axis=1)

In [20]:
df.head()

Unnamed: 0,doors,persons,buying,maint,lug_boot,safety,class values
0,2,2,3.0,3.0,2.0,1.0,2.0
1,2,2,3.0,3.0,2.0,2.0,2.0
2,2,2,3.0,3.0,2.0,0.0,2.0
3,2,2,3.0,3.0,1.0,1.0,2.0
4,2,2,3.0,3.0,1.0,2.0,2.0


In [21]:
df['buying'].value_counts()

buying
3.0    432
0.0    432
2.0    432
1.0    432
Name: count, dtype: int64

In [22]:
df['lug_boot'].value_counts()

lug_boot
2.0    576
1.0    576
0.0    576
Name: count, dtype: int64

In [23]:
df['safety'].value_counts()

safety
1.0    576
2.0    576
0.0    576
Name: count, dtype: int64

In [24]:
df['maint'].value_counts()

maint
3.0    432
0.0    432
2.0    432
1.0    432
Name: count, dtype: int64

In [27]:
df['class values'].value_counts()

class values
2.0    1210
0.0     384
1.0      69
3.0      65
Name: count, dtype: int64

## upsampling the class values 

In [31]:
calss_1 = df[df['class values'] == 1.0]
class_0 = df[df['class values'] == 0.0]
class_2 = df[df['class values'] == 2.0]
class_3 = df[df['class values'] == 3.0]

In [32]:
print(class_0.shape)
print(calss_1.shape)
print(class_2.shape)
print(class_3.shape)

(384, 7)
(69, 7)
(1210, 7)
(65, 7)


In [28]:
from sklearn.utils import resample

In [36]:
class1_resample = resample(calss_1, replace=True,
                    n_samples= len(class_2),
                    random_state=42
                    )

In [37]:
class3_resample = resample(class_3, replace=True,
                    n_samples= len(class_2),
                    random_state=42
                    )

In [38]:
class0_resample = resample(class_0, replace=True,
                    n_samples= len(class_2),
                    random_state=42
                    )

In [39]:
class0_resample

Unnamed: 0,doors,persons,buying,maint,lug_boot,safety,class values
638,5,4,0.0,0.0,0.0,0.0,0.0
1468,4,4,1.0,0.0,2.0,2.0,0.0
1145,4,4,2.0,2.0,2.0,0.0,0.0
646,5,6,0.0,0.0,0.0,2.0,0.0
431,5,6,3.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
1201,2,4,2.0,1.0,1.0,2.0,0.0
1657,3,4,1.0,1.0,2.0,2.0,0.0
376,3,6,3.0,1.0,0.0,2.0,0.0
311,5,4,3.0,2.0,1.0,0.0,0.0


In [41]:
frames = [class0_resample,class1_resample,class_2,class3_resample]

final_df = pd.concat(frames)

In [43]:
final_df.shape

(4840, 7)

In [44]:
final_df.head()

Unnamed: 0,doors,persons,buying,maint,lug_boot,safety,class values
638,5,4,0.0,0.0,0.0,0.0,0.0
1468,4,4,1.0,0.0,2.0,2.0,0.0
1145,4,4,2.0,2.0,2.0,0.0,0.0
646,5,6,0.0,0.0,0.0,2.0,0.0
431,5,6,3.0,1.0,0.0,0.0,0.0


In [45]:
final_df['class values'].value_counts()

class values
0.0    1210
1.0    1210
2.0    1210
3.0    1210
Name: count, dtype: int64

In [47]:
final_df.to_csv('./car+evaluation/Car_Evaluation_final.csv',index=False)