In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest, chi2


In [2]:
# Read the data file; column names are supplied explicitly via `names`.
headers = [
    'buying', 'maint', 'doors', 'persons',
    'lug_boot', 'safety', 'class value',
]
df = pd.read_csv(r'dataset/car.data', names=headers)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class value
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# Data Exploration

In [3]:
# Structural overview of the frame: 1728 records in total, no null values,
# and every column is stored with the object dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   buying       1728 non-null   object
 1   maint        1728 non-null   object
 2   doors        1728 non-null   object
 3   persons      1728 non-null   object
 4   lug_boot     1728 non-null   object
 5   safety       1728 non-null   object
 6   class value  1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [4]:
# Check every column for class imbalance and invalid values.
# All features and the prediction target ('buying') are evenly distributed;
# only 'class value' is skewed.

for h in headers:
    print(f"-------{h}----------")
    print(df[h].value_counts())
    print("\n")


-------buying----------
vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64


-------maint----------
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64


-------doors----------
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64


-------persons----------
2       576
4       576
more    576
Name: persons, dtype: int64


-------lug_boot----------
small    576
med      576
big      576
Name: lug_boot, dtype: int64


-------safety----------
low     576
med     576
high    576
Name: safety, dtype: int64


-------class value----------
unacc    1210
acc       384
good       69
vgood      65
Name: class value, dtype: int64




In [5]:
# Same summary as the earlier df.info() call; the frame has not been modified in between.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   buying       1728 non-null   object
 1   maint        1728 non-null   object
 2   doors        1728 non-null   object
 3   persons      1728 non-null   object
 4   lug_boot     1728 non-null   object
 5   safety       1728 non-null   object
 6   class value  1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [6]:
# Show the column labels of the frame.
df.columns

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety',
       'class value'],
      dtype='object')

# Data Preparation

In [7]:
# One-hot encode the feature variables, then keep a subset of the dummy columns.
# One level per feature is left out to reduce the number of variables (curse of
# dimensionality): e.g. a record that is neither 'lug_boot_big' nor
# 'lug_boot_med' is naturally 'lug_boot_small'.
X = pd.get_dummies(
    df[['maint', 'doors', 'lug_boot', 'safety', 'class value']]
).loc[:, [
    'maint_high', 'maint_low', 'maint_med',
    'doors_2', 'doors_3', 'doors_4',
    'lug_boot_big', 'lug_boot_med',
    'safety_high', 'safety_low',
    'class value_acc', 'class value_good', 'class value_unacc',
]]

In [8]:
#encode target variable
# LabelEncoder expects a 1-D array of labels, so pass the 'buying' Series
# rather than a one-column DataFrame (which triggers a DataConversionWarning).
le = preprocessing.LabelEncoder()
y = le.fit_transform(df['buying'])
y

  y = column_or_1d(y, warn=True)


array([3, 3, 3, ..., 1, 1, 1])

In [9]:
# Train-test split: hold out 30% of the records, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=32,
)

# K Nearest Neighbour

In [10]:
# Fit a k-nearest-neighbours classifier (k = 20) and report accuracy on both splits
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

print("train accuracy is: ", knn.score(X_train, y_train))
print("test accuracy is: ", knn.score(X_test, y_test))


train accuracy is:  0.3680727874276261
test accuracy is:  0.2524084778420039


# SVM

In [11]:
from sklearn.svm import SVC

# Fit a support-vector classifier and report accuracy on both splits.
# NOTE: despite the variable name, this model uses a polynomial kernel, not a
# linear one; the name is kept so any other reference to it keeps working.
svm_model_linear = SVC(kernel='poly', C=0.1).fit(X_train, y_train)

print("train accuracy is: ", svm_model_linear.score(X_train, y_train))
print("test accuracy is: ", svm_model_linear.score(X_test, y_test))

train accuracy is:  0.37220843672456577
test accuracy is:  0.23699421965317918


# Decision Tree

In [12]:
# Train a shallow (depth-3) DecisionTreeClassifier and report accuracy on both splits
from sklearn.tree import DecisionTreeClassifier

tree_classifier = DecisionTreeClassifier(
    max_depth=3,
    criterion='gini',
    random_state=42,
).fit(X_train, y_train)

print("train accuracy is: ", tree_classifier.score(X_train, y_train))
print("test accuracy is: ", tree_classifier.score(X_test, y_test))

train accuracy is:  0.32754342431761785
test accuracy is:  0.3159922928709056


# Comment and evaluation of models

1) The decision tree offers the best accuracy on the test set (31.6%) and will therefore be used to predict the real data.

2) F1, precision and recall are not used here because the dataset is well balanced: the target variable has the same number of records in each class.

3) The probability of randomly guessing 'vhigh' as the price of a car is 25%, while our best model's accuracy (31%) is only slightly better than guessing. It seems the condition of a car can hardly be used to predict its price. We can also say: historically, second-hand car buyers did a bad job of valuing the vehicle.




# Actual prediction

In [13]:
#Load actual X variables
real_x = pd.DataFrame([['high','4', 'big', 'high', 'good']],
                   columns=['maint', 'doors', 'lug_boot', 'safety','class value'])

# Preprocess like the training features: one-hot encode, then align to the
# training columns. reindex adds any dummy column absent from this single
# record (filled with 0), drops any column the model was not trained on, and
# enforces the training column order, so the layout cannot drift from X.
real_x = pd.get_dummies(real_x).reindex(columns=X.columns, fill_value=0)

# Predict real_x variables with decision tree model

In [16]:
# Predict the encoded label, then map it back to the original 'buying' category
encoded_pred = tree_classifier.predict(real_x)
real_y = le.inverse_transform(encoded_pred)

print("the predicted label is:", real_y[0])

the predicted label is: low


given:
    
Maintenance = High
Number of doors = 4
Lug Boot Size = Big
Safety = High
Class Value = Good


the predicted price is low