
# Section 5: Machine Learning
Using the dataset from https://archive.ics.uci.edu/ml/datasets/Car+Evaluation, create a machine learning model to predict the buying price given the following parameters:


*   Maintenance = High
*   Number of doors = 4
*  Lug Boot Size = Big
*  Safety = High
*  Class Value = Good


Note: please also export and submit your notebook as pdf.

In [1]:
import requests
import pandas as pd
import json
from sklearn.model_selection import train_test_split

# Get Data

In [34]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data',header=0,names=['buying','maint','doors','pers','lug_boot','safety','accept'])

In [35]:
df = df.drop('pers',axis=1) # Decision problem doesn't include pers
df.head()

Unnamed: 0,buying,maint,doors,lug_boot,safety,accept
0,vhigh,vhigh,2,small,med,unacc
1,vhigh,vhigh,2,small,high,unacc
2,vhigh,vhigh,2,med,low,unacc
3,vhigh,vhigh,2,med,med,unacc
4,vhigh,vhigh,2,med,high,unacc


# Preprocessing
Use encoders to map labels into numeric values for models to use

In [36]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['buying'] = le.fit_transform(df['buying'])

In [37]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(df[["lug_boot","accept","doors"]])
df[["lug_boot","accept","doors"]] = enc.transform(df[["lug_boot","accept","doors"]])

In [38]:
# One-hot encoding multiple columns
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

transformer = make_column_transformer(
    (OneHotEncoder(), ['safety',"maint"]),
    remainder='passthrough')

transformed = transformer.fit_transform(df)
df = pd.DataFrame(transformed, columns=transformer.get_feature_names_out())
df.head()

Unnamed: 0,onehotencoder__safety_high,onehotencoder__safety_low,onehotencoder__safety_med,onehotencoder__maint_high,onehotencoder__maint_low,onehotencoder__maint_med,onehotencoder__maint_vhigh,remainder__buying,remainder__doors,remainder__lug_boot,remainder__accept
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,2.0,2.0
1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,2.0,2.0
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,2.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,2.0
4,1.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,1.0,2.0


# Split Train-Test Data

In [39]:
X = df.drop("remainder__buying",axis=1)
y = df['remainder__buying']
 
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [40]:
X_train.head()

Unnamed: 0,onehotencoder__safety_high,onehotencoder__safety_low,onehotencoder__safety_med,onehotencoder__maint_high,onehotencoder__maint_low,onehotencoder__maint_med,onehotencoder__maint_vhigh,remainder__doors,remainder__lug_boot,remainder__accept
580,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
1214,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0
1664,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0
661,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
587,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,2.0


# Model selection
Select the best model to be used for prediction

In [41]:
#Baseline score using DummyClassifier
from sklearn.dummy import DummyClassifier

dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

dummy_clf.score(X_test, y_test)

0.24084778420038536

In [42]:
# Using DecisionTree
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=0)

cross_val_score(clf, X_train, y_train, cv=10)

array([0.10743802, 0.18181818, 0.19834711, 0.18181818, 0.14876033,
       0.19834711, 0.1322314 , 0.15702479, 0.09166667, 0.15      ])

In [43]:
# Using AdaBoost
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
clf = AdaBoostClassifier(n_estimators=100)
cross_val_score(clf, X_train, y_train, cv=5)

array([0.34297521, 0.29752066, 0.30165289, 0.26970954, 0.31120332])

# Predict
Predict Buying cost of

*   Maintenance = High
*   Number of doors = 4
*  Lug Boot Size = Big
*  Safety = High
*  Class Value = Good

using AdaBoostClassifier

In [44]:
# create test df
data = {'maint': ['high'], 'doors': ['4'],'lug_boot':['big'],'safety':['high'],'accept' :['good'],'buying':['null']}
df_test = pd.DataFrame.from_dict(data)
df_test

Unnamed: 0,maint,doors,lug_boot,safety,accept,buying
0,high,4,big,high,good,


In [46]:
# Preprocess data
df_test[["lug_boot","accept","doors"]] = enc.transform(df_test[["lug_boot","accept","doors"]])
transformed_test = transformer.transform(df_test)
df_test = pd.DataFrame(transformed_test, columns=transformer.get_feature_names_out())
df_test

In [45]:
# Build Model
clf = AdaBoostClassifier(n_estimators=100)
clf.fit(X,y)

AdaBoostClassifier(n_estimators=100)

In [49]:
res = clf.predict(df_test.drop('remainder__buying',axis=1))
res = le.inverse_transform(res.astype(int))

In [52]:
print(f'Predicted Buying cost = {res}')

Predicted Buying cost = ['low']
