In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Column labels supplied by hand and passed to read_csv via names=.
column_names = [
    "buying",
    "maint",
    "doors",
    "persons",
    "lug_boot",
    "safety",
    "class",
]

# Location of the raw comma-separated car evaluation data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data"

# header=None is what pandas already assumes when names= is given; it is only spelled out here.
data = pd.read_csv(url, header=None, names=column_names)
data

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc
...,...,...,...,...,...,...,...
1723,low,low,5more,more,med,med,good
1724,low,low,5more,more,med,high,vgood
1725,low,low,5more,more,big,low,unacc
1726,low,low,5more,more,big,med,good


In [34]:
### Exploratory Data Analysis
# Dimensions of the loaded frame as a (rows, columns) tuple.
data.shape

(1728, 7)

In [35]:
# Per-column summary statistics of the loaded frame.
data.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


In [36]:
# Print how often each distinct value occurs, one column at a time.
for col in column_names:
    counts = data[col].value_counts()
    print(counts)

vhigh    432
high     432
med      432
low      432
Name: buying, dtype: int64
vhigh    432
high     432
med      432
low      432
Name: maint, dtype: int64
2        432
3        432
4        432
5more    432
Name: doors, dtype: int64
2       576
4       576
more    576
Name: persons, dtype: int64
small    576
med      576
big      576
Name: lug_boot, dtype: int64
low     576
med     576
high    576
Name: safety, dtype: int64
unacc    1210
acc       384
good       69
vgood      65
Name: class, dtype: int64


In [37]:
# Number of missing entries in each column (isna is the same check as isnull).
data.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [38]:
# Echo the list of column labels passed to read_csv.
column_names

['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

In [39]:
### Feature Engineering
# Features: every column except "buying".
x = data.drop(columns=["buying"])
# Target: the "buying" column on its own.
y = data["buying"]


In [40]:
pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [61]:
import category_encoders as ce

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Columns handed to the encoder (all six feature columns).
categorical_cols = ['maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# NOTE(review): no explicit mapping is passed, so the integer codes presumably
# do not follow the natural low < med < high ranking -- confirm against the
# category_encoders documentation.
encoder = ce.OrdinalEncoder(cols=categorical_cols)

# Fit the encoding on the training rows only, then reuse it for the test rows.
x_train = encoder.fit_transform(x_train)
x_test = encoder.transform(x_test)

# Show the distribution of the encoded values in the training set.
for col in categorical_cols:
    encoded_counts = x_train[col].value_counts()
    print(encoded_counts)


4    357
3    352
1    339
2    334
Name: maint, dtype: int64
4    349
3    346
1    345
2    342
Name: doors, dtype: int64
2    473
1    459
3    450
Name: persons, dtype: int64
2    463
1    461
3    458
Name: lug_boot, dtype: int64
3    469
2    458
1    455
Name: safety, dtype: int64
1    975
2    301
4     58
3     48
Name: class, dtype: int64


In [53]:
from sklearn.tree import DecisionTreeClassifier

# Baseline: a depth-3 Gini tree with a fixed seed; fit() returns the estimator itself.
model = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=42,
).fit(x_train, y_train)

# Per-class precision / recall / F1 on the held-out rows.
predictions = model.predict(x_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

        high       0.28      0.20      0.23        92
         low       0.57      0.28      0.37        83
         med       0.15      0.22      0.18        77
       vhigh       0.37      0.50      0.42        94

    accuracy                           0.30       346
   macro avg       0.34      0.30      0.30       346
weighted avg       0.34      0.30      0.31       346



In [54]:
# accuracy_score is imported here because no other cell in the notebook imports it;
# without this line the final print raises NameError on a fresh kernel.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Candidate tree depths (None = no depth limit) and minimum split sizes to search over.
param_grid = {
    'max_depth': [None, 1, 2, 3, 4, 5, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10]
}

# Seed the estimator so the cross-validated scores, and therefore the selected
# parameters, are the same on every run.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_

print("Best parameters:", best_params)

# GridSearchCV refits the winning configuration on the full training set by
# default (refit=True), so the fitted estimator is reused instead of training
# a second, unseeded tree.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(x_test)

print("Accuracy:", accuracy_score(y_test, y_pred))


Best parameters: {'max_depth': 3, 'min_samples_split': 2}
Accuracy: 0.30346820809248554


In [65]:
# A single hypothetical car, described by the same six feature columns the
# encoder was fitted on; columns= keeps them in categorical_cols order.
new_data = pd.DataFrame(
    {
        "maint": ["high"],
        "doors": ["4"],
        "persons": ["2"],
        "lug_boot": ["big"],
        "safety": ["high"],
        "class": ["good"],
    },
    columns=categorical_cols,
)

# Apply the already-fitted encoding before handing the row to the tree.
new_data_transformed = encoder.transform(new_data)

predicted_buying_price = model.predict(new_data_transformed)

# Wrap the prediction array in a Series for display.
predicted_series = pd.Series(predicted_buying_price)
print(predicted_series)

0    low
dtype: object
