In [1]:
!pip install --quiet umap-learn

In [2]:
import pandas as pd

# Name of the label column; reused by later cells.
TARGET = 'class'
# Kaggle input path of the car-evaluation CSV.
CARS = '/kaggle/input/car-evaluation-classification/cars.csv'

# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like
# a row index saved into the CSV; index_col=0 would absorb it. Confirm first.
df = pd.read_csv(CARS)
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


Is our target class balanced?

In [3]:
# Tally the rows per class label and show the tally as a plain dict.
class_counts = df[TARGET].value_counts()
class_counts.to_dict()

{'unacc': 1210, 'acc': 384, 'good': 69, 'vgood': 65}

Most cars are unacceptable; we will probably have a hard time predicting good or very good cars reliably.

In [4]:
# One-hot encode the six categorical features; columns that are not listed
# here ('Unnamed: 0' and the target) pass through unchanged.
data_df = pd.get_dummies(data=df, columns=['doors', 'persons', 'buying', 'maint', 'lug_boot', 'safety'])
# The model features are exactly the indicator columns get_dummies created,
# i.e. the columns of data_df that do not exist in the raw frame. Deriving the
# list (instead of hand-typing all 21 names) keeps it in sync with whatever
# categories are actually present in the data, in the same column order.
COLUMNS = [column for column in data_df.columns if column not in df.columns]
data_df.head()

Unnamed: 0,class,doors_2,doors_3,doors_4,doors_5more,persons_2,persons_4,persons_more,buying_high,buying_low,...,maint_high,maint_low,maint_med,maint_vhigh,lug_boot_big,lug_boot_med,lug_boot_small,safety_high,safety_low,safety_med
0,unacc,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,False,True,False
1,unacc,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,False,False,True
2,unacc,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,True,True,False,False
3,unacc,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,True,False,False,True,False
4,unacc,True,False,False,False,True,False,False,False,False,...,False,False,False,True,False,True,False,False,False,True


In [5]:
import arrow
from umap import UMAP

# Embed the one-hot features with UMAP and report how long the step took.
time_start = arrow.now()
reducer = UMAP(n_epochs=201, low_memory=False, n_jobs=1, verbose=False, random_state=2024)
embedding = reducer.fit_transform(X=data_df[COLUMNS])
# Store the two embedding coordinates next to the features for plotting.
data_df[['x', 'y']] = embedding
elapsed = arrow.now() - time_start
print('done with UMAP in {}'.format(elapsed))

done with UMAP in 0:00:14.892960


In [6]:
from plotly import express
# Scatter the UMAP coordinates, coloured by the target class.
embedding_fig = express.scatter(
    data_frame=data_df,
    x='x',
    y='y',
    color=TARGET,
    height=600,
)
embedding_fig

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows, stratified on the target so every class keeps its
# share in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data_df[COLUMNS],
    data_df[TARGET],
    test_size=0.2,
    random_state=2024,
    stratify=data_df[TARGET],
)

# Baseline: multinomial logistic regression on the one-hot features.
logreg = LogisticRegression(max_iter=100000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(logreg.n_iter_[0]))

# Predict once and reuse the result for every metric below (the predictions
# were previously recomputed for each of the three metric calls).
logreg_pred = logreg.predict(X=X_test)
print('accuracy: {:5.4f} f1: {:5.4f}'.format(
    accuracy_score(y_true=y_test, y_pred=logreg_pred),
    f1_score(average='weighted', y_true=y_test, y_pred=logreg_pred, zero_division=0),
))
print(classification_report(y_true=y_test, y_pred=logreg_pred, zero_division=0))

model fit in 86 iterations
accuracy: 0.9133 f1: 0.9062
              precision    recall  f1-score   support

         acc       0.83      0.83      0.83        77
        good       0.50      0.21      0.30        14
       unacc       0.96      0.98      0.97       242
       vgood       0.75      0.92      0.83        13

    accuracy                           0.91       346
   macro avg       0.76      0.74      0.73       346
weighted avg       0.90      0.91      0.91       346



Our accuracy/F1 is good, but this is driven mostly by the fact that we predict unacceptable cars well.

In [8]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers keyed by display name; dict order is the print order.
# The two commented-out entries were already disabled in this cell.
MODEL = {
#     'QDA': QuadraticDiscriminantAnalysis(),
    'Naive Bayes': GaussianNB(),
    '3 Nearest Neighbors': KNeighborsClassifier(n_neighbors=3),
    'Linear SVM': SVC(C=0.025, kernel='linear', random_state=2024),
#     'Gaussian Process':GaussianProcessClassifier(1.0 * RBF(1.0), random_state=2024),
    'Decision Tree': DecisionTreeClassifier(random_state=2024, max_depth=5),
    '10 estimator Random Forest': RandomForestClassifier(
        n_estimators=10, max_depth=5, max_features=1, random_state=2024,
    ),
    'Neural Net': MLPClassifier(max_iter=1000, alpha=1, random_state=2024),
    'AdaBoost': AdaBoostClassifier(random_state=2024, algorithm='SAMME'),
    'RBF SVM': SVC(C=1, gamma=2, random_state=2024),
}

# Fit every candidate on the training split, score it on the held-out split,
# and print "<score> <elapsed> <name>"; the elapsed time covers fit, predict
# and scoring together.
# NOTE(review): with average='micro' the F1 of a single-label multiclass
# problem is the same number as accuracy, so these scores are not directly
# comparable with the weighted F1 printed for the logistic regression.
# Confirm which metric is intended.
for name in MODEL:
    time_start = arrow.now()
    clf = MODEL[name]
    clf.fit(X=X_train, y=y_train)
    predictions = clf.predict(X=X_test)
    score = f1_score(y_true=y_test, y_pred=predictions, average='micro')
    elapsed = arrow.now() - time_start
    print('{:5.4f} {} {}'.format(score, elapsed, name))

0.8035 0:00:00.012859 Naive Bayes
0.8468 0:00:00.086043 3 Nearest Neighbors
0.8584 0:00:00.085004 Linear SVM
0.8671 0:00:00.014688 Decision Tree
0.7139 0:00:00.040250 10 estimator Random Forest
0.9740 0:00:03.945490 Neural Net
0.8208 0:00:00.301684 AdaBoost
0.6994 0:00:00.255730 RBF SVM
