In [82]:
import os
import pandas as pd

# Read cars.csv from the current working directory into a DataFrame.
df = pd.read_csv(
    os.path.join(os.getcwd(), 'cars.csv')
)

# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,Car_ID,Brand,Model,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,1,Toyota,Corolla,2018,50000,Petrol,Manual,First,15,1498,108,5,800000
1,2,Honda,Civic,2019,40000,Petrol,Automatic,Second,17,1597,140,5,1000000
2,3,Ford,Mustang,2017,20000,Petrol,Automatic,First,10,4951,395,4,2500000
3,4,Maruti,Swift,2020,30000,Diesel,Manual,Third,23,1248,74,5,600000
4,5,Hyundai,Sonata,2016,60000,Diesel,Automatic,Second,18,1999,194,5,850000


In [83]:
# Target: the Owner_Type column.
y = df['Owner_Type']

# Features: every remaining column except Car_ID and the target itself.
X = df.drop(columns=['Car_ID', 'Owner_Type'])

In [85]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.over_sampling import ADASYN

# Partition the feature columns by dtype: int64/float64 columns are scaled,
# object-dtype columns are one-hot encoded.
numerical_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

# handle_unknown='ignore' keeps transform from raising on categories that
# were not present when the encoder was fitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Display both column lists.
(numerical_features, categorical_features)

(['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Price'],
 ['Brand', 'Model', 'Fuel_Type', 'Transmission'])

In [99]:
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

from imblearn.over_sampling import ADASYN, SMOTE

# Metrics passed to cross_validate for each model; the *_macro variants
# average the per-class scores without weighting by class size.
scoring = [
    'accuracy',
    'precision_macro',
    'recall_macro',
    'f1_macro',
]

In [101]:
# random forest
# Shared preprocessing followed by a random forest with a fixed seed.
pipeline_rf = Pipeline(steps=[
    ('preprocessor_rf', preprocessor),
    ('rf', RandomForestClassifier(random_state=42)),
])

# cross_validate returns a dict of per-fold arrays (timings and one
# test_<metric> entry per requested scorer).
classification_result_rf = cross_validate(
    estimator=pipeline_rf, X=X, y=y, scoring=scoring
)

# Average every per-fold array and print it with four decimals.
mean_rf = {}
for metric, fold_values in classification_result_rf.items():
    mean = np.mean(fold_values)
    mean_rf[metric] = mean
    print(f'{metric}: {mean:.4f}')

fit_time: 0.1987
score_time: 0.0305
test_accuracy: 0.8900
test_precision_macro: 0.8649
test_recall_macro: 0.8667
test_f1_macro: 0.8628


In [44]:
# xgboost
# XGBClassifier expects integer-encoded class labels, so the string target is
# label-encoded first. The encoded labels go into a separate variable instead
# of overwriting `y`: other cells call Series methods on `y` (for example
# `y.value_counts()`), and reassigning `y` to a NumPy array here made a
# top-to-bottom run of the notebook fail.
y_encoded = LabelEncoder().fit_transform(y)

pipeline_XGBoost = Pipeline([
    ('preprocessor_xgboost', preprocessor),
    ('xgboost', XGBClassifier(verbosity=2, random_state=42))
])

classification_result_xgboost = cross_validate(pipeline_XGBoost, X, y_encoded, scoring=scoring)

# Average the per-fold arrays (timings and scores) returned by cross_validate.
mean_xgb = {key: value.mean() for key, value in classification_result_xgboost.items()}

for metric, mean in mean_xgb.items():
    print(f'{metric}: {mean:.4f}')

fit_time: 0.0780
score_time: 0.0099
test_accuracy: 0.8700
test_precision_macro: 0.8460
test_recall_macro: 0.8519
test_f1_macro: 0.8452


In [54]:
# catboost

from catboost import CatBoostClassifier

# Shared preprocessing followed by a seeded CatBoost classifier with its
# training log silenced (verbose=0).
pipeline_catboost = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('catboost', CatBoostClassifier(verbose=0, random_state=42)),
])

classification_result_catboost = cross_validate(
    estimator=pipeline_catboost, X=X, y=y, scoring=scoring
)

# Average every per-fold array and print it with four decimals.
mean_catboost = {}
for metric, fold_values in classification_result_catboost.items():
    mean = fold_values.mean()
    mean_catboost[metric] = mean
    print(f'{metric}: {mean:.4f}')

fit_time: 2.3890
score_time: 0.0348
test_accuracy: 0.8500
test_precision_macro: 0.8347
test_recall_macro: 0.8370
test_f1_macro: 0.8334


In [110]:
# logistic regression (one-vs-rest)

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# The `multi_class='ovr'` argument of LogisticRegression is deprecated in
# recent scikit-learn releases. Wrapping a plain LogisticRegression in
# OneVsRestClassifier keeps the same one-vs-rest scheme (one binary model per
# class) without relying on the deprecated argument.
pipeline_linear = Pipeline([
    ('preprocessor', preprocessor),
    ('linear', OneVsRestClassifier(
        LogisticRegression(random_state=42, max_iter=1000)
    ))
])


classification_result_logistic = cross_validate(pipeline_linear, X, y, scoring=scoring)

# Average the per-fold arrays (timings and scores) returned by cross_validate.
mean_logistic = {key: value.mean() for key, value in classification_result_logistic.items()}

for metric, mean in mean_logistic.items():
    print(f'{metric}: {mean:.4f}')

fit_time: 0.0381
score_time: 0.0082
test_accuracy: 0.8600
test_precision_macro: 0.9008
test_recall_macro: 0.8685
test_f1_macro: 0.8773


In [94]:
# Class balance of the target. The counts are read from the DataFrame column
# rather than from `y`, because `y` may already have been replaced by a
# label-encoded NumPy array (which has no `value_counts` method) by the time
# this cell runs in top-to-bottom order.
class_counts = df['Owner_Type'].value_counts()
print(class_counts)


Owner_Type
First     44
Second    43
Third     13
Name: count, dtype: int64
