# Vehicle MultiClass Classification

## 1. Setup and Preparation

### Install Packages

In [None]:
%pip install numpy
%pip install pandas
%pip install scikit-learn
%pip install kaggle

### Imports

In [None]:
import os
import pandas as pd
import numpy as np
from google.colab import userdata
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import (
    MultinomialNB,
    GaussianNB,
    ComplementNB,
    BernoulliNB,
)
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

### Download the Dataset

In [None]:
# These values must be set in colab's Secrets
os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

!kaggle datasets download -d austinreese/craigslist-carstrucks-data

!unzip "craigslist-carstrucks-data.zip"

### Load the Dataset with Pandas

In [None]:
# Location of the CSV extracted from the Kaggle archive
dataset_path = '/content/vehicles.csv'
df = pd.read_csv(dataset_path)

### Explore the Dataset

In [None]:
# Dimensions of the raw dataset as (rows, columns)
df.shape

(426880, 26)

In [None]:
# Per-column dtype and non-null count, to see which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426880 entries, 0 to 426879
Data columns (total 26 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            426880 non-null  int64  
 1   url           426880 non-null  object 
 2   region        426880 non-null  object 
 3   region_url    426880 non-null  object 
 4   price         426880 non-null  int64  
 5   year          425675 non-null  float64
 6   manufacturer  409234 non-null  object 
 7   model         421603 non-null  object 
 8   condition     252776 non-null  object 
 9   cylinders     249202 non-null  object 
 10  fuel          423867 non-null  object 
 11  odometer      422480 non-null  float64
 12  title_status  418638 non-null  object 
 13  transmission  424324 non-null  object 
 14  VIN           265838 non-null  object 
 15  drive         296313 non-null  object 
 16  size          120519 non-null  object 
 17  type          334022 non-null  object 
 18  pain

In [None]:
# Peek at three random rows (no random_state is passed, so the rows differ on every run)
df.sample(3)

Unnamed: 0,id,url,region,region_url,price,year,manufacturer,model,condition,cylinders,...,size,type,paint_color,image_url,description,county,state,lat,long,posting_date
146760,7302860898,https://springfieldil.craigslist.org/cto/d/spr...,springfield,https://springfieldil.craigslist.org,2000,2008.0,pontiac,montana sv6,excellent,6 cylinders,...,,van,red,https://images.craigslist.org/00N0N_gKj7oA4JVg...,"I have a 2002 Pontiac Montana, 132xxx miles cl...",,il,39.806,-89.586,2021-04-06T17:23:08-0500
249698,7315971740,https://cnj.craigslist.org/cto/d/old-bridge-to...,central NJ,https://cnj.craigslist.org,18800,2015.0,toyota,highlander limited pla,excellent,6 cylinders,...,full-size,SUV,white,https://images.craigslist.org/00d0d_bIq0mv0SfE...,Hello CL. Selling my 2015 Toyota Highlander Li...,,nj,40.398,-74.3236,2021-05-02T18:43:11-0400
144637,7314019272,https://rockford.craigslist.org/ctd/d/center-p...,rockford,https://rockford.craigslist.org,23900,2014.0,mercedes-benz,e-class,good,6 cylinders,...,,sedan,black,https://images.craigslist.org/00Y0Y_dmJsomJNf2...,2014 Mercedes-Benz E-Class 4dr Sdn E 350 Sport...,,il,42.1898,-91.7758,2021-04-28T15:24:58-0500


## Data Preprocessing

### Features Extraction

We should eliminate the columns that are not related to the actual features of the vehicles

In [None]:
# Print every column name of the raw dataset, one per line
pprint(list(df.columns))

['id',
 'url',
 'region',
 'region_url',
 'price',
 'year',
 'manufacturer',
 'model',
 'condition',
 'cylinders',
 'fuel',
 'odometer',
 'title_status',
 'transmission',
 'VIN',
 'drive',
 'size',
 'type',
 'paint_color',
 'image_url',
 'description',
 'county',
 'state',
 'lat',
 'long',
 'posting_date']


Only the following columns are the ones related to the type of the vehicle:
`price`
`year`
`manufacturer`
`model`
`cylinders`
`fuel`
`transmission`

The task is to classify based on the `type`

In [None]:
# Columns kept for modelling; `type` is the classification target.
# NOTE(review): the markdown above also lists `year` as relevant, but it is not
# kept here — confirm whether that omission is intentional.
features = ['price', 'manufacturer', 'model', 'cylinders', 'fuel', 'transmission', 'type']

# Drop everything that is not in `features` with a single in-place call
unused_columns = [name for name in df.columns if name not in features]
df.drop(columns=unused_columns, inplace=True)

print('Columns after features extraction')
pprint(list(df.columns))

Columns after features extraction
['price', 'manufacturer', 'model', 'cylinders', 'fuel', 'transmission', 'type']


### Data Cleaning

#### Handling missing values

In [None]:
# A row counts as "missing" when at least one of its columns is null
total_number_of_rows = len(df)
missing_rows_count = int(df.isna().any(axis=1).sum())
missing_rows_percentage = (missing_rows_count / total_number_of_rows) * 100

print("Count of rows with missing values:", missing_rows_count)
print("Percentage of rows with missing values:", missing_rows_percentage)

Count of rows with missing values: 216508
Percentage of rows with missing values: 50.71870314842579


Since the dataset is big enough, we can afford to drop every row that contains a null value and still have 210K+ rows

In [None]:
# Keep only the rows in which every column has a value; `df` itself is left untouched
cleaned_df = df.dropna(axis=0, how='any')

rows_before, rows_after = len(df), len(cleaned_df)
print('Number of rows before dropping nulls:', rows_before)
print('Number of rows after dropping nulls:', rows_after)

Number of rows before dropping nulls: 426880
Number of rows after dropping nulls: 210372


#### Data Conversion

In [None]:
# Inspect five random rows of the null-free data (no random_state, so the sample varies per run)
cleaned_df.sample(5)

Unnamed: 0,price,manufacturer,model,cylinders,fuel,transmission,type
73490,13590,chrysler,300,6 cylinders,gas,automatic,sedan
35993,8995,mercedes-benz,e350 sport 3.5l,6 cylinders,gas,automatic,sedan
218846,34590,ford,f150 super cab xl pickup 4d,6 cylinders,gas,other,pickup
398092,5990,honda,civic lx sedan,4 cylinders,gas,automatic,sedan
409435,0,chevrolet,camaro ss,8 cylinders,gas,automatic,coupe


The following columns are categorical:

`manufacturer`, `model`, `cylinders`, `fuel`, `transmission`, `type`

In [None]:
# Report the cardinality of every categorical column to decide how to encode it

categorical_columns = ['manufacturer', 'model', 'cylinders', 'fuel', 'transmission', 'type']
for column in categorical_columns:
  # dropna=False counts NaN as its own value, matching len(Series.unique())
  unique_count = cleaned_df[column].nunique(dropna=False)
  print(f"Number of unique values in '{column}' column", unique_count)

Number of unique values in 'manufacturer' column 41
Number of unique values in 'model' column 16481
Number of unique values in 'cylinders' column 8
Number of unique values in 'fuel' column 5
Number of unique values in 'transmission' column 3
Number of unique values in 'type' column 13


The following columns can be one-hot encoded, as they have a small number of unique values with no ordering:

`fuel`, `transmission`

The first value can be dropped as it can be represented by all other values being False


In [None]:
# Expand each low-cardinality column into boolean indicator columns.
# drop_first=True omits the first category of each column; it is implied when
# all of that column's indicators are False.
one_hot_columns = ['fuel', 'transmission']
cleaned_df = pd.get_dummies(cleaned_df, columns=one_hot_columns, drop_first=True)

print('Columns after applying one-hot encoding to "fuel" and "transmission"\n\n')
print(cleaned_df.columns)

Columns after applying one-hot encoding to "fuel" and "transmission"


Index(['price', 'manufacturer', 'model', 'cylinders', 'type', 'fuel_electric',
       'fuel_gas', 'fuel_hybrid', 'fuel_other', 'transmission_manual',
       'transmission_other'],
      dtype='object')


The remaining columns to convert are:

`manufacturer`, `model`, `cylinders`, `type`

> All of these columns except `cylinders` have no order, so we can convert each of them to integer codes from 0 to the number of unique values - 1


> The `cylinders` column does have an order in its values, so we'll convert each value to the integer number of cylinders, or to 0 when the value is not numerical (some rows have the value "other")

In [None]:
# Label-encode the categorical columns that have no ordering:
# every distinct value is replaced by an integer code (0 .. n_unique - 1).

columns_to_normalize = ['manufacturer', 'model', 'type']

# Keep the code -> original label lookup for each column, so that encoded
# values (e.g. predicted `type` codes) can be mapped back to readable names:
# category_labels['type'][code] is the original string.
category_labels = {}

for column in columns_to_normalize:
  codes, labels = pd.factorize(cleaned_df[column])
  category_labels[column] = labels
  cleaned_df[column] = codes

In [None]:
# Convert the `cylinders` column (e.g. "6 cylinders") to the integer cylinder count.
#
# - The pattern is a raw string: '\d' in a normal string literal is an invalid
#   escape sequence and raises a SyntaxWarning on recent Python versions.
# - expand=False makes str.extract return a Series instead of a one-column DataFrame.
# - Values without any digits (such as "other") yield NaN from the extraction,
#   so no separate replace step is needed; those rows are then set to 0.
cleaned_df['cylinders'] = (
    cleaned_df['cylinders']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .fillna(0)
    .astype(int)
)

In [None]:
# Spot-check ten random rows after all conversions (no random_state, so the sample varies per run)
cleaned_df.sample(10)

Unnamed: 0,price,manufacturer,model,cylinders,type,fuel_electric,fuel_gas,fuel_hybrid,fuel_other,transmission_manual,transmission_other
319668,0,0,3051,8,1,False,False,False,False,False,False
389106,18498,11,54,4,11,False,True,False,False,False,False
49874,57988,3,5047,8,1,False,False,False,False,False,False
83669,39590,3,126,8,3,False,True,False,False,False,True
109842,8550,10,3886,8,6,False,True,False,False,False,False
104819,4000,20,559,6,4,False,True,False,False,False,False
70803,5950,17,978,4,6,False,True,False,False,False,False
115650,25990,0,19,6,0,False,True,False,False,False,True
145574,8000,3,74,8,1,False,True,False,False,False,False
407270,8888,15,16078,6,6,False,True,False,False,True,False


In [None]:
# Pairwise correlation between all columns.
# NOTE(review): `manufacturer`, `model` and `type` hold arbitrary integer codes
# produced by pd.factorize, so their coefficients presumably do not describe a
# meaningful linear relationship — interpret with care.
cleaned_df.corr()

Unnamed: 0,price,manufacturer,model,cylinders,type,fuel_electric,fuel_gas,fuel_hybrid,fuel_other,transmission_manual,transmission_other
price,1.0,-0.002109,-0.001271,0.004169,-0.004905,-7.8e-05,-0.001441,-0.000383,-0.00028,-0.000824,-0.00051
manufacturer,-0.002109,1.0,0.134652,-0.241793,0.320769,0.025818,0.117202,-0.019378,-0.062795,0.035919,-0.037632
model,-0.001271,0.134652,1.0,0.012726,0.084536,0.029905,-0.037048,0.019799,0.001228,0.045709,-0.087614
cylinders,0.004169,-0.241793,0.012726,1.0,-0.437153,-0.08488,-0.141507,-0.098051,0.097618,-0.087986,0.145085
type,-0.004905,0.320769,0.084536,-0.437153,1.0,0.023151,0.246069,0.060999,-0.113295,0.04828,-0.215295
fuel_electric,-7.8e-05,0.025818,0.029905,-0.08488,0.023151,1.0,-0.102905,-0.003877,-0.005742,-0.007991,0.002251
fuel_gas,-0.001441,0.117202,-0.037048,-0.141507,0.246069,-0.102905,1.0,-0.335351,-0.496651,0.030577,-0.01116
fuel_hybrid,-0.000383,-0.019378,0.019799,-0.098051,0.060999,-0.003877,-0.335351,1.0,-0.018713,-0.024303,-0.020604
fuel_other,-0.00028,-0.062795,0.001228,0.097618,-0.113295,-0.005742,-0.496651,-0.018713,1.0,-0.03577,0.155198
transmission_manual,-0.000824,0.035919,0.045709,-0.087986,0.04828,-0.007991,0.030577,-0.024303,-0.03577,1.0,-0.092975


#### Splitting the data into training/testing sets

In [None]:
# `type` is the prediction target; every other remaining column is a feature
y = cleaned_df['type']
X = cleaned_df.drop(columns=['type'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

## Logistic Regression

In [None]:
# Learn the scaling statistics from the training features only,
# so no information from the test set leaks into the scaling
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaling to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Hyper-parameter candidates: every value of C is tried with every solver
param_grid = dict(
    C=[0.1, 1, 10, 100],
    solver=['liblinear', 'saga'],
)

# 5-fold cross-validated search, ranking candidates by weighted F1
logistic_regression = LogisticRegression(max_iter=1000, random_state=42)
grid_search = GridSearchCV(
    estimator=logistic_regression,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
)
grid_search.fit(X_train_scaled, y_train)

# Predict the held-out test set with the best candidate found by the search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

In [None]:
# Calculate evaluation metrics, averaged over classes weighted by their support.
# zero_division=0 scores an undefined per-class metric (e.g. precision of a
# class that is never predicted) as 0 explicitly: the same value the default
# produces, but without emitting an ill-defined-metric warning.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Report the results
print("Model Performance:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Model Performance:
Accuracy: 0.4469875222816399
Precision: 0.41490618165224724
Recall: 0.4469875222816399
F1 Score: 0.3997927634356582


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



## Decision Tree

In [None]:
# Define the parameter grid
param_grid = {
    # "gini" was listed twice, and "log_loss" selects the same Shannon
    # information-gain criterion as "entropy", so both extra entries only
    # repeated fits that were already in the grid.
    'criterion': ["gini", "entropy"],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize Grid Search (5-fold CV, candidates ranked by weighted F1).
# A fixed random_state makes the fitted trees, and therefore the reported
# scores, reproducible across runs.
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, scoring='f1_weighted', cv=5)
grid_search.fit(X_train, y_train)

# Get the best model and predict the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Accuracy plus support-weighted precision / recall / F1 on the test set
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Print one line per metric
print("Model Performance:")
for label, value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
  print(label, value)

Model Performance:
Accuracy: 0.822075390977801
Precision: 0.8221125870366728
Recall: 0.822075390977801
F1 Score: 0.8220374832402361


## Naive Bayes

In [None]:
# Naive Bayes variants to compare; each is trained with its default parameters
naive_bayes_models = [MultinomialNB, GaussianNB, ComplementNB, BernoulliNB]

for i, naive_bayes_model in enumerate(naive_bayes_models):
  # Label each report with the model's name, not only its position in the list
  print(f"{i}: {naive_bayes_model.__name__}")
  print("-----------------------------------------")
  nb = naive_bayes_model()
  nb.fit(X_train, y_train)

  # Predict the vehicle type on the test set
  y_pred = nb.predict(X_test)

  # Calculate evaluation metrics, averaged over classes weighted by their support.
  # zero_division=0 scores an undefined per-class metric (e.g. precision of a
  # class that is never predicted) as 0 explicitly: the same value the default
  # produces, but without emitting an ill-defined-metric warning.
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
  recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
  f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

  # Report the results
  print("Model Performance:")
  print("Accuracy:", accuracy)
  print("Precision:", precision)
  print("Recall:", recall)
  print("F1 Score:", f1, "\n\n")

0
-----------------------------------------
Model Performance:
Accuracy: 0.1174331550802139
Precision: 0.22611673849302893
Recall: 0.1174331550802139
F1 Score: 0.07756185499173393 


1
-----------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model Performance:
Accuracy: 0.3185739750445633
Precision: 0.196550429672712
Recall: 0.3185739750445633
F1 Score: 0.20203798978569118 


2
-----------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model Performance:
Accuracy: 0.20111705288175877
Precision: 0.09529399521707825
Recall: 0.20111705288175877
F1 Score: 0.1223380819862347 


3
-----------------------------------------
Model Performance:
Accuracy: 0.34167557932263815
Precision: 0.3346999740594437
Recall: 0.34167557932263815
F1 Score: 0.26923246006401724 




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
