# Selecting the best model for classifying the fuel type

### Importing the standard libraries

In [781]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.python.keras.api import keras

### Importing the dataset

In [782]:
# Load the car-details dataset; the path is relative to the notebook's working directory.
dataset = pd.read_csv('../assets/car-details-for-ml.csv')

Moving the dependent variable to the end of the dataset and dropping irrelevant columns.

In [783]:
# Take the price out first so the positional slice below only counts the other columns.
selling_price = dataset.pop('selling_price')

# Keep the first 11 remaining columns, then re-attach the price as the last column.
dataset = dataset.iloc[:, :11]
dataset.insert(dataset.shape[1], 'selling_price', selling_price)

# Move the target ('fuel') to the very end so features and target can be split by position.
dep_variable = dataset.pop('fuel')
dataset.insert(dataset.shape[1], 'fuel', dep_variable)

# Drop the columns that are not used as features (`columns=` already implies the column axis).
dataset = dataset.drop(columns=['year', 'owner', 'transmission', 'seller_type'])

dataset.head()

Unnamed: 0,km_driven,seats,mileage,engine,max_power,nm,selling_price,fuel
0,145500,5.0,23.4,1248,74.0,190.0,450000,0
1,120000,5.0,21.14,1498,103.52,250.0,370000,0
2,140000,5.0,17.7,1497,78.0,124.54,158000,1
3,127000,5.0,23.0,1396,90.0,219.67,225000,0
4,120000,5.0,16.1,1298,88.2,112.78,130000,1


### Splitting data to train and test sets

In [784]:
# Features: every column except the last one.
X = dataset.iloc[:, :-1].values
# Target: the last column ('fuel' was moved to the end of the frame above).
y = dataset.iloc[:, -1].values

In [785]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=73)

### Feature scaling the data

Some of the models used below are sensitive to the scale of their input features, so the features are standardized before training. Feature scaling is a common practice in machine learning because it puts all features on a comparable scale.

In [786]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
std_scaler = StandardScaler().fit(X_train)

X_train_scaled = std_scaler.transform(X_train)
X_test_scaled = std_scaler.transform(X_test)

# Print the first scaled row of each split as a quick sanity check
print(X_train_scaled[:1, :12])
print(X_test_scaled[:1, :12])

[[-0.23 -0.45  0.59 -0.9  -0.7  -0.98 -0.64]]
[[-1.02 -0.45 -0.38  1.05  3.04  2.62  5.11]]


### Defining methods for helping to evaluate the models

In [787]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix = table of counts comparing actual classes with predicted classes
# Accuracy score   = fraction of predictions that are correct

# Names of the three entries in each evaluation record, in the order they are stored.
labels = ['Model', 'Accuracy', 'Confusion Matrix']
# Every record produced by model_evaluation() is appended to this list.
results = []

def model_evaluation(model: str, y_test, y_pred):
    """Score a model's predictions and record the outcome.

    Args:
        model: Display name of the model being evaluated.
        y_test: Actual labels.
        y_pred: Labels predicted by the model.

    Returns:
        A list ``[model, accuracy, confusion_matrix]``, in the same order as
        the module-level ``labels``.

    Side effects:
        Appends a separate list with the same three entries to the
        module-level ``results`` on every call.
    """
    record = [
        model,
        accuracy_score(y_test, y_pred),
        confusion_matrix(y_test, y_pred),
    ]

    # Store a separate list object so the caller's copy and the stored one stay independent.
    results.append(list(record))
    return record

In [788]:
def print_actual_vs_predictions(y_test, y_pred, start: int = 4, stop: int = 12):
    """Print a sample of actual labels side by side with the predicted labels.

    Args:
        y_test: NumPy array of actual labels.
        y_pred: NumPy array of predicted labels.
        start: Index of the first row to print (default 4).
        stop: Index one past the last row to print (default 12).

    Side effects:
        Sets NumPy's global print precision to 2 decimals. The setting stays
        in effect for everything printed after this call.
    """
    # NOTE: this changes print options for the whole session, not only for this function.
    np.set_printoptions(precision=2)

    # Turn both vectors into single-column matrices and place them next to each other.
    actual_col = y_test.reshape(len(y_test), 1)
    predicted_col = y_pred.reshape(len(y_pred), 1)
    actual_vs_pred = np.concatenate((actual_col, predicted_col), axis=1)

    print(["Actual", "Predictions"])
    print(actual_vs_pred[start:stop])

### Training and evaluating the models

## Logistic Regression Classification

In [789]:
from sklearn.linear_model import LogisticRegression

# Fit on the standardized training features; random_state is fixed so reruns are repeatable.
log_reg = LogisticRegression(random_state=73)
log_reg.fit(X_train_scaled, y_train)

LogisticRegression(random_state=73)

In [790]:
# Predict labels for the held-out test features (y_pred is overwritten by each model section).
y_pred = log_reg.predict(X_test_scaled)

# Show a small sample of actual vs. predicted labels.
print_actual_vs_predictions(y_test, y_pred)

['Actual', 'Predictions']
[[0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]]


In [791]:
log_reg_result = model_evaluation('Logistic Regression', y_test, y_pred)

for label, value in zip(labels, log_reg_result):
    # The confusion matrix spans several lines, so start it on its own line.
    separator = ':\n' if label == "Confusion Matrix" else ':'
    print(label, separator, value)

Model : Logistic Regression
Accuracy : 0.9921773142112125
Confusion Matrix :
 [[858   7]
 [  5 664]]


The first model already yields excellent results, misclassifying only about 0.8% of the test data (12 of 1,534 samples). Let's see whether another model can bring the misclassification rate down to 0%.

## K-Nearest Neighbors Classification

In [792]:
from sklearn.neighbors import KNeighborsClassifier

# Use the 3 nearest training samples per prediction; algorithm="auto" leaves the
# choice of neighbor-search method to scikit-learn.
knn_class = KNeighborsClassifier(n_neighbors=3, algorithm="auto")
knn_class.fit(X_train_scaled, y_train)

KNeighborsClassifier(n_neighbors=3)

In [793]:
# Predict labels for the held-out test features (replaces the previous model's y_pred).
y_pred = knn_class.predict(X_test_scaled)

# Show a small sample of actual vs. predicted labels.
print_actual_vs_predictions(y_test, y_pred)

['Actual', 'Predictions']
[[0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]]


In [794]:
knn_class_result = model_evaluation('K-Nearest Neighbors Classification', y_test, y_pred)

for label, value in zip(labels, knn_class_result):
    # The confusion matrix spans several lines, so start it on its own line.
    separator = ':\n' if label == "Confusion Matrix" else ':'
    print(label, separator, value)

Model : K-Nearest Neighbors Classification
Accuracy : 0.9947848761408083
Confusion Matrix :
 [[863   2]
 [  6 663]]


This gets even closer to a 0% misclassification rate, but the model is still about 0.5% off (8 of 1,534 test samples). Let's try another model to see whether it gives a better result.

## Decision Tree Classification

In [795]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters; random_state is fixed so reruns are repeatable.
tree_class = DecisionTreeClassifier( random_state = 73)
tree_class.fit(X_train_scaled, y_train)

DecisionTreeClassifier(random_state=73)

In [796]:
# Predict labels for the held-out test features (replaces the previous model's y_pred).
y_pred = tree_class.predict(X_test_scaled)

# Show a small sample of actual vs. predicted labels.
print_actual_vs_predictions(y_test, y_pred)

['Actual', 'Predictions']
[[0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]]


In [797]:
tree_class_result = model_evaluation('Decision Tree Classification', y_test, y_pred)

for label, value in zip(labels, tree_class_result):
    # The confusion matrix spans several lines, so start it on its own line.
    separator = ':\n' if label == "Confusion Matrix" else ':'
    print(label, separator, value)

Model : Decision Tree Classification
Accuracy : 0.999348109517601
Confusion Matrix :
 [[864   1]
 [  0 669]]


So close to a 0% misclassification rate: the model misses the mark by a single wrong prediction. Let's try Random Forest Classification and see if we can reach 0%.

## Random Forest Classification

In [798]:
from sklearn.ensemble import RandomForestClassifier

# Ensemble of 50 trees; random_state is fixed so reruns are repeatable.
forest_class = RandomForestClassifier(n_estimators=50, random_state=73)
forest_class.fit(X_train_scaled, y_train)

RandomForestClassifier(n_estimators=50, random_state=73)

In [799]:
# Predict labels for the held-out test features (replaces the previous model's y_pred).
y_pred = forest_class.predict(X_test_scaled)

# Show a small sample of actual vs. predicted labels.
print_actual_vs_predictions(y_test, y_pred)

['Actual', 'Predictions']
[[0 0]
 [0 0]
 [1 1]
 [1 1]
 [0 0]
 [1 1]
 [1 1]
 [0 0]]


In [800]:
forest_class_result = model_evaluation('Random Forest Classification', y_test, y_pred)

for label, value in zip(labels, forest_class_result):
    # The confusion matrix spans several lines, so start it on its own line.
    separator = ':\n' if label == "Confusion Matrix" else ':'
    print(label, separator, value)

Model : Random Forest Classification
Accuracy : 0.999348109517601
Confusion Matrix :
 [[865   0]
 [  1 668]]


Again the model misses the mark by just one wrong prediction, this time a false negative instead of a false positive. Let's try a different model and aim for a 0% misclassification rate.