In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

#### Dataset features
buying price: v-high, high, med, low

maintenance price: v-high, high, med, low

number of doors: 2, 3, 4, 5-more

person capacity: 2, 4, more

luggage boot: small, med, big

safety: low, med, high

acceptability: unacc, acc, good, v-good

In [2]:
# Read the car-evaluation records and report how many rows were loaded
cars_data = pd.read_csv("cars.csv")
print(f'Number of cars in the dataset: {len(cars_data)}')

Number of cars in the dataset: 1728


In [3]:
cars_data.head()

Unnamed: 0,buying price,maintenance price,number of doors,person capacity,luggage boot,safety,acceptability
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


# 1. Initial Analysis of the Data

In [4]:
# checking if data contains any Null values
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   buying price       1728 non-null   object
 1   maintenance price  1728 non-null   object
 2   number of doors    1728 non-null   object
 3   person capacity    1728 non-null   object
 4   luggage boot       1728 non-null   object
 5   safety             1728 non-null   object
 6   acceptability      1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


**Remark**: There are no Null values in the dataset. All columns have the object data type; we could convert them to categorical data, which reflects their true type and saves memory, but this is a small dataset so it does not seem necessary.

In [5]:
# Some statistics about the dataset
cars_data.describe()

Unnamed: 0,buying price,maintenance price,number of doors,person capacity,luggage boot,safety,acceptability
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,low,low,4,more,small,low,unacc
freq,432,432,432,576,576,576,1210


**Remark:** It seems that more than two thirds of the cars in the dataset (1210 of 1728, about 70%) are unacceptable. 
Let's plot counts for each value in the acceptability field, to see how many cars are unacceptable, acceptable, good and very good.

In [6]:
# Get the count for each value of the acceptability field
cars_data['acceptability'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: acceptability, dtype: int64

In [None]:
!pip install psutil

In [7]:
import plotly.graph_objects as go

# Derive the class counts from the data instead of hard-coding them, so the chart
# cannot drift from the dataset (value_counts orders classes by descending count)
acceptability_counts = cars_data['acceptability'].value_counts()
labels = acceptability_counts.index.tolist()
values = acceptability_counts.tolist()
# plotting the number of cars on each acceptability category
fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
fig.show("png")

ValueError: Image generation requires the psutil package.

Install using pip:
    $ pip install psutil

Install using conda:
    $ conda install psutil


**Remark:** 70% of the cars in the dataset are unacceptable, while good and very good classes are each less than 4% of the data. 

This renders the dataset as a very unbalanced one, which we should take into account when providing a solution using Machine Learning algorithms. 

In [None]:
# Split the dataset into one frame per acceptability class so the feature
# distribution of each class can be inspected separately
acc_cars = cars_data[cars_data['acceptability'].eq('acc')]
unacc_cars = cars_data[cars_data['acceptability'].eq('unacc')]
good_cars = cars_data[cars_data['acceptability'].eq('good')]
vgood_cars = cars_data[cars_data['acceptability'].eq('vgood')]

In [None]:
# function to plot graph of features
def stacked_bar_graph(title, data):
    """Display a stacked bar chart with one bar per feature.

    Parameters
    ----------
    title : str
        Title placed on the figure.
    data : mapping
        Must provide the keys "model_1" .. "model_4"; each value is used as the
        bar heights (and bar text) of one trace, positionally matched to the six
        feature names on the x-axis.

    The figure is shown; nothing is returned.
    """
    feature_names = ["buying price","maintenance price","number of doors","person capacity","luggage boot","safety"]
    # (key in `data`, legend entry, bar colour) for each trace, in trace order
    trace_specs = [
        ("model_1", "lowest value", 'red'),
        ("model_2", "medium value", 'lightsalmon'),
        ("model_3", "high value", 'lime'),
        ("model_4", "very high value", 'limegreen'),
    ]
    bars = [
        go.Bar(name=legend, x=feature_names, y=data[key], marker={'color': colour},
               text=data[key], textposition='auto')
        for key, legend, colour in trace_specs
    ]
    fig1 = go.Figure(data=bars, layout=go.Layout(title=title, yaxis_title="Feature's value"))
    fig1.update_layout(barmode='stack')
    fig1.show()

In [None]:
# function to get the value counts of each feature for different values of car acceptability
def graph_data_prep(cars_acceptability_filtered):
    """Count how often each level of every feature occurs in the given cars.

    Parameters
    ----------
    cars_acceptability_filtered : pandas.DataFrame
        Rows to summarise; must contain the six feature columns listed below,
        holding the string categories listed for them.

    Returns
    -------
    dict
        Keys "model_1" .. "model_4" (1st .. 4th level of each feature). Each value
        is a list of six counts ordered as: buying price, maintenance price,
        number of doors, person capacity, luggage boot, safety. Features that
        only have three levels contribute 0 to "model_4".

    A level that does not occur in the input is counted as 0 instead of raising
    KeyError, so any subset of the data (even an empty one) can be summarised.
    """
    # levels of every feature, ordered from lowest to highest
    feature_levels = {
        'buying price': ['low', 'med', 'high', 'vhigh'],
        'maintenance price': ['low', 'med', 'high', 'vhigh'],
        'number of doors': ['2', '3', '4', '5more'],
        'person capacity': ['2', '4', 'more'],
        'luggage boot': ['small', 'med', 'big'],
        'safety': ['low', 'med', 'high'],
    }
    counts = {feature: cars_acceptability_filtered[feature].value_counts()
              for feature in feature_levels}

    data = {}
    for position in range(4):
        data["model_%d" % (position + 1)] = [
            # .get(..., 0): a level absent from this subset simply has zero cars
            counts[feature].get(levels[position], 0) if position < len(levels) else 0
            for feature, levels in feature_levels.items()
        ]
    return data

## 1.1 Feature Analysis
Below are 4 different graphs showing features for each 4 acceptability category of cars (unacceptable, acceptable, good, very good).

Fixing the acceptability value enables us to see which features play a significant role on the cars acceptability. 

### 1.1.1 Unacceptable Cars
#### Analyses of cars' features when the cars' acceptability is Unacceptable (unacc)

In [None]:
# Count the feature values among the unacceptable cars and draw them as a stacked bar chart
unacc_data = graph_data_prep(unacc_cars)
stacked_bar_graph("Unacceptable Cars' Features",unacc_data)

**Remarks:**

Number of unacceptable cars: 1210

From the above plot we can see that:
* Unacceptable cars have almost equally all values of 'buying price' and 'maintenance price', with a slightly more unacceptable cars having higher value prices.
* More unacceptable cars have the lowest value (2 doors) for the number of doors. However, there are cars with more doors that are also unacceptable.
* Almost half of the unacceptable cars have the lowest person capacity (2 person). So, person capacity seems to be an important feature for categorizing the car as unacceptable. Similarly is the 'safety' features values.
* More unacceptable cars have the lowest value of 'Luggage boot', but not a higher significant amount from the other values. 

**Note**: Buying price, maintenance price and number of doors can have 4 different values. Whereas person capacity, luggage boot and safety can have 3 different values (there is no very high value).

### 1.1.2 Acceptable Cars
#### Analyses of cars' features when the cars' acceptability is Acceptable (acc)

In [None]:
# Count the feature values among the acceptable cars and draw them as a stacked bar chart
acc_data = graph_data_prep(acc_cars)
stacked_bar_graph("Acceptable Cars' Features",acc_data)

**Remarks:** 

Number of acceptable cars: 384

From the above plot we can see that
* Acceptable cars have all values of buying prices (same thing is for the maintenance price and number of doors)
* There are no acceptable cars with the lowest value of person capacity, so we can say that it is important to not have the lowest person capacity for the car to be acceptable
* There are not acceptable cars that have the lowest value of safety, so we can say that it is important to not have the lowest safety for the car to be acceptable

### 1.1.3 Good Cars
#### Analyses of cars' features when the cars' acceptability is Good (good)

In [None]:
# Count the feature values among the 'good' cars and draw them as a stacked bar chart
good_data = graph_data_prep(good_cars)
stacked_bar_graph("Good Cars' Features",good_data)

**Remarks:** 

Number of cars with acceptability 'Good': 69

From the above plot we can see that
* There are no 'Good' cars that have 'high' or 'vhigh' buying price as well as maintenance price. So having medium and low values for the prices impacts the classification of cars to have 'good' acceptability value.
* There are no 'Good' cars that have the lowest 'person capacity'. Same is for the 'safety'. So, having medium to high values of 'person capacity' and 'safety' play a significant role in a car being classified as 'good'.
* 'Number of doors' and 'Luggage boot' does not seem to play a significant role for cars to be classified as 'good'. There is slightly less 'good' cars having the lowest values for these two features.

### 1.1.4 Very Good Cars
#### Analyses of cars' features when the cars' acceptability is Very Good (vgood)

In [None]:
# Count the feature values among the 'vgood' cars and draw them as a stacked bar chart
vgood_data = graph_data_prep(vgood_cars)
stacked_bar_graph("Very Good Cars' Features",vgood_data)

**Remarks:** 

Number of cars with acceptability very good 'v-good': 65

From the above plot we can see that
* Cars categorized with 'v-good' acceptability have low and medium values of 'buying price' and **no 'v-good' cars have high or very high 'buying price'**. Similar is the 'maintenance price' however, there are some cars that have high value of 'maintenance price' but there is no cars that have 'v-high' maintenance price.
* **Safety is an important feature** for cars with 'v-good' acceptability, since all of them have the highest safety value. No car with lower values of safety is categorized with 'v-good' acceptability.
* There are no 'v-good' cars that have the lowest values of 'person capacity' or 'luggage boot'.

## 1.2 Further Analysis
From the above analysis and insights we can see that **'safety'**, **'person capacity'**,  and **'buying price'** have the most significance to find the cars' acceptability. With **'person capacity'** being the next important feature, while **'number of doors'** does not seem to have much significance.

We can further analyse each feature and its significance with the acceptability of the cars using the **crosstab function**.

#### Safety and Car Acceptability

In [None]:
# Cross-tabulate safety level (rows, ordered low -> high) against acceptability class (columns)
safety = pd.crosstab(cars_data['safety'], cars_data['acceptability'])
safety = safety.reindex(index=['low','med','high'], columns=['unacc','good', 'vgood','acc'])
safety

As seen from the analysis and charts above, safety plays a significant role in classifying a car's acceptability. **Cars with low safety are considered less acceptable than those with higher safety.**

#### Person-Capacity and Car Acceptability

In [None]:
# Cross-tabulate person capacity (rows, ordered 2 -> more) against acceptability class (columns)
person_capacity = pd.crosstab(cars_data['person capacity'], cars_data['acceptability'])
person_capacity = person_capacity.reindex(index=['2','4','more'], columns=['unacc','good', 'vgood','acc'])
person_capacity

Again, as seen from the analysis and charts above, person capacity plays a significant role in classifying a car's acceptability. **Cars with lower person capacity are considered less acceptable than those with higher person capacity.**

#### Buying price and Car Acceptability

In [None]:
# Cross-tabulate buying price (rows, ordered low -> vhigh) against acceptability class (columns)
buying_price = pd.crosstab(cars_data['buying price'], cars_data['acceptability'])
buying_price = buying_price.reindex(index=['low','med', 'high','vhigh'], columns=['unacc','good', 'vgood','acc'])
buying_price

In [None]:
# Cross-tabulate number of doors (rows, ordered 2 -> 5more) against acceptability class;
# the columns keep the order produced by crosstab
number_doors = pd.crosstab(
    cars_data['number of doors'], cars_data['acceptability']
).reindex(index=['2','3', '4','5more'])
number_doors

Here, as in the analysis above, we observe that the number of doors does not play a significant role in determining the acceptability of the cars.

# 2. Machine Learning

The data we have contains labels for the target variable thus this is a supervised machine learning problem. Also, since we have labels and known values of the target variable then we can consider this as a **classification problem**.


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

In [None]:
# Separate copy of the raw data that will hold the numerically encoded values,
# so that cars_data itself keeps the original string categories
cars_data_replace = cars_data.copy()

## 2.1 Data Preparation
### Encoding the categorical values 
Firstly, all the variables in our dataset hold categorical values, which is not what a machine learning algorithm expects. Therefore, we need to convert them to numerical data.

All the variables' values have a meaningful order which makes them ordinal categorical variables. Therefore, we need to encode them in such a way as to retain the ordinal nature of the variables. To do that we need to perform **ordinal encoding**.

In [None]:
# Ordinal encoding table: for every column, each category is mapped to an integer
# whose magnitude follows the category's natural order (lowest level -> 1)
encoding_values = {
    'buying price': {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4},
    'maintenance price': {'low': 1, 'med': 2, 'high': 3, 'vhigh': 4},
    'number of doors': {'2': 1, '3': 2, '4': 3, '5more': 4},
    'person capacity': {'2': 1, '4': 2, 'more': 3},
    'luggage boot': {'small': 1, 'med': 2, 'big': 3},
    'safety': {'low': 1, 'med': 2, 'high': 3},
    'acceptability': {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4},
}

In [None]:
# Encode each column through its explicit mapping and build a fresh frame from the
# raw data (re-running this cell always gives the same result). Series.map is used
# instead of DataFrame.replace(..., inplace=True): replace leaves the columns with
# object dtype unless pandas silently downcasts them, and that downcast is deprecated.
# Only the columns listed in encoding_values are kept, in that order.
cars_data_replace = pd.DataFrame(
    {column: cars_data[column].map(mapping) for column, mapping in encoding_values.items()}
)
# map() yields NaN for any category that has no entry in encoding_values
assert cars_data_replace.notna().all().all(), "cars_data contains a category missing from encoding_values"
# make the integer dtype explicit so the estimators receive numeric features and labels
cars_data_replace = cars_data_replace.astype(int)
cars_data_replace.head()

### Creating the Test and Training data sets

In order to train the model we need to prepare the training set, here I take 75% of the whole dataset and use it for training. The rest of the data (25%) will be used for testing the model.

In [None]:
# The last column (acceptability) is the target; every other column is a feature
target_column = cars_data_replace.columns[-1]
cars_y = cars_data_replace[target_column]
cars_x = cars_data_replace.drop(columns=target_column)
# hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    cars_x, cars_y, random_state=0, test_size=0.25
)

In [None]:
y_train.describe()

## 2.2 ML Algorithm 

The problem we are trying to solve is a classification problem, specifically supervised classification. Some of the machine learning algorithms that are used for solving this type of problems are Support Vector Machines (SVM), K-nearest neighbor, Naive Bayes, Decision Trees, etc.

Considering that we have ordinal categorical features, and a multi class classification problem, I am approaching the solution to this problem through a Decision Tree model and a SVM model. 

### Decision Tree Classifier
Given that the data contains all ordinal categorical attributes, then using a **Decision Tree Algorithm** seems to be the right choice since this algorithm is suitable for use when feature values are categorical.
Additionally, it is suitable given that some attributes, such as safety, as seen from the analyses above play a more significant role on the cars classification than others, and knowing that this is how decision trees work, by selecting more significant features to split the dataset, makes this a suitable ML algorithm.

Since 70% of the data belongs to one target class (unacceptable), the classes in the dataset are very unbalanced. This can lead to a biased tree, since the frequently occurring classes are preferred over the less frequently occurring ones. Specifying class weights to give more importance to the less represented classes can help give the unbalanced classes a similar role in the purity of nodes. We can do this by setting the *class_weight* to 'balanced'.

#### Parameter tuning and cross validation
Parameters of Decision Tree that we are going to optimize for here are *max_depth* which signifies the depth of the tree, and the criterion which the algorithm will use to split the data into subsets (sub-nodes) while creating the tree.

I will use *GridSearchCV* functionality of sklearn to perform cross-validation and parameter tuning to find the best parameters on which we can train the Decision Tree.

In [None]:
# grid of parameters to choose from
param_grid = { 'criterion':['gini','entropy'],'max_depth': np.arange(3, 20)}
# Tune the same estimator configuration that is trained afterwards: the final tree
# uses class_weight='balanced', so the search must use it too, otherwise the selected
# depth/criterion are optimal for a differently weighted model
DT_classifier = GridSearchCV(
    DecisionTreeClassifier(class_weight='balanced', random_state=0), param_grid, cv=10
)
DT_classifier.fit(X_train, y_train)
# best_estimator_ is the tree refitted on the whole training set with the best parameters
tree_model = DT_classifier.best_estimator_
print (DT_classifier.best_params_)

In [None]:
# create and train the model with the criterion and depth selected by the grid search
# (read from tree_model, the search's best estimator) rather than hard-coded literals
# that can silently drift from the search result; class weights stay balanced
DT_classifier = DecisionTreeClassifier(criterion=tree_model.criterion, class_weight='balanced',
                                       max_depth=tree_model.max_depth, random_state=0)
DT_classifier.fit(X_train, y_train)

**Evaluating the Decision Tree**

To assess the performance of the Decision Tree model we check the accuracy, precision, recall, f1 score, as well as the confusion matrix.

In [None]:
# predict the held-out test set
y_pred = DT_classifier.predict(X_test)
# macro-averaged f1 and the confusion matrix, both computed from those predictions
DT_classifier_f1 = f1_score(y_test, y_pred, average='macro')
cm = confusion_matrix(y_test, y_pred)
# per-class precision / recall / f1 table
print(classification_report(y_test,y_pred))
print("Training Accuracy: ",DT_classifier.score(X_train, y_train))
print("Testing Accuracy: ", DT_classifier.score(X_test, y_test))
print("Confusion Matrix: \n",cm)

Here we can see that the accuracy is really high, but accuracy is not a good measure for our model given that our dataset is very unbalanced, and the algorithm is designed to maximize for accuracy and reduce error. For better evaluation of our model we can look at the precision, recall, f1 score and confusion matrix since they give better insights. Therefore, the precision and recall seem to be very high for all 4 target classes. Additionally, the confusion matrix shows an almost perfect diagonal (perfect classifier) which indicates that the model is performing well. 

### SVM Model

Another approach to classify cars in this dataset is using Support Vector Machines (SVMs). SVMs try to find a hyperplane in an n-dimensional space that separates the data points to their potential classes.

##### Parameter Tuning
SVM parameters that need optimizing are **C parameter** which adds a penalty for each misclassified data point, and **Gamma parameter** which controls the distance of influence of a single training point. Again as above, we can find the best parameters utilizing the *GridSearchCV* functionality of sklearn.

Since I will be evaluating SVM using the rbf kernel function, I am applying scaling so that the features fall in the same range.

In [None]:
from sklearn.preprocessing import StandardScaler

# learn the scaling statistics from the training split only, then apply that same
# transformation to both splits
# NOTE: X_train / X_test are rebound to their scaled versions here, so every cell
# below this one works on scaled features
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# parameter tuning: function to find the best SVM parameters and kernel
def best_svm_parameters():
    """Grid-search SVM kernels and hyper-parameters on the global X_train / y_train.

    One 10-fold cross-validated search scores every candidate on both macro
    precision and macro recall, and the best candidate for each metric is printed.
    This replaces two separate searches of which only the last (recall) result
    was ever used.

    Returns
    -------
    dict
        The parameter combination with the best macro recall (the metric listed
        last in ``scores``), as reported by ``GridSearchCV.best_params_``.
    """
    print("Tuning hyper-parameters...")
    # scores to use to evaluate while finding the best parameters
    scores = ['precision','recall']
    scorers = ['%s_macro' % score for score in scores]
    # parameters for which to tune and their grid values
    # (each C value is listed once: 0.10 and 0.1 are the same number and were fitted twice)
    c_values = [0.001, 0.1, 10, 25, 50, 100, 1000]
    gamma_values = [1e-2, 1e-3, 1e-4, 1e-5]
    tuned_parameters = [{'kernel': ['rbf'], 'gamma': gamma_values, 'C': c_values},
                        {'kernel': ['sigmoid'], 'gamma': gamma_values, 'C': c_values},
                        {'kernel': ['linear'], 'C': c_values}
                       ]
    # multi-metric search; refit on the last metric so best_params_ refers to it
    grid_search = GridSearchCV(SVC(), tuned_parameters, cv=10, scoring=scorers, refit=scorers[-1])
    grid_search.fit(X_train, y_train)
    results = grid_search.cv_results_
    for scorer in scorers:
        # rank 1 marks the best candidate for this metric
        best_index = results['rank_test_%s' % scorer].argmin()
        print("Best parameters by %s: %s" % (scorer, results['params'][best_index]))
    print("Best parameters for SVM are:")
    return grid_search.best_params_

In [None]:
best_svm_parameters()

Let's now train the SVM model with the best hyper-parameters.

In [None]:
# Final SVM: radial-basis-function kernel with the selected C and gamma values
rbf_svm = SVC(kernel='rbf', C=1000, gamma=0.01, random_state=0)
rbf_svm.fit(X_train, y_train)

To evaluate the SVM model we can check the values for the accuracy, precision, recall, f1 score as well as the confusion matrix which shows the true classified cases and the mistakes that the model has made on the testing set.

In [None]:
# function to evaluate the SVM by calculating the precision, recall, f1 score and the confusion matrix
def evaluating_svm(clf):
    """Print evaluation metrics for a fitted classifier.

    Uses the global X_train / y_train / X_test / y_test splits and prints, in order:
    the per-class classification report on the test set, the training accuracy,
    the testing accuracy and the test-set confusion matrix.

    Parameters
    ----------
    clf : fitted estimator
        Must provide ``predict`` and ``score``.

    Nothing is returned.
    """
    y_pred = clf.predict(X_test)
    # the report already contains the per-class and macro f1, so no separate
    # f1 value is computed here
    print(classification_report(y_test,y_pred))
    print("Training Accuracy: ",clf.score(X_train, y_train))
    print("Testing Accuracy: ", clf.score(X_test, y_test))
    print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))

In [None]:
evaluating_svm(rbf_svm)

The confusion matrix shows that the model is performing really well, since there are only 17 data points that the model has misclassified. The diagonal of the matrix represents the number of data points for which the predicted class is equal to the true class, and that is where almost all the data points are shown, thus the model's performance seems to be really high. 

### Comparing the Decision Tree Classifier with the SVM Classifier

For better insight and a comparison between the two models we can compare their confusion matrices. We can see that the Decision Tree model makes slightly fewer mistakes on the test set compared to the SVM, which makes it a slightly better model. However, both models' performance seems to be very good and one can choose between the two. 

I will save both classifiers so that they can be utilized for future predictions and deployment. 

In [None]:
import pickle
# Save SVM classifier
filename = 'svm_model.sav'
# the with-block closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(rbf_svm, model_file)
# The SVM was fitted on features scaled by `sc`, so the scaler is saved as well:
# new inputs have to be transformed with it before calling rbf_svm.predict
with open('svm_scaler.sav', 'wb') as scaler_file:
    pickle.dump(sc, scaler_file)

In [None]:
# Save Decision Tree classifier
filename = 'dt_model.sav'
# the with-block closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(DT_classifier, model_file)

**The trained models can be used to make predictions. To see how to use them follow the steps in Readme file.**