## Install libraries/packages

In [None]:
!pip install pandas
!pip install scikit-learn
!pip install matplotlib
!pip install plotly
!pip install seaborn
!pip install numpy

## Import libraries/packages

In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

## Load data

In [None]:
# Path of the CSV, relative to the notebook, and the frame read from it.
file = '../input/car.csv'
data = pd.read_csv(filepath_or_buffer=file)

Below we will start our exploratory data analysis for the specific data set.

## Exploratory data analysis

In [None]:
# Report the dimensions of the data set.

n_rows, n_cols = data.shape
print("The data set has {} rows and {} columns.".format(n_rows, n_cols))

Getting the first rows of the data set.

In [None]:
# Preview the first 5 rows of the data set.

data.head(5)

Inspect the column types of the data set, then check for missing values and duplicate rows.

In [None]:
# Summary of the columns: names, non-null counts and data types.

data.info()

In [None]:
# Per column: True if at least one value is missing.

data.isnull().any()

In [None]:
# True if any row is an exact duplicate of an earlier row.

data.duplicated().any()

Since all the columns are categorical, we change the data types to "category". This will come in handy in case we want to sort any column of the data set.

In [None]:
# Give every column an ordered categorical dtype (levels listed lowest first).

def _ordered(*levels):
    """Build an ordered CategoricalDtype from the given levels."""
    return CategoricalDtype(list(levels), ordered=True)

buying_type = _ordered('low', 'med', 'high', 'vhigh')
maint_type = _ordered('low', 'med', 'high', 'vhigh')
doors_type = _ordered('2', '3', '4', '5more')
persons_type = _ordered('2', '4', 'more')
lug_boot_type = _ordered('small', 'med', 'big')
safety_type = _ordered('low', 'med', 'high')
class_type = _ordered('unacc', 'acc', 'good', 'vgood')

# Cast each column to its dtype, in the same column order as before.
for column, dtype in [
    ('buying', buying_type),
    ('maint', maint_type),
    ('doors', doors_type),
    ('persons', persons_type),
    ('lug_boot', lug_boot_type),
    ('safety', safety_type),
    ('class_val', class_type),
]:
    data[column] = data[column].astype(dtype)

### Value distribution for each attribute

In [None]:
# Frequency of each category, column by column.

for column in data.columns:
    counts = data[column].value_counts()
    print(counts, '\n')

In [None]:
# Plot the share of each category per column as a pie chart.

for column in data.columns:
    # Labels and values are taken from the same value_counts() result so every
    # slice is paired with its own count. unique() lists values in order of
    # appearance while value_counts() sorts by frequency, so combining the two
    # can attach a count to the wrong label.
    counts = data[column].value_counts()
    labels = [str(label) for label in counts.index]
    values = counts.tolist()
    fig = go.Figure(data=[go.Pie(labels=labels, values=values)])
    fig.update_layout(title=go.layout.Title(text='Value distribution for column {}'.format(column),xref="paper",x=.5))
    fig.show()

## Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder
# Shared encoder instance; it is fitted in the encoding cell further down.
enc = LabelEncoder()

In [None]:
# le=LabelEncoder()

# for i in test.columns:
#     test[i]=le.fit_transform(test[i])

In [None]:
# Allowed values of each attribute, as plain lists (lowest level first).

buying_type = 'low med high vhigh'.split()
maint_type = list(buying_type)
doors_type = '2 3 4 5more'.split()
persons_type = '2 4 more'.split()
lug_boot_type = 'small med big'.split()
safety_type = 'low med high'.split()
class_type = 'unacc acc good vgood'.split()

In [None]:
# Encode the feature columns as ordinal integers that follow the declared
# level order (e.g. low=0 < med=1 < high=2 < vhigh=3).
# LabelEncoder assigns codes in sorted (alphabetical) class order -- for
# "buying" that is high=0, low=1, med=2, vhigh=3 -- which scrambles the natural
# ordering, so it is only used for the target below.
feature_levels = {
    'buying': buying_type,
    'maint': maint_type,
    'doors': doors_type,
    'persons': persons_type,
    'lug_boot': lug_boot_type,
    'safety': safety_type,
}
for column, levels in feature_levels.items():
    if pd.api.types.is_integer_dtype(data[column]):
        continue  # already encoded; keeps the cell safe to re-run
    codes = pd.Categorical(data[column], categories=levels, ordered=True).codes
    if (codes < 0).any():
        # Categorical marks missing or undeclared values with the code -1.
        raise ValueError("Column '{}' contains values outside {}".format(column, levels))
    data[column] = codes.astype('int64')

# Encode values of "class values" to numbers.
# The encoder is left fitted on the class labels, so enc.classes_[k] is the
# name of encoded class k.
enc.fit(class_type)
if not pd.api.types.is_integer_dtype(data['class_val']):
    data['class_val'] = enc.transform(data['class_val'])

In [None]:
# Preview the frame after the encoding step.
data.head()

### Correlation heatmap

In [None]:
# Pairwise correlation of the encoded columns, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10,6))
sns.heatmap(
    data.corr(),
    annot=True,
    cmap='rainbow',
    linewidth=0.5,
    ax=ax,
);

### Splitting to X_train, X_test, y_train, y_test sets.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the frame into the feature matrix X and the target y.
# Both are selected by name, so the target cannot end up inside X if the
# column order of the frame ever changes.
X=data.drop(columns='class_val')
y=data['class_val']

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Hold out 30% of the rows for testing; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=42)

# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit on the training split only, then apply the same scaling to both splits.
scaler.fit(X_train)

# transform() returns a bare array; wrap it back into a DataFrame so the
# feature names and row index stay available to later cells.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

## Model Selection

We will try three models:
 - Logistic Regression
 - Decision trees
 - Neural network

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, validation_curve, learning_curve
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Multinomial logistic regression with the SAGA solver and a fixed seed.
logreg = LogisticRegression(
    random_state=42,
    solver='saga',
    multi_class='multinomial',
)

In [None]:
# Fit the logistic regression on the training split.
logreg.fit(X_train,y_train)

In [None]:
# Predicted class codes for the test split; reused by the metric cells below.
log_pred=logreg.predict(X_test)

In [None]:
# Mean accuracy of the logistic regression on the test split.
logreg.score(X_test,y_test)

In [None]:
# Learning curve of the logistic regression (10-fold cross-validation on the
# training split).
lc=learning_curve(logreg,X_train,y_train,cv=10,n_jobs=-1)
size=lc[0]
# lc[1] / lc[2] hold one row per training size and one column per fold;
# averaging over the folds works for any number of training sizes.
train_score=lc[1].mean(axis=1)
test_score=lc[2].mean(axis=1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=size, y=train_score,mode='lines+markers',name='train_score'))
fig.add_trace(go.Scatter(x=size, y=test_score,mode='lines+markers',name='test_score'))
fig.update_layout(
    title='Learning curve: Logistic Regression',
    xaxis_title='Training set size',
    yaxis_title='Mean accuracy over folds'
)
fig.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import metrics

# NOTE: the class labels are integer codes, so the two error figures below
# measure the distance between codes rather than a true error magnitude.

# Mean squared difference between true and predicted class codes.
print("Mean squared error: %.2f" % mean_squared_error(y_test, log_pred))

# Mean absolute difference between true and predicted class codes.
print("Average absolute error: %.2f" % mean_absolute_error(y_test, log_pred))

# score() of a classifier is the mean accuracy (1 is perfect prediction),
# not an explained-variance score, so it is labelled accordingly.
print('Accuracy: %.2f' % logreg.score(X_test, y_test))

### Decision trees

In [None]:
# Decision tree classifier from scikit-learn's tree module.
from sklearn import tree

# Entropy-based splits ('gini' is the alternative), capped at depth 5, fixed seed.
clf = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    random_state=42,
)

In [None]:
# Train the classifier on the training split and keep the result bound to `clf`.
clf = clf.fit(X_train, y_train)

In [None]:
# Test accuracy of entropy trees for depths 1 .. max_depth-1.
# NOTE: the curve is computed on the test split, so it is for inspection only;
# choosing a depth from it would leak test information into the model.
max_depth=12
depths=list(range(1,max_depth))
scores=[]
for d in depths:
    # Use a throwaway estimator so the sweep does not overwrite `clf`, the
    # depth-5 tree that the following cells plot and evaluate.
    depth_clf = tree.DecisionTreeClassifier(max_depth=d, criterion='entropy', random_state=42) # can be 'gini' or 'entropy'
    depth_clf.fit(X_train, y_train)
    scores.append(depth_clf.score(X_test, y_test))

fig = go.Figure(data=go.Scatter(x=depths, y=scores))
fig.update_layout(
    title='Decision tree accuracy by maximum depth',
    xaxis_title='max_depth',
    yaxis_title='Accuracy on the test split'
)
fig.show()


In [None]:
# Plot the tree.

# Create the figure before drawing so the requested size applies to the tree
# itself; creating it afterwards only adds a second, empty figure.
tree_fig, tree_ax = plt.subplots(figsize=(12,6))
tree.plot_tree(clf, ax=tree_ax);

In [None]:
# Create an image with the rules of the tree.
from graphviz import Source
from IPython.display import SVG

# Feature names are read from the feature frame X. X_train cannot be used here
# because the standardization step replaces it with the scaler's array output,
# which has no `.columns`.
feature_names = X.columns.tolist()

# Simpler black & white.
# graph=Source(tree.export_graphviz(clf, out_file=None, feature_names=feature_names))

# More info with colors. enc.classes_ supplies the class names; this relies on
# the encoder having been fitted on the class labels last.
graph=Source(tree.export_graphviz(clf,feature_names = feature_names, out_file=None, class_names = enc.classes_, filled=True, rounded=True,proportion=True, special_characters=True))
SVG(graph.pipe(format='svg'))

In [None]:
# Print the tree in a simplified, text-only version.
# export_text is imported from the public `sklearn.tree` namespace; the
# `sklearn.tree.export` module path is no longer available in current releases.
from sklearn.tree import export_text
r = export_text(clf, feature_names=X.columns.tolist())
print(r)

In [None]:
# Learning curve of the decision tree (10-fold cross-validation on the
# training split).
lc=learning_curve(clf,X_train,y_train,cv=10,n_jobs=-1)
size=lc[0]
# lc[1] / lc[2] hold one row per training size and one column per fold;
# averaging over the folds works for any number of training sizes.
train_score=lc[1].mean(axis=1)
test_score=lc[2].mean(axis=1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=size, y=train_score,mode='lines+markers',name='train_score'))
fig.add_trace(go.Scatter(x=size, y=test_score,mode='lines+markers',name='test_score'))
fig.update_layout(
    title='Learning curve: Decision tree',
    xaxis_title='Training set size',
    yaxis_title='Mean accuracy over folds'
)
fig.show()

In [None]:
# Predicted class codes of the decision tree for the test split.
tr_pred=clf.predict(X_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# NOTE: the class labels are integer codes, so the two error figures below
# measure the distance between codes rather than a true error magnitude.

# Mean squared difference between true and predicted class codes.
print("Mean squared error: %.2f" % mean_squared_error(y_test, tr_pred))

# Mean absolute difference between true and predicted class codes.
print("Average absolute error: %.2f" % mean_absolute_error(y_test, tr_pred))

# score() of a classifier is the mean accuracy (1 is perfect prediction),
# not an explained-variance score, so it is labelled accordingly.
print('Accuracy: %.2f' % clf.score(X_test, y_test))

### Neural network

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers of 10 units and a fixed seed.
mlp = MLPClassifier(
    random_state=42,
    max_iter=1000,
    hidden_layer_sizes=(10, 10, 10),
)
# Train on the training split; the target is passed as a flat array.
mlp.fit(X_train, y_train.values.ravel())

In [None]:
# Learning curve of the neural network (10-fold cross-validation on the
# training split).
lc=learning_curve(mlp,X_train,y_train,cv=10,n_jobs=-1)
size=lc[0]
# lc[1] / lc[2] hold one row per training size and one column per fold;
# averaging over the folds works for any number of training sizes.
train_score=lc[1].mean(axis=1)
test_score=lc[2].mean(axis=1)
fig = go.Figure()
fig.add_trace(go.Scatter(x=size, y=train_score,mode='lines+markers',name='train_score'))
fig.add_trace(go.Scatter(x=size, y=test_score,mode='lines+markers',name='test_score'))
fig.update_layout(
    title='Learning curve: Neural network',
    xaxis_title='Training set size',
    yaxis_title='Mean accuracy over folds'
)
fig.show()

In [None]:
# Predicted class codes of the neural network for the test split.
mlp_pred = mlp.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the confusion matrix, then the per-class precision/recall report.
for summarize in (confusion_matrix, classification_report):
    print(summarize(y_test, mlp_pred))

In [None]:
# Plot confusion matrix.
matrix = confusion_matrix(y_test,mlp_pred)
# fmt='d' prints the cell counts as plain integers; the default annotation
# format switches to scientific notation (e.g. 3.6e+02) for larger counts.
sns.heatmap(matrix,annot=True,fmt='d',cbar=False,cmap='rainbow',linewidth=0.5)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix');

In [None]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

# NOTE: the class labels are integer codes, so the two error figures below
# measure the distance between codes rather than a true error magnitude.

# Mean squared difference between true and predicted class codes.
print("Mean squared error: %.2f" % mean_squared_error(y_test, mlp_pred))

# Mean absolute difference between true and predicted class codes.
print("Average absolute error: %.2f" % mean_absolute_error(y_test, mlp_pred))

# score() of a classifier is the mean accuracy (1 is perfect prediction),
# not an explained-variance score, so it is labelled accordingly.
print('Accuracy: %.2f' % mlp.score(X_test, y_test))

In [None]:
# Plotting errors: grouped bars comparing both error metrics across the three models.
errors=['mean_squared_error', 'mean_absolute_error']

fig = go.Figure(data=[
    go.Bar(name='Logistic Regression', x=errors, y=[mean_squared_error(y_test, log_pred), mean_absolute_error(y_test, log_pred)]),
    go.Bar(name='Decision tree', x=errors, y=[mean_squared_error(y_test, tr_pred), mean_absolute_error(y_test, tr_pred)]),
    go.Bar(name='Neural network', x=errors, y=[mean_squared_error(y_test, mlp_pred), mean_absolute_error(y_test, mlp_pred)])
])

fig.update_layout(
    title='Errors for each model',
    xaxis_tickfont_size=14,
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated and not accepted by current plotly releases.
    yaxis=dict(title=dict(text='Error',font=dict(size=16)),tickfont=dict(size=14)),
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()

In [None]:
# Grouped bars comparing the test accuracy of the three models.
accuracy=['Accuracy']
fig = go.Figure(data=[
    # All bars share the single 'Accuracy' category so they form one labelled group.
    go.Bar(name='Logistic Regression', x=accuracy, y=[logreg.score(X_test, y_test)]),
    go.Bar(name='Decision tree', x=accuracy, y=[clf.score(X_test, y_test)]),
    go.Bar(name='Neural network', x=accuracy, y=[mlp.score(X_test, y_test)])
])

fig.update_layout(
    title='Accuracy for each model',
    xaxis_tickfont_size=14,
    # The axis title font is set through title.font; the older `titlefont`
    # attribute is deprecated and not accepted by current plotly releases.
    yaxis=dict(title=dict(text='Accuracy',font=dict(size=16)),tickfont=dict(size=14)),
    barmode='group',
    bargap=0.15, # gap between bars of adjacent location coordinates.
    bargroupgap=0.1 # gap between bars of the same location coordinate.
)
fig.show()