# Data Pre-processing

#### reading data in the inputDataFrame
The car dataset is read from the given file path into a pandas DataFrame.
Here, any '?' entries in the file are treated as missing values (NaN),
and the column names are supplied explicitly as a header name list.

In [2]:
import pandas as pd

# Location of the car-evaluation data
inputFilePath = 'https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data'

# '?' entries are parsed as missing values (NaN), whitespace following a
# delimiter is skipped, and the column names are supplied explicitly.
inputDataFrame = pd.read_csv(
    inputFilePath,
    names=['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class'],
    na_values='?',
    skipinitialspace=True,
)

In [3]:
# Preview the first five rows of the loaded data
inputDataFrame.head(n=5)


Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


#### removing null values

In [4]:
# Discard every row that contains at least one missing value
nullRemovedDataFrame = inputDataFrame.dropna(axis=0, how='any')

#### convert categorical data into numerical data

In [5]:
# Encode every object-typed (string) column as integer category codes.
# The encoding is written to a copy, so nullRemovedDataFrame keeps its original
# labels and re-running this cell always starts from the same input.
numericDataFrame = nullRemovedDataFrame.copy()

#creating a data frame which contains the object type values
objDataFrame = numericDataFrame.select_dtypes(include=['object']).copy()

#for all columns converting object type to category type and assigning
#the appropriate code
for headerName in objDataFrame.columns:
    # NOTE: the codes follow the sorted order of the labels (e.g. high=0,
    # low=1, med=2, vhigh=3), not any domain ordering of the levels.
    objDataFrame[headerName] = objDataFrame[headerName].astype("category").cat.codes
    #writing objDataFrame column to its respective numericDataFrame column
    numericDataFrame[headerName] = objDataFrame[headerName]
#for headerName -ends


In [6]:
# Preview the first five rows after the categorical encoding
numericDataFrame.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2


In [7]:
# Target: the encoded 'class' column, kept as a single-column DataFrame
y = numericDataFrame.loc[:, ['class']]

In [8]:
# Preview the first five target values
y.head(n=5)

Unnamed: 0,class
0,2
1,2
2,2
3,2
4,2


#### Scale the data:      
scaling data using MinMaxScaler

In [9]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

#finding shape of dataframe
nRows, nCols = numericDataFrame.shape

# Min-max scale every column to the [0, 1] range.
# The result is built as a NEW frame instead of being written back into
# numericDataFrame, so the encoded (unscaled) values stay available and
# re-running earlier cells (e.g. the one extracting y) is not affected.
# NOTE: the 'class' column is scaled here as well; the target y used for
# training is taken from numericDataFrame, not from this frame.
scaler = MinMaxScaler()
scaledDataFrame = pd.DataFrame(
    scaler.fit_transform(numericDataFrame),
    columns=numericDataFrame.columns,
    index=numericDataFrame.index,
)

In [10]:
# Feature matrix: the six scaled attribute columns (everything except 'class')
x = scaledDataFrame.loc[:, ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']]
x.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,1.0,1.0,0.0,0.0,1.0,0.5
1,1.0,1.0,0.0,0.0,1.0,1.0
2,1.0,1.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,0.0,0.5,0.5
4,1.0,1.0,0.0,0.0,0.5,1.0


In [11]:
# First five rows of the scaled feature matrix
x.head(n=5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,1.0,1.0,0.0,0.0,1.0,0.5
1,1.0,1.0,0.0,0.0,1.0,1.0
2,1.0,1.0,0.0,0.0,1.0,0.0
3,1.0,1.0,0.0,0.0,0.5,0.5
4,1.0,1.0,0.0,0.0,0.5,1.0


In [12]:
# First five rows of the (unscaled) target
y.head(n=5)

Unnamed: 0,class
0,2
1,2
2,2
3,2
4,2


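Splitting the data into training and testing sets. For now all of the data is kept in the training set (the test set is empty), and the models below are fitted and evaluated on that same data while the hyperparameters are tuned, so every score reported below is a training-set score.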

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# All samples are kept for training and the test set is left empty.
# train_test_split(..., test_size=0) is rejected with a ValueError by current
# scikit-learn releases, so the same "shuffle everything into train" result is
# produced explicitly instead.
# NOTE(review): the row order is expected to match the old
# train_test_split(test_size=0, random_state=10) output - confirm if the exact
# order matters.
x_train, y_train = shuffle(x, y, random_state=10)

# Empty frames with the same columns stand in for the (unused) test split
x_test, y_test = x.iloc[:0], y.iloc[:0]

## Decision Tree

In [14]:
from sklearn.tree import DecisionTreeClassifier

Experimenting with the hyperparameters of the decision tree

In [15]:
# Entropy-based tree in which every leaf must hold at least 10 samples
clf_decisionTree = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_leaf=10,
    random_state=100,
)

In [16]:
# Fit the tree on the training data
clf_decisionTree.fit(X=x_train, y=y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=100,
            splitter='best')

In [17]:
# Predict on the same samples the tree was trained on
predict_decisionTree = clf_decisionTree.predict(X=x_train)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix

# Training-set confusion matrix and per-class metrics for the decision tree
print(confusion_matrix(y_true=y_train, y_pred=predict_decisionTree))
print(classification_report(y_true=y_train, y_pred=predict_decisionTree))

[[ 354   12   15    3]
 [   6   60    3    0]
 [  24    0 1186    0]
 [   0    4    0   61]]
             precision    recall  f1-score   support

          0       0.92      0.92      0.92       384
          1       0.79      0.87      0.83        69
          2       0.99      0.98      0.98      1210
          3       0.95      0.94      0.95        65

avg / total       0.96      0.96      0.96      1728



## Neural Network:

In [19]:
import copy

# Independent copies of the training data for the neural network
x_neural_net_train = x_train.copy(deep=True)
y_neural_net_train = y_train.copy(deep=True)


In [20]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with three hidden layers (7, 12 and 10 units),
# trained with the L-BFGS solver
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(7,12,10), random_state=10, max_iter=1000)

# Flatten the single-column target to 1-D. np.ravel derives the length from the
# data instead of hard-coding 1728 rows, and it accepts both a DataFrame and an
# already-flattened array, so this cell can safely be run more than once.
y_neural_net_train = np.ravel(y_neural_net_train)
clf.fit(x_neural_net_train,y_neural_net_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(7, 12, 10), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=10, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [21]:
from sklearn.metrics import classification_report,confusion_matrix

# Score the network on the same samples it was trained on
predictions = clf.predict(X=x_neural_net_train)
print(confusion_matrix(y_true=y_neural_net_train, y_pred=predictions))
print(classification_report(y_true=y_neural_net_train, y_pred=predictions))

[[ 376    2    6    0]
 [   1   68    0    0]
 [   8    0 1202    0]
 [   0    2    0   63]]
             precision    recall  f1-score   support

          0       0.98      0.98      0.98       384
          1       0.94      0.99      0.96        69
          2       1.00      0.99      0.99      1210
          3       1.00      0.97      0.98        65

avg / total       0.99      0.99      0.99      1728



## Support Vector Machine

In [22]:
import copy

# Independent copies of the training data for the SVM
x_svm_train=x_train.copy()
y_svm_train=y_train.copy()
# Flatten the single-column target to 1-D; the length is taken from the data
# rather than hard-coded to 1728 rows
y_svm_train=y_svm_train.values.ravel()


In [23]:
from sklearn.svm import SVC

# RBF-kernel SVM with C=20.
# gamma is pinned to 'auto' (1 / n_features), the value shown in the recorded
# estimator repr below; newer scikit-learn releases default to 'scale', which
# would silently change the fitted model.
clf = SVC(kernel='rbf', random_state=20, C=20, gamma='auto')
clf.fit(x_svm_train, y_svm_train)

SVC(C=20, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=20, shrinking=True,
  tol=0.001, verbose=False)

In [24]:
from sklearn.metrics import classification_report,confusion_matrix

# Score the SVM on the same samples it was trained on
predictions = clf.predict(X=x_svm_train)
print(confusion_matrix(y_true=y_svm_train, y_pred=predictions))
print(classification_report(y_true=y_svm_train, y_pred=predictions))

[[ 357   13   11    3]
 [  15   51    0    3]
 [  24    0 1186    0]
 [   3    0    0   62]]
             precision    recall  f1-score   support

          0       0.89      0.93      0.91       384
          1       0.80      0.74      0.77        69
          2       0.99      0.98      0.99      1210
          3       0.91      0.95      0.93        65

avg / total       0.96      0.96      0.96      1728



## Multinomial Naive Bayes

In [55]:
# Independent copy of the features for the naive Bayes model
mlNb_x_train = x_train.copy()

# Flatten the single-column target to a 1-D array of length `row`
row, col = y_train.shape
mlNb_y_train = y_train.copy().values.reshape((row,))

In [81]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with smoothing alpha=1 and no fitted class priors
mlNb_clf = MultinomialNB(
    alpha=1,
    fit_prior=False,
    class_prior=None,
)
mlNb_clf.fit(X=mlNb_x_train, y=mlNb_y_train)

MultinomialNB(alpha=1, class_prior=None, fit_prior=False)

In [87]:
from sklearn.metrics import classification_report,confusion_matrix

# Training-set accuracy, confusion matrix and per-class metrics for naive Bayes
predictions = mlNb_clf.predict(X=mlNb_x_train)
accuracy = mlNb_clf.score(X=mlNb_x_train, y=mlNb_y_train)
print("accuracy = {} %\n".format(accuracy*100))
print(confusion_matrix(y_true=mlNb_y_train, y_pred=predictions))
print(classification_report(y_true=mlNb_y_train, y_pred=predictions))

accuracy = 46.585648148148145 %

[[ 23 130  70 161]
 [  4  35  12  18]
 [169 226 682 133]
 [  0   0   0  65]]
             precision    recall  f1-score   support

          0       0.12      0.06      0.08       384
          1       0.09      0.51      0.15        69
          2       0.89      0.56      0.69      1210
          3       0.17      1.00      0.29        65

avg / total       0.66      0.47      0.52      1728



## Logistic Regression 

In [122]:
# Independent copy of the features for logistic regression
lr_x_train = x_train.copy()

# Flatten the single-column target to a 1-D array of length `row`
row, col = y_train.shape
lr_y_train = y_train.copy().values.reshape((row,))

In [207]:
from sklearn.linear_model import LogisticRegression

# L2-regularised logistic regression (C=0.01) without an intercept term,
# solved with L-BFGS
lr_clf = LogisticRegression(
    penalty='l2',
    C=1e-2,
    fit_intercept=False,
    solver='lbfgs',
    warm_start=True,
    random_state=None,
)
lr_clf.fit(X=lr_x_train, y=lr_y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='lbfgs', tol=0.0001,
          verbose=0, warm_start=True)

In [208]:
from sklearn.metrics import confusion_matrix, classification_report

# Training-set accuracy, confusion matrix and per-class metrics for
# logistic regression
predictions = lr_clf.predict(X=lr_x_train)
accuracy = lr_clf.score(X=lr_x_train, y=lr_y_train)
print("accuracy = {} %\n".format(accuracy*100))
print(confusion_matrix(y_true=lr_y_train, y_pred=predictions))
print(classification_report(y_true=lr_y_train, y_pred=predictions))

accuracy = 70.89120370370371 %

[[  21    0  363    0]
 [   0    0   69    0]
 [   6    0 1204    0]
 [   7    0   58    0]]
             precision    recall  f1-score   support

          0       0.62      0.05      0.10       384
          1       0.00      0.00      0.00        69
          2       0.71      1.00      0.83      1210
          3       0.00      0.00      0.00        65

avg / total       0.63      0.71      0.60      1728



  'precision', 'predicted', average, warn_for)
