In [164]:
# importing required libraries
import pandas as pd
import numpy as np

In [165]:
Train_Data = pd.read_csv("Datasets\CrashTest_TrainData.csv") # read the train dataset

In [166]:
Test_Data = pd.read_csv("Datasets\CrashTest_TestData.csv")  # read the test dataset

In [167]:
Train_Data.isnull().sum()  # 15Q) missing values in Train_Data

CarID      0
ManHI      0
ManBI      0
IntI       3
HVACi      0
Safety     0
CarType    0
dtype: int64

In [168]:
# third quartile values of the variable ManBI in Train Data
Train_Q3 = np.percentile(Train_Data.ManBI, 75)  # Q3
Train_Q3

3.4175

In [169]:
# third quartile values of the variable ManBI in Test Data
Test_Q3 = np.percentile(Test_Data.ManBI, 75)  # Q3
Test_Q3

2.5

In [170]:
# 13Q) difference between third quartile values of the variable ManBI from Train_Data and Test_Data
diff_Train_Test_Q3 = Train_Q3 - Test_Q3
diff_Train_Test_Q3

0.9175

In [171]:
Train_Data[Train_Data.isnull().any(axis=1)] # 15Q) missing values are there in Train_Data

Unnamed: 0,CarID,ManHI,ManBI,IntI,HVACi,Safety,CarType
14,15,6.81,3.66,,0.89,-2.41,Hatchback
21,22,2.03,-5.81,,-6.47,9.79,Hatchback
31,32,-0.68,-1.66,,7.02,-8.07,Hatchback


In [172]:
Train_Data2 = Train_Data.dropna()  # Drop the missing values

In [173]:
Train_Data2.shape

(77, 7)

In [174]:
Train_Data2.isnull().sum()

CarID      0
ManHI      0
ManBI      0
IntI       0
HVACi      0
Safety     0
CarType    0
dtype: int64

In [175]:
np.unique(Train_Data2['CarType']) #14Q) distinct car types are there in the Train_Data

array(['Hatchback', 'SUV'], dtype=object)

In [176]:
Train_Data2['CarType'].value_counts()

Hatchback    47
SUV          30
Name: CarType, dtype: int64

In [177]:
Train_Data2.dtypes  # Map the categorical variables into integers

CarID        int64
ManHI      float64
ManBI      float64
IntI       float64
HVACi      float64
Safety     float64
CarType     object
dtype: object

In [133]:
# Reindexing the CarType status names to 0,1
Train_Data2['CarType'] = Train_Data2['CarType'].map({'SUV' : 0, 'Hatchback':1})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [134]:
np.unique(Train_Data2['CarType'])

array([0, 1], dtype=int64)

In [135]:
# 16Q)  proportion of car types in the Test_Data
prop_cartype = pd.crosstab(index = Test_Data["CarType"],
                  columns = 'count',
                  normalize = True)
prop_cartype

col_0,count
CarType,Unnamed: 1_level_1
Hatchback,0.5
SUV,0.5


In [136]:
Test_Data['CarType'] = Test_Data['CarType'].map({'SUV' : 0, 'Hatchback':1})

In [137]:
Test_Data.dtypes  # to check all column data types

CarID        int64
ManHI      float64
ManBI      float64
IntI       float64
HVACi      float64
Safety     float64
CarType      int64
dtype: object

In [138]:
# target variable (output variable)  - CarType
# seperate the independent (input variables) and target variable on training data and test data
train_y = Train_Data2['CarType']
train_x = Train_Data2.drop(['CarType'], axis=1, inplace=True)

test_y = Test_Data['CarType']
test_x = Test_Data.drop(['CarType'], axis=1, inplace=True)

In [139]:
# remove sequence ID value ( CarID) in both train and test data . 
# By removing this will get better accuracy
train_x = Train_Data2.drop(columns = ['CarID'], axis = 1)
test_x = Test_Data.drop(columns = ['CarID'], axis = 1)

In [140]:
test_x.head()

Unnamed: 0,ManHI,ManBI,IntI,HVACi,Safety
0,1.94,2.21,3.38,1.78,-7.19
1,-0.02,-3.33,0.79,-6.63,7.99
2,-0.49,-4.48,5.0,8.33,-2.77
3,5.76,1.35,7.92,-0.43,4.29
4,2.51,-8.74,4.53,-1.91,3.95


In [141]:
train_x.dtypes

ManHI     float64
ManBI     float64
IntI      float64
HVACi     float64
Safety    float64
dtype: object

In [142]:
#importing the library of KNN
from sklearn.neighbors import KNeighborsClassifier

In [143]:
# Storing the K nearest neighbors classifier
KNN_classifier_K3 = KNeighborsClassifier(n_neighbors = 3)

In [144]:
# Fitting the values for x and y
KNN_classifier_K3.fit(train_x, train_y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [145]:
# Predicting the test values with model
prediction_k3 = KNN_classifier_K3.predict(test_x)

In [146]:
from sklearn.metrics import accuracy_score

In [147]:
# Calculating the accuracy 
accuracy_score_k3 = accuracy_score(test_y, prediction_k3)

In [148]:
# 17Q) accuracy score of the K-Nearest Neighbor model (model_1) with 3 neighbors using Train_Data and Test_Data
print(accuracy_score_k3)  

0.7


In [152]:
y_test = np.asarray(test_y)
misclassified_y_test = np.where(y_test != KNN_classifier_K3.predict(test_x))

In [153]:
# 18Q ) list of indices of misclassified samples from the ‘model_1’.
misclassified_y_test

(array([ 0,  1,  7,  8, 10, 14], dtype=int64),)

In [154]:
KNN_classifier_K2 = KNeighborsClassifier(n_neighbors = 2)
KNN_classifier_K2.fit(train_x, train_y)
prediction_k2 = KNN_classifier_K2.predict(test_x)
accuracy_score_k2 = accuracy_score(test_y, prediction_k2)
print(accuracy_score_k2) 

0.85


In [157]:
# 19Q) Compare results of the two models (model_1 & model_2).
print(accuracy_score_k2 > accuracy_score_k3) 

True


In [158]:
from sklearn.linear_model import LogisticRegression

In [159]:
logistic = LogisticRegression()

In [160]:
logistic.fit(train_x,train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [161]:
logistic.coef_
logistic.intercept_
prediction = logistic.predict(test_x)

In [162]:
accuracy_score_log = accuracy_score(test_y,prediction)

In [163]:
print(accuracy_score_log)  # 20Q Logistic Accuracy score

1.0
