# importing libraries

In [14]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
import pickle
import os.path

# Reading Dataset

In [2]:
df = pd.read_csv('cleaned_dataset.csv')
print(df.shape)

(1000, 14)


# Showing describe

In [3]:
df.describe()

Unnamed: 0,Air Pollution,Alcohol use,Dust Allergy,OccuPational Hazards,Genetic Risk,chronic Lung Disease,Balanced Diet,Obesity,Smoking,Passive Smoker,Chest Pain,Coughing of Blood,Fatigue,Level
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,3.84,4.563,5.165,4.84,4.58,4.38,4.491,4.465,3.948,4.195,4.438,4.859,3.856,2.062
std,2.0304,2.620477,1.980833,2.107805,2.126999,1.848518,2.135528,2.124921,2.495902,2.311778,2.280209,2.427965,2.244616,0.815365
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2.0,2.0,4.0,3.0,2.0,3.0,2.0,3.0,2.0,2.0,2.0,3.0,2.0,1.0
50%,3.0,5.0,6.0,5.0,5.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,3.0,2.0
75%,6.0,7.0,7.0,7.0,7.0,6.0,7.0,7.0,7.0,7.0,7.0,7.0,5.0,3.0
max,8.0,8.0,8.0,8.0,7.0,7.0,7.0,7.0,8.0,8.0,9.0,9.0,9.0,3.0


# Investigating the greatest impact on classification

In [5]:
correlations = df.corrwith(df['Level'])
sorted_columns = correlations.sort_values(ascending=False)
print(sorted_columns)

Level                   1.000000
Obesity                 0.827435
Coughing of Blood       0.782092
Alcohol use             0.718710
Dust Allergy            0.713839
Balanced Diet           0.706273
Passive Smoker          0.703594
Genetic Risk            0.701303
OccuPational Hazards    0.673255
Chest Pain              0.645461
Air Pollution           0.636038
Fatigue                 0.625114
chronic Lung Disease    0.609971
Smoking                 0.519530
dtype: float64


# Separating the x's and y

In [6]:
x = df.iloc[: , :-1].values
y = df["Level"].values
print(x.shape)
print(y.shape)

(1000, 13)
(1000,)


# Data scaling

In [7]:
scaler = preprocessing.StandardScaler().fit(x)
x = scaler.transform(x.astype(float))
print(x)

[[-0.90667901 -0.21495391 -0.08333998 ... -1.069735   -0.35397126
  -0.38154776]
 [-0.41391868 -1.36035665 -0.08333998 ... -1.069735   -0.7660449
  -1.27301449]
 [ 0.07884165  0.166847    0.42175079 ... -0.19218373  1.29432331
   1.84711907]
 ...
 [ 0.07884165  0.166847    0.42175079 ... -0.19218373  1.29432331
   1.84711907]
 [ 1.06436231  1.31224973  0.92684156 ...  1.12414318  1.70639695
  -0.38154776]
 [ 1.06436231  0.166847    0.42175079 ... -0.19218373  1.29432331
   1.84711907]]


# Spliting the train and test data

In [8]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=4)
print ('Train set:', x_train.shape,  y_train.shape)
print ('Test set:', x_test.shape,  y_test.shape)

Train set: (800, 13) (800,)
Test set: (200, 13) (200,)


# Building the model

In [9]:
k = 4
#Train Model and Predict
model = KNeighborsClassifier(n_neighbors = k).fit(x_train,y_train)
model

# Check the model accuracy

In [10]:
print("Train set Accuracy: ", metrics.accuracy_score(y_train , model.predict(x_train)))
print("Test set Accuracy: " , metrics.accuracy_score(y_test  , model.predict(x_test)))

Train set Accuracy:  1.0
Test set Accuracy:  1.0


# Saving the model

In [15]:
path = './knnmodel_file'

check_file = os.path.isfile(path)
if check_file == False:
    
    # Its important to use binary mode
    knnPickle = open('knnmodel_file', 'wb')

    # source, destination
    pickle.dump(model, knnPickle)

    # close the file
    knnPickle.close()
else:
    print("model already exist.")

model already exist.


# Testing model

In [11]:
nx = [[ 4,5,6,5,5,4,6,7,2,3,4,8,8]]
nxs= scaler.transform(nx)
model.predict(nxs)

array([3], dtype=int64)

In [13]:
# load the model from disk
loaded_model = pickle.load(open('knnmodel_file', 'rb'))
result = loaded_model.predict(x_test)
print(result)

[2 3 3 3 2 1 2 3 1 1 2 1 3 2 1 3 3 2 2 3 1 3 1 3 3 1 2 1 3 1 1 2 1 2 3 3 2
 2 1 3 3 1 1 1 2 3 3 1 3 1 3 2 2 2 1 2 2 1 2 3 3 3 3 3 3 2 1 2 3 3 3 2 2 1
 1 2 1 2 3 3 3 1 1 1 3 1 3 2 3 2 3 2 1 2 2 3 2 2 1 1 3 2 2 3 3 2 3 3 1 2 1
 2 2 2 3 2 3 1 3 2 1 1 1 3 3 3 1 3 2 1 3 3 1 1 2 2 3 3 2 2 3 1 3 3 3 2 1 3
 2 3 3 3 3 2 1 3 2 2 1 2 2 2 2 1 3 1 2 2 1 1 1 3 3 1 2 3 2 2 3 3 2 2 3 1 2
 1 1 3 3 2 2 3 3 2 3 2 1 2 3 3]
