#                                                      KNN Model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
dia = pd.read_csv('diabetes.csv')

In [3]:
dia.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [4]:
dia.shape

(2000, 9)

In [5]:
dia.info() # No Null Values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
Pregnancies                 2000 non-null int64
Glucose                     2000 non-null int64
BloodPressure               2000 non-null int64
SkinThickness               2000 non-null int64
Insulin                     2000 non-null int64
BMI                         2000 non-null float64
DiabetesPedigreeFunction    2000 non-null float64
Age                         2000 non-null int64
Outcome                     2000 non-null int64
dtypes: float64(2), int64(7)
memory usage: 140.7 KB


In [6]:
dia.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
dia.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0
5,0,173,78,32,265,46.5,1.159,58,0
6,4,99,72,17,0,25.6,0.294,28,0
7,8,194,80,0,0,26.1,0.551,67,0
8,2,83,65,28,66,36.8,0.629,24,0
9,2,89,90,30,0,33.5,0.292,42,0


### Values of columns like 'Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin' cannot be accepted as ZERO as it will affect the outcome.
### We can replace such values with the mean of that column bu first calculating the mean of individual column without Zero or by making it NAN in place of zero and then put mean value there.

## Replace Zero

In [8]:
zero_not_accepted = ['Glucose','BMI', 'Insulin', 'BloodPressure', 'SkinThickness']

In [9]:
zero_not_accepted

['Glucose', 'BMI', 'Insulin', 'BloodPressure', 'SkinThickness']

In [10]:
for column in zero_not_accepted:
    dia[column]= dia[column].replace(0, np.NaN)
    mean = int(dia[column].mean(skipna=True))
    dia[column] = dia[column].replace(np.NaN, mean)

In [11]:
X = dia.iloc[:,0:8]
y = dia.iloc[:,8]

In [12]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,2,138.0,62.0,35.0,153.0,33.6,0.127,47
1,0,84.0,82.0,31.0,125.0,38.2,0.233,23
2,0,145.0,72.0,29.0,153.0,44.2,0.63,31
3,0,135.0,68.0,42.0,250.0,42.3,0.365,24
4,1,139.0,62.0,41.0,480.0,40.7,0.536,21


In [13]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=100)

### Scaling is needed as our data is fluctuating. Eg. In some columns the value are less and in some of the columns values are high in numbers

In [14]:
sc_X= StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  app.launch_new_instance()


### Using below code we can take the value of K but in our dataset we have big data of 2000 rows so it is showing K=19 so for now we are taking k=19 but for data having more than 500 rows we don't use KNN for classification.
### P=2 means two classes

In [15]:
import math
math.sqrt(len(y_test))

20.0

In [16]:
model = KNeighborsClassifier(n_neighbors=19, p=2, metric= 'euclidean')

In [17]:
model.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=None, n_neighbors=19, p=2,
           weights='uniform')

In [18]:
y_pred = model.predict(X_test)

In [19]:
confusion_matrix(y_pred, y_test)

array([[253,  34],
       [ 44,  69]])

In [20]:
f1_score(y_test, y_pred)

0.638888888888889

In [21]:
accuracy_score(y_test, y_pred)

0.805

## Using Logistic regression

In [22]:
from sklearn.linear_model import LogisticRegression
model1 = LogisticRegression()
model1.fit(X_train,y_train)
y_pred1=model1.predict(X_test)
accuracy_score(y_pred1, y_test)



0.785