# Imports

In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.feature_selection import mutual_info_classif


# Importing data

In [18]:
data= pd.read_csv('data.csv')
data.head()

Unnamed: 0,patient_number,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,1,193,77,49,3.9,19,female,61,119,22.5,118,70,32,38,0.84,No diabetes
1,2,146,79,41,3.6,19,female,60,135,26.4,108,58,33,40,0.83,No diabetes
2,3,217,75,54,4.0,20,female,67,187,29.3,110,72,40,45,0.89,No diabetes
3,4,226,97,70,3.2,20,female,64,114,19.6,122,64,31,39,0.79,No diabetes
4,5,164,91,67,2.4,20,female,70,141,20.2,122,86,32,39,0.82,No diabetes


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 390 entries, 0 to 389
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   patient_number   390 non-null    int64  
 1   cholesterol      390 non-null    int64  
 2   glucose          390 non-null    int64  
 3   hdl_chol         390 non-null    int64  
 4   chol_hdl_ratio   390 non-null    float64
 5   age              390 non-null    int64  
 6   gender           390 non-null    object 
 7   height           390 non-null    int64  
 8   weight           390 non-null    int64  
 9   bmi              390 non-null    float64
 10  systolic_bp      390 non-null    int64  
 11  diastolic_bp     390 non-null    int64  
 12  waist            390 non-null    int64  
 13  hip              390 non-null    int64  
 14  waist_hip_ratio  390 non-null    float64
 15  diabetes         390 non-null    object 
dtypes: float64(3), int64(11), object(2)
memory usage: 48.9+ KB


# Remove unnecessary columns

In [20]:
data.drop("patient_number", inplace=True ,axis=1)
data

Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,193,77,49,3.9,19,female,61,119,22.5,118,70,32,38,0.84,No diabetes
1,146,79,41,3.6,19,female,60,135,26.4,108,58,33,40,0.83,No diabetes
2,217,75,54,4.0,20,female,67,187,29.3,110,72,40,45,0.89,No diabetes
3,226,97,70,3.2,20,female,64,114,19.6,122,64,31,39,0.79,No diabetes
4,164,91,67,2.4,20,female,70,141,20.2,122,86,32,39,0.82,No diabetes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,227,105,44,5.2,83,female,59,125,25.2,150,90,35,40,0.88,No diabetes
386,226,279,52,4.3,84,female,60,192,37.5,144,88,41,48,0.85,Diabetes
387,301,90,118,2.6,89,female,61,115,21.7,218,90,31,41,0.76,No diabetes
388,232,184,114,2.0,91,female,61,127,24.0,170,82,35,38,0.92,Diabetes


# checking for duplicate values

In [21]:
data.duplicated().value_counts()

False    390
dtype: int64

### no duplicates

# Encoding categorical data

In [22]:
label_encoder= LabelEncoder()
data['gender']= label_encoder.fit_transform(data['gender'])
data['diabetes']= label_encoder.fit_transform(data['diabetes'])
data

Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio,diabetes
0,193,77,49,3.9,19,0,61,119,22.5,118,70,32,38,0.84,1
1,146,79,41,3.6,19,0,60,135,26.4,108,58,33,40,0.83,1
2,217,75,54,4.0,20,0,67,187,29.3,110,72,40,45,0.89,1
3,226,97,70,3.2,20,0,64,114,19.6,122,64,31,39,0.79,1
4,164,91,67,2.4,20,0,70,141,20.2,122,86,32,39,0.82,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,227,105,44,5.2,83,0,59,125,25.2,150,90,35,40,0.88,1
386,226,279,52,4.3,84,0,60,192,37.5,144,88,41,48,0.85,0
387,301,90,118,2.6,89,0,61,115,21.7,218,90,31,41,0.76,1
388,232,184,114,2.0,91,0,61,127,24.0,170,82,35,38,0.92,0


# Extract features and target

In [23]:
x= data.iloc[:,:-1]
y= data.iloc[:,-1]

In [24]:
x

Unnamed: 0,cholesterol,glucose,hdl_chol,chol_hdl_ratio,age,gender,height,weight,bmi,systolic_bp,diastolic_bp,waist,hip,waist_hip_ratio
0,193,77,49,3.9,19,0,61,119,22.5,118,70,32,38,0.84
1,146,79,41,3.6,19,0,60,135,26.4,108,58,33,40,0.83
2,217,75,54,4.0,20,0,67,187,29.3,110,72,40,45,0.89
3,226,97,70,3.2,20,0,64,114,19.6,122,64,31,39,0.79
4,164,91,67,2.4,20,0,70,141,20.2,122,86,32,39,0.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
385,227,105,44,5.2,83,0,59,125,25.2,150,90,35,40,0.88
386,226,279,52,4.3,84,0,60,192,37.5,144,88,41,48,0.85
387,301,90,118,2.6,89,0,61,115,21.7,218,90,31,41,0.76
388,232,184,114,2.0,91,0,61,127,24.0,170,82,35,38,0.92


# Checking MI score

In [25]:
discrete_features = x.dtypes == int
mi_scores = mutual_info_classif(x, y, discrete_features=discrete_features)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores

glucose            0.216703
waist_hip_ratio    0.041737
hip                0.039357
chol_hdl_ratio     0.035197
age                0.032765
diastolic_bp       0.021993
bmi                0.021773
cholesterol        0.016798
weight             0.008092
gender             0.000120
hdl_chol           0.000000
height             0.000000
systolic_bp        0.000000
waist              0.000000
Name: MI Scores, dtype: float64

# Splitting the dataset into train and test

In [26]:
x_train, x_test, y_train, y_test= train_test_split(x, y, test_size= 0.15, random_state=0)

# Creating KNN Classifier

In [27]:
classifier= KNeighborsClassifier(n_neighbors=5 , metric='minkowski', p=2 )
classifier.fit(x_train, y_train) 

In [28]:
y_pred= classifier.predict(x_test)

In [31]:
cr = classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.78      0.70      0.74        10
           1       0.94      0.96      0.95        49

    accuracy                           0.92        59
   macro avg       0.86      0.83      0.84        59
weighted avg       0.91      0.92      0.91        59



# pickling the model

In [32]:
import pickle
pickle_out = open("classifier.pkl", "wb")
pickle.dump(classifier, pickle_out)
pickle_out.close()