In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn import tree
sklearn.__version__
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='dataset.csv')

In [3]:
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,Age,BMI,Carbohydrate,Protein,Fat,Water Consumption,Foot Steps,Sleep Hours,Diabetes Pedigree Function,Output
0,50,33.6,48,53,61,3,11992,8,0.627,1
1,31,26.6,61,55,71,2,8241,8,0.351,0
2,32,23.3,62,56,64,2,10276,7,0.672,1
3,21,28.1,52,53,75,2,7886,8,0.167,0
4,33,43.1,63,48,57,3,12336,8,2.288,1


In [4]:
# Independent (feature) and dependent (target) variables.
# Columns are selected by label rather than by position so that a stray
# index column or a reordered CSV raises a KeyError (or is ignored) instead
# of silently shifting which column is used as the label.
# The feature order below is the order a hand-built sample must follow when
# it is passed to the fitted scaler and model.
FEATURE_COLS = [
    'Age',
    'BMI',
    'Carbohydrate',
    'Protein',
    'Fat',
    'Water Consumption',
    'Foot Steps',
    'Sleep Hours',
    'Diabetes Pedigree Function',
]
TARGET_COL = 'Output'

X = dataset[FEATURE_COLS].values
y = dataset[TARGET_COL].values
dataset.shape

(757, 10)

In [5]:
# Count missing values per column.
dataset.isna().sum()

Age                           0
BMI                           0
Carbohydrate                  0
Protein                       0
Fat                           0
Water Consumption             0
Foot Steps                    0
Sleep Hours                   0
Diabetes Pedigree Function    0
Output                        0
dtype: int64

In [6]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [7]:
# Feature scaling.
# StandardScaler is imported in the first (imports) cell.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into training.
# CAUTION: X_train and X_test are overwritten with their scaled versions, so
# re-running this cell on its own would re-fit sc_X on already-scaled data.
# Re-run the train/test split cell first if this cell must be executed again.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


In [8]:
# Fit a logistic regression model on the training split.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare name on the last line displays the fitted estimator.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier

LogisticRegression(random_state=0)

In [9]:
# Predict class labels for the test split with the logistic regression model.
y_pred = classifier.predict(X=X_test)
y_pred

array([0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0])

In [10]:
# Confusion matrix for logistic regression
# (rows: true class, columns: predicted class).
# confusion_matrix is already imported in the first cell, so it is not
# re-imported here.
cm = confusion_matrix(y_test, y_pred)
cm

array([[104,  19],
       [ 39,  28]])

In [11]:
# Classification report for logistic regression: per-class precision, recall
# and F1, together with the overall accuracy.
accuracy_report = classification_report(y_true=y_test, y_pred=y_pred)
print(accuracy_report)

              precision    recall  f1-score   support

           0       0.73      0.85      0.78       123
           1       0.60      0.42      0.49        67

    accuracy                           0.69       190
   macro avg       0.66      0.63      0.64       190
weighted avg       0.68      0.69      0.68       190



In [12]:
# Fit a decision tree on the training split.
# fit() returns the estimator, so the bare name below displays the same object.
dec_classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
dec_classifier

DecisionTreeClassifier(random_state=0)

In [13]:
# Predict class labels for the test split with the decision tree.
dec_classifier_predict = dec_classifier.predict(X=X_test)
dec_classifier_predict

array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0])

In [14]:
# Confusion matrix for the decision tree model.
dec_classifier_cm = confusion_matrix(
    y_true=y_test,
    y_pred=dec_classifier_predict,
)
dec_classifier_cm

array([[90, 33],
       [39, 28]])

In [15]:
# Classification report for the decision tree model.
dec_classifier_report = classification_report(
    y_true=y_test,
    y_pred=dec_classifier_predict,
)
print(dec_classifier_report)

              precision    recall  f1-score   support

           0       0.70      0.73      0.71       123
           1       0.46      0.42      0.44        67

    accuracy                           0.62       190
   macro avg       0.58      0.57      0.58       190
weighted avg       0.61      0.62      0.62       190



In [16]:
# Fit a random forest of 100 trees, using entropy as the split criterion.
# fit() returns the estimator, so the bare name below displays the same object.
ran_classifier = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
).fit(X_train, y_train)
ran_classifier

RandomForestClassifier(criterion='entropy', random_state=0)

In [17]:
# Predict class labels for the test split with the random forest.
ran_classifier_predict = ran_classifier.predict(X=X_test)

In [18]:
# Confusion matrix for the random forest model.
ran_classifier_cm = confusion_matrix(
    y_true=y_test,
    y_pred=ran_classifier_predict,
)
ran_classifier_cm

array([[102,  21],
       [ 40,  27]])

In [19]:
# Classification report for the random forest model.
ran_classifier_report = classification_report(
    y_true=y_test,
    y_pred=ran_classifier_predict,
)
print(ran_classifier_report)

              precision    recall  f1-score   support

           0       0.72      0.83      0.77       123
           1       0.56      0.40      0.47        67

    accuracy                           0.68       190
   macro avg       0.64      0.62      0.62       190
weighted avg       0.66      0.68      0.66       190



In [20]:
# K-fold cross-validation of the random forest on the training split.
# cross_val_score is imported in the first (imports) cell.
# scoring is stated explicitly so that the values stored in `accuracy` are
# unambiguously per-fold accuracy scores.
# NOTE(review): cv=50 on the training split leaves only about 11 samples in
# each validation fold, so individual fold scores are very noisy; a smaller
# fold count such as 10 is worth considering. The value is kept at 50 here so
# the recorded mean stays comparable.
accuracy = cross_val_score(
    estimator=ran_classifier,
    X=X_train,
    y=y_train,
    cv=50,
    scoring='accuracy',
)

In [21]:
# Mean score across the cross-validation folds.
np.mean(accuracy)

0.7001515151515152

In [22]:
# ---------------------------- Predicting new values ---------------------------- #
# Sample to classify, in the same column order as the training features:
#   Age: 51 | BMI: 35 | Carbohydrate: 40 | Protein: 48 | Fat: 50 |
#   Water consumption (liters): 4 | Foot steps: 8400 | Sleep hours: 7 |
#   Diabetes Pedigree Function: 0.190
new_sample = np.array([[51, 35, 40, 48, 50, 4, 8400, 7, 0.190]])
# Apply the scaler fitted on the training split before predicting with the
# logistic regression model.
new_sample_scaled = sc_X.transform(new_sample)
new_prediction = classifier.predict(new_sample_scaled)
new_prediction

array([1])