# Predicting Diabetes

In [2]:
from pathlib import Path
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [4]:
data = Path('../Resources/diabetes.csv')
df = pd.read_csv(data)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Separate the Features (X) from the Target (y)

In [6]:
X = df.drop('Outcome', 1)
y = df['Outcome']

## Split our data into training and testing

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(576, 8)

## Create a Logistic Regression Model

In [24]:
from sklearn.linear_model import LogisticRegression
db_classifier = LogisticRegression(solver='lbfgs', random_state=5)
db_classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=5, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Fit (train) or model using the training data

In [25]:
db_classifier.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=5, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## Score the model using the test data

In [26]:
print("Training Data Score is: " + f'{db_classifier.score(X_train, Y_train)}')
print("Testing Data Score is: " + f'{db_classifier.score(X_test, Y_test)}')

Training Data Score is: 0.7673611111111112
Testing Data Score is: 0.7395833333333334


## Make predictions

In [37]:
Result = pd.DataFrame({'Prediction': db_classifier.predict(X_test), 'Actual': Y_test})
Result[Result['Actual']!=Result['Prediction']].count()

Prediction    50
Actual        50
dtype: int64

In [38]:
Result.count()

Prediction    192
Actual        192
dtype: int64

50 cases out of 192 samples didnt have the actual and predition matching. indicating 74% model accuracy.

In [46]:
from sklearn.metrics import confusion_matrix
c_matrix=pd.DataFrame(confusion_matrix(Y_test, Result['Prediction']), columns=['Predicted_No', 'Predicted_Yes'])
c_matrix.index=['Actual_No', 'Actual_Yes']
c_matrix

Unnamed: 0,Predicted_No,Predicted_Yes
Actual_No,111,14
Actual_Yes,36,31


In [44]:
from sklearn.metrics import classification_report
target_names = ["Daibetic", "non_Daibetic"]
print(classification_report(Y_test, Result['Prediction'], target_names=target_names))

              precision    recall  f1-score   support

    Daibetic       0.76      0.89      0.82       125
non_Daibetic       0.69      0.46      0.55        67

    accuracy                           0.74       192
   macro avg       0.72      0.68      0.68       192
weighted avg       0.73      0.74      0.72       192

