# Diabetes Prediction Using Logistic Regression

This notebook builds a model to predict diabetes using logistic regression.

### First, import the relevant libraries and load the data


In [3]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score as precision
from sklearn.metrics import recall_score as recall
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
import matplotlib.pyplot as plt
import seaborn as sns
# Read the diabetes dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='diabetes_coursera.csv')

### Separate X and Y Variables

In [23]:
# Target: the binary diabetes label column.
Y = df['Diabetes_binary']
# Features: every column of the frame except the label.
X = df.drop('Diabetes_binary', axis=1)
# Only constructs the scaler; it is fitted later. MinMaxScaler rescales each
# feature to the [0, 1] range by default (min-max normalization rather than
# z-score standardization).
scalerLR = MinMaxScaler()


### Scale the features so that differing scales do not distort the model, then split into train/test sets and fit a baseline model

In [None]:

# Split BEFORE scaling: fitting the scaler on the full dataset would let the
# test rows influence the min/max used for scaling (data leakage).
# random_state pins the shuffle so the split, and every metric computed from
# it, is reproducible across Restart & Run All.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, Y, test_size=.3, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply the
# same (already fitted) transformation to the held-out rows.
X_train = scalerLR.fit_transform(X_train_raw)
X_test = scalerLR.transform(X_test_raw)

# Full feature matrix scaled with the train-fitted scaler. Kept under its
# original name in case other cells refer to it; values for rows outside the
# training range may fall slightly outside [0, 1].
X_lr_standardized = scalerLR.transform(X)

# Baseline logistic regression (default regularization) on the scaled training data.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)


### Create the cross-validated logistic regression model

In [None]:

# Cross-validated logistic regression: 5-fold CV over a grid of 10 candidate
# C values, solved with liblinear.
# NOTE(review): the variable is named `lr_l1`, yet penalty='l2' is what is
# requested here — confirm which regularization was intended.
lr_l1 = LogisticRegressionCV(solver='liblinear', penalty='l2', cv=5, Cs=10)
lr_l1.fit(X_train, y_train)

# Pair each feature name with its fitted coefficient (first row of coef_).
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': lr_l1.coef_[0]})



### Inspect the coefficients to see which features have the strongest effect on the diabetes prediction, then compute recall on the test set

In [32]:
# Show the per-feature coefficient table built from the fitted model.
print(coef_df)

# Predict on the held-out test split and compute recall against the true labels.
y_pred = lr_l1.predict(X_test)
recall_lr_l1 = recall(y_test, y_pred)

# A separator is included so the label and the value do not run together
# (the original printed e.g. "RECALL SCORE0.7756...").
print("RECALL SCORE: " + str(recall_lr_l1))

                 Feature  Coefficient
0                 HighBP     0.735502
1               HighChol     0.602741
2              CholCheck     1.400776
3                    BMI     6.480655
4                 Smoker    -0.024289
5                 Stroke     0.179593
6   HeartDiseaseorAttack     0.265110
7           PhysActivity    -0.038667
8                 Fruits    -0.044519
9                Veggies    -0.081778
10     HvyAlcoholConsump    -0.766414
11         AnyHealthcare     0.043115
12           NoDocbcCost     0.025442
13               GenHlth     2.277045
14              MentHlth    -0.126180
15              PhysHlth    -0.196887
16              DiffWalk     0.096142
17                   Sex     0.278938
18                   Age     1.836764
19             Education    -0.277712
20                Income    -0.390869
RECALL SCORE0.77566720486276


### See the [PDF Report](classification_report.pdf) for detailed conclusions of all three models (Logistic, SVM, Random Forests)