In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import scale

In [5]:
diabetes = pd.read_csv('diabetes.csv')

In [6]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
diabetes.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
#Clean the data and remove missing values. 

In [9]:
def outlier(input, mean, std):
    if ((mean + 3*std) < input) or ((mean - 3*std)> input):
        return [True, input]
    else:
        return [False, input]


 #go through each column in the list, if in that column, the value is an outlier, append it to the list. if not, move   
for x in list(diabetes.columns):
    try:
        outlier_list = []
        print(x)
        for y in diabetes[x]:
            if outlier(y,diabetes[x].mean(),diabetes[x].std())[0]:
                outlier_list.append(outlier(y,diabetes[x].mean(),diabetes[x].std())[1])
        print(outlier_list)
    except:
        continue

Pregnancies
[15, 17, 14, 14]
Glucose
[0, 0, 0, 0, 0]
BloodPressure
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
SkinThickness
[99]
Insulin
[543, 846, 495, 485, 495, 478, 744, 680, 545, 465, 579, 474, 480, 600, 440, 540, 480, 510]
BMI
[0.0, 0.0, 0.0, 0.0, 0.0, 67.1, 0.0, 0.0, 59.4, 0.0, 0.0, 57.3, 0.0, 0.0]
DiabetesPedigreeFunction
[2.288, 1.893, 1.781, 2.329, 1.476, 2.137, 1.731, 1.6, 2.42, 1.699, 1.698]
Age
[69, 72, 81, 70, 69]
Outcome
[]


In [21]:
diabetes['Outcome'].isna().value_counts()
 # nonulls


False    768
Name: Outcome, dtype: int64

In [None]:
#Drop any column that is not categorical or numeric. 
#Considering dropping skinthickness. Noah mentioned this when we did powerBI

In [None]:
#Howwould someone have 0 BMI, bloodpressure, etc? lol. Removing when it's 0 for all of them.
diabetes = diabetes[(diabetes['BMI']!=0) & (diabetes['BloodPressure'] != 0) & (diabetes['Glucose']!=0) & (diabetes['SkinThickness']!=0) & (diabetes['Insulin']!=0)]


In [22]:
#Generate dummy variables for the categorical features.
#I guess pregnancy and age is categorical?
diabetes = pd.get_dummies(diabetes, prefix='', prefix_sep='', columns=["Pregnancies", "Age"], drop_first=True)


In [24]:
# Separate the independent variables from the dependent variables.
#clearly outcome is dependent.
X = scale(diabetes.drop(columns=['Outcome']))
y = diabetes["Outcome"]

In [25]:
#Create a training set that's 75% of your data set and a complementary test set with the remaining 25%. 
# Specify random_state=0.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.25,random_state=0)

X_train = scale(X_train)

X_test = scale(X_test)

In [26]:
#Train the model using the LogisticRegressionCV class, which has cross-validation capability built in.
#  Use this to tune your Cs hyperparameter.
#  What value of this hyperparameter performs best in cross-validation?

log_ML = LogisticRegressionCV(max_iter=250,Cs=3,fit_intercept=True).fit(X_train, y_train)
print(log_ML.coef_)
print(log_ML.n_iter_)

[[ 1.16371383 -0.31740885  0.14624085 -0.11366593  0.6298893   0.21321051
  -0.16050633 -0.20604494  0.03772396 -0.13110803 -0.05076835 -0.14684413
  -0.1101909   0.19633169  0.12988812 -0.03151654 -0.06921007  0.10622101
  -0.0204146   0.28761878  0.12004872  0.12718975 -0.09915084  0.12829643
   0.01297167  0.26286516 -0.05421413  0.17326523  0.07309465  0.22418567
   0.16480397  0.25962494  0.331302    0.2695879   0.03531779  0.18849928
   0.25063444  0.1485731   0.34443468  0.04044482  0.16836059  0.31825911
   0.15095432  0.45561037  0.15403704  0.24458511  0.17887373  0.08641247
  -0.25702382  0.02648963  0.2663228   0.16172853  0.41806501  0.17766109
   0.21353385  0.07082689  0.17271398 -0.05791068  0.02370528  0.05620565
   0.07322525  0.08565486  0.38108171 -0.21418804 -0.14729426 -0.23489126
   0.11017974 -0.01699679 -0.14153131 -0.15289111  0.25066954 -0.13331663
  -0.16960953]]
[[[ 8 46 49]
  [ 8 33 47]
  [ 8 44 53]
  [ 8 38 45]
  [ 8 44 55]]]


In [27]:
#After cross-validation, use your model to generate predictions on the test set,
# Print the Matthews correlation coefficient.
predictions= log_ML.predict(X_test)


In [28]:
#  then create a confusion matrix from those results. 
sklearn.metrics.confusion_matrix(y_test,predictions,normalize='true')


array([[0.89230769, 0.10769231],
       [0.38709677, 0.61290323]])

In [29]:
# Print the Matthews correlation coefficient.
sklearn.metrics.matthews_corrcoef(y_test,predictions)


0.5315872534627454