# Installing libraries

In [2]:
# Install the packages this notebook depends on, skipping any that are
# already available so that re-running the cell does not download and
# reinstall them every time.
# MASS is listed because a later cell calls library(MASS); it is usually
# bundled with R, in which case it is simply skipped here.
required_pkgs <- c("tidyverse", "caret", "mlbench", "MASS")
is_installed <- vapply(
  required_pkgs,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Loading libraries & data

In [2]:
# Attach the packages used below: tidyverse first, then caret
library(tidyverse)
library(caret)

# Pull the PimaIndiansDiabetes2 data set out of mlbench and keep only the
# rows that contain no missing values
data(PimaIndiansDiabetes2, package = "mlbench")
PimaIndiansDiabetes2 <- PimaIndiansDiabetes2 %>% na.omit()

# Inspect data and split it into training and test sets

In [4]:
# Peek at three randomly drawn rows of the cleaned data
PimaIndiansDiabetes2 %>% sample_n(size = 3)

# Fix the RNG seed so the partition below is reproducible, then compute the
# row indices of an 80% training partition from the diabetes column
set.seed(123)
training.samples <- createDataPartition(
  PimaIndiansDiabetes2$diabetes,
  p = 0.8,
  list = FALSE
)

# Rows selected by the partition form the training set; all remaining rows
# form the test set
train.data <- PimaIndiansDiabetes2[training.samples, ]
test.data <- PimaIndiansDiabetes2[-training.samples, ]

Unnamed: 0,pregnant,glucose,pressure,triceps,insulin,mass,pedigree,age,diabetes
291,0,78,88,29,40,36.9,0.434,21,neg
610,1,111,62,13,182,24.0,0.138,23,neg
639,7,97,76,32,91,40.9,0.871,32,pos


# Create a Generalized Linear Model (1st Stage?)

In [6]:
library(MASS)

# Fit a binomial GLM of diabetes on every other column of the training data,
# then hand it to stepAIC() to choose the final model (trace output off)
model <- stepAIC(
  glm(diabetes ~ ., family = binomial, data = train.data),
  trace = FALSE
)

# Report the model that stepAIC() selected
summary(model)


Call:
glm(formula = diabetes ~ glucose + mass + pedigree + age, family = binomial, 
    data = train.data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-2.7385  -0.6620  -0.3742   0.6772   2.6080  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -9.561248   1.171969  -8.158 3.40e-16 ***
glucose      0.037917   0.005565   6.814 9.49e-12 ***
mass         0.052301   0.021361   2.448 0.014347 *  
pedigree     0.969714   0.453823   2.137 0.032617 *  
age          0.052888   0.015385   3.438 0.000587 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 398.80  on 313  degrees of freedom
Residual deviance: 280.83  on 309  degrees of freedom
AIC: 290.83

Number of Fisher Scoring iterations: 5


# Test the model accuracy

In [5]:
# Response-scale predictions of the selected model for the test rows
probabilities <- predict(model, newdata = test.data, type = "response")
# Label a row "pos" when its prediction exceeds 0.5, otherwise "neg"
predicted.classes <- ifelse(probabilities <= 0.5, "neg", "pos")
# Accuracy: share of test rows whose predicted label equals the recorded one
mean(test.data$diabetes == predicted.classes)

# Model for the 2nd Stage

In [7]:
# Full binomial GLM: diabetes against every other column of the training set
full.model <- glm(diabetes ~ ., family = binomial, data = train.data)
coef(full.model)

# Reduced model chosen by stepAIC(), starting from the full model
step.model <- stepAIC(full.model, trace = FALSE)
coef(step.model)

# Score the test set with the full model: response-scale predictions are
# labelled "pos" above 0.5 and "neg" otherwise
probabilities <- predict(full.model, newdata = test.data, type = "response")
predicted.classes <- ifelse(probabilities <= 0.5, "neg", "pos")
# Accuracy of the full model: share of test rows labelled as recorded
observed.classes <- test.data$diabetes
mean(observed.classes == predicted.classes)

In [8]:
# Score the test set with the stepwise-selected model: response-scale
# predictions are labelled "pos" above 0.5 and "neg" otherwise
probabilities <- step.model %>% predict(test.data, type = "response")
predicted.classes <- ifelse(probabilities <= 0.5, "neg", "pos")
# Accuracy of the stepwise model: share of test rows labelled as recorded
observed.classes <- test.data$diabetes
mean(observed.classes == predicted.classes)