In [2]:
library(caret)
library(mlbench)
library(dplyr)
library(base)
library(caTools)
library(e1071)
library(ROCR)
library(Metrics)
library(ggplot2)
library(rpart)
library(randomForest)
library(class)


# LOADING THE DATASET

In [3]:
pima <- read.csv("C:/Users/Praty/OneDrive/Desktop/pima-indians-diabetes.csv")
head(pima)

X6,X148,X72,X35,X0,X33.6,X0.627,X50,X1
1,85,66,29,0,26.6,0.351,31,0
8,183,64,0,0,23.3,0.672,32,1
1,89,66,23,94,28.1,0.167,21,0
0,137,40,35,168,43.1,2.288,33,1
5,116,74,0,0,25.6,0.201,30,0
3,78,50,32,88,31.0,0.248,26,1


In [25]:
# FINDING THE OUTLIERS BY BOX PLOT
#boxplot(pima$X1~.,data = pima, xlab = 'a', ylab= 'b', main = 'pima data')

In [3]:
# Data cleaning and data information
summary(pima)

       X6              X148            X72             X35       
 Min.   : 0.000   Min.   :  0.0   Min.   :  0.0   Min.   : 0.00  
 1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.0   1st Qu.: 0.00  
 Median : 3.000   Median :117.0   Median : 72.0   Median :23.00  
 Mean   : 3.842   Mean   :120.9   Mean   : 69.1   Mean   :20.52  
 3rd Qu.: 6.000   3rd Qu.:140.0   3rd Qu.: 80.0   3rd Qu.:32.00  
 Max.   :17.000   Max.   :199.0   Max.   :122.0   Max.   :99.00  
       X0            X33.6           X0.627            X50       
 Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00  
 1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2435   1st Qu.:24.00  
 Median : 32.0   Median :32.00   Median :0.3710   Median :29.00  
 Mean   : 79.9   Mean   :31.99   Mean   :0.4717   Mean   :33.22  
 3rd Qu.:127.5   3rd Qu.:36.60   3rd Qu.:0.6250   3rd Qu.:41.00  
 Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00  
       X1        
 Min.   :0.0000  
 1st Qu.:0.0000  
 Median :0.0000  
 Mea

In [5]:
# checking the datatypes of variables
sapply(pima, class)

In [6]:
# Finding the missing values in pima dataset
head(sum(is.na(pima)))

In [22]:
nom1 = preProcess(pima[,c(1:9)], method=c('center','scale'))
#nom1 = preProcess(pima[,c(1:9)], method=c('range'))
norm1 = predict(nom1, pima[,c(1:9)])


# APPLYING MACHINE LEARNING MODELS

In [20]:
#APPLYING MODELS
Train = createDataPartition(norm1$X1, p =0.8, list = FALSE)
train_data = pima[Train,]
test_data = pima[-Train,]

# LOGISTIC REGRESSION

In [23]:
glmModel = train(factor(X1)~., data = train_data,method='glm',family='binomial')
predictTest = predict(glmModel, newdata = test_data)
confusionMatrix(predictTest, factor(test_data$X1), mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 89 26
         1  8 30
                                          
               Accuracy : 0.7778          
                 95% CI : (0.7036, 0.8409)
    No Information Rate : 0.634           
    P-Value [Acc > NIR] : 9.386e-05       
                                          
                  Kappa : 0.4863          
                                          
 Mcnemar's Test P-Value : 0.003551        
                                          
            Sensitivity : 0.9175          
            Specificity : 0.5357          
         Pos Pred Value : 0.7739          
         Neg Pred Value : 0.7895          
              Precision : 0.7739          
                 Recall : 0.9175          
                     F1 : 0.8396          
             Prevalence : 0.6340          
         Detection Rate : 0.5817          
   Detection Prevalence : 0.7516          
      Balanced Accuracy : 0.7266    

# DECISION TREES

In [24]:
DT = train(factor(X1)~., data = train_data, method = 'rpart')
pt1 = predict(DT, newdata = test_data)
cf1 = confusionMatrix(pt1, reference = factor(test_data$X1), mode = 'everything')
cf1

Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 88 26
         1  9 30
                                          
               Accuracy : 0.7712          
                 95% CI : (0.6965, 0.8352)
    No Information Rate : 0.634           
    P-Value [Acc > NIR] : 0.0001906       
                                          
                  Kappa : 0.4733          
                                          
 Mcnemar's Test P-Value : 0.0068409       
                                          
            Sensitivity : 0.9072          
            Specificity : 0.5357          
         Pos Pred Value : 0.7719          
         Neg Pred Value : 0.7692          
              Precision : 0.7719          
                 Recall : 0.9072          
                     F1 : 0.8341          
             Prevalence : 0.6340          
         Detection Rate : 0.5752          
   Detection Prevalence : 0.7451          
      Balanced Accuracy : 0.7215    

# KNN CLASSIFIER

In [25]:
KNN <- knn(train = train_data, test = test_data, cl = train_data$X1, k = 20) 
m1 = mean(KNN != train_data$X1)
print(1-m1)

"longer object length is not a multiple of shorter object length"

[1] 0.5602606


# NAIVE BAYES CLASSIFIER

In [26]:
NAIVE_BAYES = naiveBayes(factor(X1) ~ ., data = train_data) 
predictTest2 = predict(NAIVE_BAYES,newdata=test_data)
cm = table(test_data$X1, predictTest2)
confusionMatrix(cm,mode='everything')

Confusion Matrix and Statistics

   predictTest2
     0  1
  0 87 10
  1 20 36
                                          
               Accuracy : 0.8039          
                 95% CI : (0.7321, 0.8636)
    No Information Rate : 0.6993          
    P-Value [Acc > NIR] : 0.002339        
                                          
                  Kappa : 0.5609          
                                          
 Mcnemar's Test P-Value : 0.100348        
                                          
            Sensitivity : 0.8131          
            Specificity : 0.7826          
         Pos Pred Value : 0.8969          
         Neg Pred Value : 0.6429          
              Precision : 0.8969          
                 Recall : 0.8131          
                     F1 : 0.8529          
             Prevalence : 0.6993          
         Detection Rate : 0.5686          
   Detection Prevalence : 0.6340          
      Balanced Accuracy : 0.7978          
                  

# SUPPORT VECTOR MACHINE

In [30]:
svmfit = svm(factor(X1)~ ., data = train_data, kernel = "linear", cost = 10, scale = FALSE)
pt2 = predict(svmfit, newdata=test_data)
confusionMatrix(pt2, reference = factor(test_data$X1), mode='everything')

Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 90 24
         1  7 32
                                         
               Accuracy : 0.7974         
                 95% CI : (0.7249, 0.858)
    No Information Rate : 0.634          
    P-Value [Acc > NIR] : 9.013e-06      
                                         
                  Kappa : 0.5335         
                                         
 Mcnemar's Test P-Value : 0.004057       
                                         
            Sensitivity : 0.9278         
            Specificity : 0.5714         
         Pos Pred Value : 0.7895         
         Neg Pred Value : 0.8205         
              Precision : 0.7895         
                 Recall : 0.9278         
                     F1 : 0.8531         
             Prevalence : 0.6340         
         Detection Rate : 0.5882         
   Detection Prevalence : 0.7451         
      Balanced Accuracy : 0.7496         
              

# RANDOM FOREST

In [31]:
forest = randomForest(factor(X1)~., data = train_data)

In [32]:
pt = predict(forest, newdata= test_data)

In [33]:
confusionMatrix(pt, reference = factor(test_data$X1), mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 87 23
         1 10 33
                                          
               Accuracy : 0.7843          
                 95% CI : (0.7106, 0.8466)
    No Information Rate : 0.634           
    P-Value [Acc > NIR] : 4.461e-05       
                                          
                  Kappa : 0.5113          
                                          
 Mcnemar's Test P-Value : 0.03671         
                                          
            Sensitivity : 0.8969          
            Specificity : 0.5893          
         Pos Pred Value : 0.7909          
         Neg Pred Value : 0.7674          
              Precision : 0.7909          
                 Recall : 0.8969          
                     F1 : 0.8406          
             Prevalence : 0.6340          
         Detection Rate : 0.5686          
   Detection Prevalence : 0.7190          
      Balanced Accuracy : 0.7431    

# XGBOOST

In [34]:
control6 = trainControl(method = 'repeatedcv',number = 10,repeats = 3, returnResamp = "all",savePredictions = "all")

XGBoost = train(factor(X1)~., data = train_data , method="xgbLinear", 
                      trControl =control6,verbose = FALSE)
predictTest5 = predict(XGBoost, newdata = test_data)
confusionMatrix(predictTest5,reference = factor(test_data$X1),mode = 'everything')

Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 83 24
         1 14 32
                                          
               Accuracy : 0.7516          
                 95% CI : (0.6754, 0.8179)
    No Information Rate : 0.634           
    P-Value [Acc > NIR] : 0.001296        
                                          
                  Kappa : 0.4438          
                                          
 Mcnemar's Test P-Value : 0.144292        
                                          
            Sensitivity : 0.8557          
            Specificity : 0.5714          
         Pos Pred Value : 0.7757          
         Neg Pred Value : 0.6957          
              Precision : 0.7757          
                 Recall : 0.8557          
                     F1 : 0.8137          
             Prevalence : 0.6340          
         Detection Rate : 0.5425          
   Detection Prevalence : 0.6993          
      Balanced Accuracy : 0.7135    