In [1]:
library(caTools) # train-test dataset
library(caret) # confusionMatrix

# predictive models
library(kknn)
library(rpart)
library(randomForest)
library(e1071)

Cargando paquete requerido: ggplot2

Cargando paquete requerido: lattice


Adjuntando el paquete: 'kknn'


The following object is masked from 'package:caret':

    contr.dummy


randomForest 4.7-1.1

Type rfNews() to see new features/changes/bug fixes.


Adjuntando el paquete: 'randomForest'


The following object is masked from 'package:ggplot2':

    margin




In [2]:
df <- read.csv("student_data_cleaned.csv", stringsAsFactors = TRUE)

In [3]:
str(df)

'data.frame':	25574 obs. of  6 variables:
 $ study.hrs.perwk           : num  9.3 17.6 8.8 8.8 17.9 13.8 7.7 1.4 7.2 4.9 ...
 $ attendance.rate           : num  95.3 76.8 89.3 73.8 38.6 95.8 54.1 66.5 54.4 71.1 ...
 $ previous.grades           : num  60.6 62.4 72.7 69.3 93.6 59.2 72.3 49.2 55.9 98 ...
 $ extracurricular.activities: Factor w/ 2 levels "No","Yes": 1 2 1 2 1 2 1 1 2 2 ...
 $ parent.edu.level          : Factor w/ 5 levels "Associate","Bachelor",..: 4 2 5 4 3 3 5 1 4 4 ...
 $ passed                    : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 1 2 1 1 ...


In [4]:
set.seed(123)

# split of 70-30
div.obs <- sample.split(df$passed, SplitRatio = 0.7)
trainset <- df[div.obs, ]
testset <- df[!div.obs, ]

In [5]:
set.seed(123)

model_kknn <- train.kknn(passed ~ ., data = trainset, kmax = floor(sqrt(dim(df)[1])))
kknn_predict <- predict(model_kknn, testset[, -6])

In [6]:
cm_kknn <- confusionMatrix(kknn_predict, testset$passed, positive = "Yes")
cm_kknn

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1950 1937
       Yes 1919 1866
                                          
               Accuracy : 0.4974          
                 95% CI : (0.4861, 0.5086)
    No Information Rate : 0.5043          
    P-Value [Acc > NIR] : 0.8891          
                                          
                  Kappa : -0.0053         
                                          
 Mcnemar's Test P-Value : 0.7843          
                                          
            Sensitivity : 0.4907          
            Specificity : 0.5040          
         Pos Pred Value : 0.4930          
         Neg Pred Value : 0.5017          
             Prevalence : 0.4957          
         Detection Rate : 0.2432          
   Detection Prevalence : 0.4934          
      Balanced Accuracy : 0.4973          
                                          
       'Positive' Class : Yes             
                        

In [7]:
set.seed(123)

model_rpart <- rpart(passed ~ ., data = trainset)
rpart_predict <- predict(model_rpart, testset, type = "class")

In [8]:
cm_rpart <- confusionMatrix(rpart_predict, testset$passed, positive = "Yes")
cm_rpart

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  3869 3803
       Yes    0    0
                                         
               Accuracy : 0.5043         
                 95% CI : (0.493, 0.5156)
    No Information Rate : 0.5043         
    P-Value [Acc > NIR] : 0.5046         
                                         
                  Kappa : 0              
                                         
 Mcnemar's Test P-Value : <2e-16         
                                         
            Sensitivity : 0.0000         
            Specificity : 1.0000         
         Pos Pred Value :    NaN         
         Neg Pred Value : 0.5043         
             Prevalence : 0.4957         
         Detection Rate : 0.0000         
   Detection Prevalence : 0.0000         
      Balanced Accuracy : 0.5000         
                                         
       'Positive' Class : Yes            
                                         

In [9]:
set.seed(123)

model_randomForest <- randomForest(passed ~ ., data = trainset, importance = TRUE)
randomForest_predict <- predict(model_randomForest, testset[, -6])

In [10]:
cm_randomForest <- confusionMatrix(randomForest_predict, testset$passed, positive = "Yes")
cm_randomForest

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  2035 1982
       Yes 1834 1821
                                          
               Accuracy : 0.5026          
                 95% CI : (0.4914, 0.5139)
    No Information Rate : 0.5043          
    P-Value [Acc > NIR] : 0.62107         
                                          
                  Kappa : 0.0048          
                                          
 Mcnemar's Test P-Value : 0.01733         
                                          
            Sensitivity : 0.4788          
            Specificity : 0.5260          
         Pos Pred Value : 0.4982          
         Neg Pred Value : 0.5066          
             Prevalence : 0.4957          
         Detection Rate : 0.2374          
   Detection Prevalence : 0.4764          
      Balanced Accuracy : 0.5024          
                                          
       'Positive' Class : Yes             
                        

In [11]:
set.seed(123)

model_kLinear <- svm(passed ~., data = trainset, kernel = "linear")
kLinear_predict <- predict(model_kLinear, testset)

In [12]:
cm_kLinear <- confusionMatrix(kLinear_predict, testset$passed, positive = "Yes")
cm_kLinear

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  2333 2322
       Yes 1536 1481
                                          
               Accuracy : 0.4971          
                 95% CI : (0.4859, 0.5084)
    No Information Rate : 0.5043          
    P-Value [Acc > NIR] : 0.8975          
                                          
                  Kappa : -0.0076         
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.3894          
            Specificity : 0.6030          
         Pos Pred Value : 0.4909          
         Neg Pred Value : 0.5012          
             Prevalence : 0.4957          
         Detection Rate : 0.1930          
   Detection Prevalence : 0.3932          
      Balanced Accuracy : 0.4962          
                                          
       'Positive' Class : Yes             
                        

In [13]:
set.seed(123)

model_kRadial <- svm(passed ~., data = trainset, kernel = "radial")
kRadial_predict <- predict(model_kRadial, testset)

In [14]:
cm_kRadial <- confusionMatrix(kRadial_predict, testset$passed, positive = "Yes")
cm_kRadial

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  2234 2238
       Yes 1635 1565
                                          
               Accuracy : 0.4952          
                 95% CI : (0.4839, 0.5064)
    No Information Rate : 0.5043          
    P-Value [Acc > NIR] : 0.9463          
                                          
                  Kappa : -0.0111         
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.4115          
            Specificity : 0.5774          
         Pos Pred Value : 0.4891          
         Neg Pred Value : 0.4996          
             Prevalence : 0.4957          
         Detection Rate : 0.2040          
   Detection Prevalence : 0.4171          
      Balanced Accuracy : 0.4945          
                                          
       'Positive' Class : Yes             
                        

In [15]:
set.seed(123)

model_kPolynomial <- svm(passed ~., data = trainset, kernel = "polynomial")
kPolynomial_predict <- predict(model_kPolynomial, testset)

In [16]:
cm_kPolynomial <- confusionMatrix(kPolynomial_predict, testset$passed, positive = "Yes")
cm_kPolynomial

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  3012 3010
       Yes  857  793
                                          
               Accuracy : 0.496           
                 95% CI : (0.4847, 0.5072)
    No Information Rate : 0.5043          
    P-Value [Acc > NIR] : 0.9296          
                                          
                  Kappa : -0.013          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.2085          
            Specificity : 0.7785          
         Pos Pred Value : 0.4806          
         Neg Pred Value : 0.5002          
             Prevalence : 0.4957          
         Detection Rate : 0.1034          
   Detection Prevalence : 0.2151          
      Balanced Accuracy : 0.4935          
                                          
       'Positive' Class : Yes             
                        

In [18]:
set.seed(123)

model_kSigmoid <- svm(passed ~., data = trainset, kernel = "sigmoid")
kSigmoid_predict <- predict(model_kSigmoid, testset)

In [19]:
cm_kSigmoid <- confusionMatrix(kSigmoid_predict, testset$passed, positive = "Yes")
cm_kSigmoid

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  1953 1903
       Yes 1916 1900
                                         
               Accuracy : 0.5022         
                 95% CI : (0.491, 0.5135)
    No Information Rate : 0.5043         
    P-Value [Acc > NIR] : 0.6468         
                                         
                  Kappa : 0.0044         
                                         
 Mcnemar's Test P-Value : 0.8460         
                                         
            Sensitivity : 0.4996         
            Specificity : 0.5048         
         Pos Pred Value : 0.4979         
         Neg Pred Value : 0.5065         
             Prevalence : 0.4957         
         Detection Rate : 0.2477         
   Detection Prevalence : 0.4974         
      Balanced Accuracy : 0.5022         
                                         
       'Positive' Class : Yes            
                                         

In [20]:
extract_metrics <- function(cm) {
  c(
    Accuracy = cm$overall["Accuracy"],
    Sensitivity = cm$byClass["Sensitivity"],
    Specificity = cm$byClass["Specificity"],
    Pos_Pred_Value = cm$byClass["Pos Pred Value"],
    Neg_Pred_Value = cm$byClass["Neg Pred Value"],
    Prevalence = cm$byClass["Prevalence"],
    Detection_Rate = cm$byClass["Detection Rate"],
    Detection_Prevalence = cm$byClass["Detection Prevalence"],
    Balanced_Accuracy = cm$byClass["Balanced Accuracy"]
  )
}

# create a dataframe with the metrics of every model
metrics_df <- data.frame(
  Metric = c("Accuracy", "Sensitivity", "Specificity", "Pos Pred Value", 
             "Neg Pred Value", "Prevalence", "Detection Rate", 
             "Detection Prevalence", "Balanced Accuracy"),
  kknn = extract_metrics(cm_kknn),
  Rpart = extract_metrics(cm_rpart),
  RandomForest = extract_metrics(cm_randomForest),
  Linear = extract_metrics(cm_kLinear),
  Radial = extract_metrics(cm_kRadial),
  Polynomial = extract_metrics(cm_kPolynomial),
  Sigmoid = extract_metrics(cm_kSigmoid)
)


# transpose the dataframe to make models rows and metrics columns
metrics_df <- t(metrics_df)

# convert the result to a dataframe
metrics_df <- as.data.frame(metrics_df)

# asign the column names correctly
colnames(metrics_df) <- metrics_df[1, ]  # take the first row as column names
metrics_df <- metrics_df[-1, ]

metrics_df

Unnamed: 0_level_0,Accuracy,Sensitivity,Specificity,Pos Pred Value,Neg Pred Value,Prevalence,Detection Rate,Detection Prevalence,Balanced Accuracy
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
kknn,0.4973931,0.4906653,0.5040062,0.4929987,0.5016722,0.4956986,0.2432221,0.4933525,0.4973357
Rpart,0.5043014,0.0,1.0,,0.5043014,0.4956986,0.0,0.0,0.5
RandomForest,0.5026069,0.4788325,0.5259757,0.4982216,0.506597,0.4956986,0.2373566,0.4764077,0.5024041
Linear,0.4971324,0.3894294,0.6029982,0.490885,0.5011815,0.4956986,0.1930396,0.3932482,0.4962138
Radial,0.4951773,0.4115172,0.5774102,0.4890625,0.4995528,0.4956986,0.2039885,0.4171011,0.4944637
Polynomial,0.4959593,0.2085196,0.7784957,0.4806061,0.5001661,0.4956986,0.1033629,0.2150678,0.4935077
Sigmoid,0.5022158,0.4996056,0.5047816,0.4979036,0.5064834,0.4956986,0.2476538,0.4973931,0.5021936
