In [2]:
library(data.table)

### Performance Function

In [3]:
perf_dt=function(type,actual,forecast){
    name=type
    n=length(actual)
    error=actual-forecast
    mean=mean(actual)
    sd=sd(actual)
    FBias=sum(error)/sum(actual)
    MPE=sum(error/actual)/n
    MAPE=sum(abs(error/actual))/n
    RMSE=sqrt(sum(error^2))/n
    MAD=sum(abs(error))/n
    WMAPE=MAD/mean
    l=data.frame(name,n,mean,sd,FBias,MAPE,RMSE,MAD,WMAPE)
    return(l)
}

# Dataset

In [4]:
data_train=fread("IE582_Fall20_ProjectTrain.csv")
data_train$y=as.factor(data_train$y)

In [5]:
levels(data_train$y)=c("No","Yes")

In [8]:
library(caTools)

In [9]:
spl=sample.split(data_train$y, SplitRatio = 0.8)
train=subset(data_train,spl==TRUE)
test=subset(data_train,spl==FALSE)
str(train)

Classes 'data.table' and 'data.frame':	1659 obs. of  61 variables:
 $ x1 : int  27 30 37 29 33 33 29 27 27 29 ...
 $ x2 : int  1 0 0 0 1 0 1 1 0 1 ...
 $ x3 : int  1 1 1 1 1 0 0 1 1 0 ...
 $ x4 : int  1 1 1 1 0 1 1 1 0 0 ...
 $ x5 : int  18 18 1 14 2 5 16 13 8 3 ...
 $ x6 : int  3 13 3 9 15 5 1 4 18 14 ...
 $ x7 : int  1 3 14 3 12 12 2 17 18 1 ...
 $ x8 : int  28 19 33 29 39 26 24 34 26 24 ...
 $ x9 : num  119.9 86.7 174 8.8 55 ...
 $ x10: num  154 133 128 127 188 ...
 $ x11: num  121.4 129 100.2 55.5 156.6 ...
 $ x12: int  1 0 0 1 1 0 0 0 1 0 ...
 $ x13: int  0 0 0 0 0 0 0 0 0 0 ...
 $ x14: int  404 303 454 383 404 404 404 30 404 454 ...
 $ x15: int  1 1 1 1 0 1 0 1 1 1 ...
 $ x16: int  0 0 0 1 0 0 0 0 0 0 ...
 $ x17: int  0 0 0 0 0 0 0 0 0 0 ...
 $ x18: int  0 0 0 0 0 0 0 0 0 0 ...
 $ x19: int  0 0 0 0 0 0 0 0 0 0 ...
 $ x20: int  0 0 0 0 0 0 0 0 0 0 ...
 $ x21: int  0 0 0 0 0 0 0 0 0 0 ...
 $ x22: int  0 1 0 0 0 0 0 0 0 0 ...
 $ x23: int  1 1 1 0 1 0 1 0 0 1 ...
 $ x24: int  0 0 0 1

# Model

In [12]:
library(xlsx)
library(caret)
library(e1071)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(Information)
library(FactoMineR)
library(caTools)
library(ROCR)
library(pROC)

# Metric Creation

In [13]:
library(yardstick)
library(mlbench)
library(cvAUC)
library(forecast)

### Custom Metric

In [14]:
fourStats <- function (data, lev = levels(data$obs), model = NULL) { 
      auc_val=AUC(data$Yes, data$obs)
      
      out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
      coords <- matrix(c(1, 1, out["Spec"], out["Sens"]), 
                   ncol = 2, 
                   byrow = TRUE)
      #print(out)  
  #print(BER(data$obs, data$pred))
      colnames(coords) <- c("Spec", "Sens")
      rownames(coords) <- c("Best", "Current") 
      a=as.numeric((auc_val+(coords[2]+coords[4])/2)/2)+coords[2]/10
      c(AUC_value=auc_val, Prime=a, Spec=coords[4], Sens=coords[2])
}

## Grid Tpes

#### Random Forest

In [15]:
rf_grid=expand.grid(mtry=c(8,10,12,15,20),
                   splitrule = c("extratrees","gini","hellinger"),
                   min.node.size= c(5,10,15,20,25))

#### SGB

In [16]:
gbmGrid=expand.grid(interaction.depth = c(1,3,5,8), 
                        n.trees = (1:6)*50, 
                        shrinkage = c(0.1, 0.05, 0.01),
                        n.minobsinnode = c(5,10,15))

#### XGBoost

In [17]:
xgbGrid=expand.grid(max_depth = c(3, 5), 
                        nrounds = (2:4)*50, 
                        eta = c(0.1, 0.3),
                        min_child_weight=c(1,5),
                        gamma=c( 1, 1.5, 2),
                        colsample_bytree=c(0.8),
                        subsample=c(0.8))

### Cross-Validation

In [18]:
fitControl_random=trainControl(method = "repeatedcv",
                               number = 10, repeats=4, summaryFunction = fourStats,
                              classProbs = T, 
                              verboseIter=FALSE)

# Random Forest

In [19]:
random_forest=train(y ~. , data = train, 
                 method = "ranger", 
                 trControl = fitControl_random, num.trees=400,
                 metric="Prime",
                 maximize=TRUE,
                 tuneGrid = rf_grid,
                 class.weights  = c(sum(data_train$y=="No")/nrow(data_train),sum(data_train$y=="Yes")/nrow(data_train)))

In [20]:
random_forest

Random Forest 

1659 samples
  60 predictor
   2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 4 times) 
Summary of sample sizes: 1493, 1493, 1493, 1494, 1493, 1494, ... 
Resampling results across tuning parameters:

  mtry  splitrule   min.node.size  AUC_value  Prime      Spec       Sens     
   8    extratrees   5             0.8636577  0.8366484  0.9354762  0.4884299
   8    extratrees  10             0.8646756  0.8367744  0.9364841  0.4866159
   8    extratrees  15             0.8640125  0.8342771  0.9354825  0.4811433
   8    extratrees  20             0.8642328  0.8377250  0.9368825  0.4896799
   8    extratrees  25             0.8631778  0.8349698  0.9366794  0.4834604
   8    gini         5             0.8776209  0.8573029  0.9504730  0.5167835
   8    gini        10             0.8791924  0.8574568  0.9548603  0.5118445
   8    gini        15             0.8785575  0.8571593  0.9540651  0.5124695
   8    gini        20             0.87

In [22]:
pred_rf=predict(random_forest, test,type="prob")

In [23]:
sum(pred_rf$Yes<0.5)/nrow(pred_rf)

In [24]:
max(pred_rf$Yes)

In [25]:
pred_rf[,c(2)]=pred_rf[,c(2)]+(1-max(pred_rf[,c(2)]))

In [26]:
pred_rf=pred_rf[,2]

In [27]:
sum(pred_rf<0.5)/length(pred_rf)

# SGM

In [28]:
data_train_sgm=as.data.frame(train)
data_test_sgm=as.data.frame(test)

In [29]:
data_train_sgm=data_train_sgm[,c(-37,-50,-52)]
data_test_sgm=data_test_sgm[,c(-37,-50,-52)]

In [30]:
gbm_dt=train(y ~ ., data = data_train_sgm, 
                 method = "gbm", 
                 trControl = fitControl_random,
                 metric="Prime",
                 tuneGrid = gbmGrid,
                 verbose=F)

In [31]:
gbm_dt

Stochastic Gradient Boosting 

1659 samples
  57 predictor
   2 classes: 'No', 'Yes' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 4 times) 
Summary of sample sizes: 1494, 1492, 1493, 1493, 1493, 1494, ... 
Resampling results across tuning parameters:

  shrinkage  interaction.depth  n.minobsinnode  n.trees  AUC_value  Prime    
  0.01       1                   5               50      0.8318908  0.6659454
  0.01       1                   5              100      0.8494277  0.7071582
  0.01       1                   5              150      0.8614143  0.7290936
  0.01       1                   5              200      0.8672251  0.7396910
  0.01       1                   5              250      0.8700071  0.7637422
  0.01       1                   5              300      0.8719797  0.7828326
  0.01       1                  10               50      0.8333187  0.6666593
  0.01       1                  10              100      0.8486796  0.7070082
  0.01       1          

### Prediction

In [32]:
pred_gbm=predict(gbm_dt, data_test_sgm,type="prob")

In [33]:
pred_gbm=pred_gbm[,2]

In [34]:
sum(pred_gbm<0.5)/length(pred_gbm)

In [35]:
pred_gbm=pred_gbm+(1-max(pred_gbm))

In [36]:
max(pred_gbm)

In [37]:
sum(pred_gbm<0.5)/length(pred_gbm)

# XGBoost

In [38]:
xgboost_dt=train(y ~ ., data = train, 
                 method = "xgbTree", 
                 trControl = fitControl_random,
                 metric="Prime",  
                 tuneGrid = xgbGrid,
                 weight=c(sum(data_train$y=="No")/nrow(data_train),sum(data_train$y=="Yes")/nrow(data_train)),
                 verbose=F)

Parameters: { weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoo

In [39]:
pred_xgb=predict(xgboost_dt, test, type="prob")

In [40]:
pred_xgb=pred_xgb[,2]

In [41]:
sum(pred_xgb<0.5)/length(pred_xgb)

In [42]:
pred_xgb=pred_xgb+(1-max(pred_xgb))

In [43]:
max(pred_xgb)

In [44]:
test$y=as.numeric(as.factor(test$y))-1

In [46]:
library(xlsx)

In [48]:
all_results_analysis=data.table(random_forest=pred_rf,
                                stochastic=pred_gbm,
                                xgboost_res=pred_xgb,
                                actual_val=test$y)

In [51]:
write.xlsx(all_results_analysis,"model_control.xlsx")

### Prediction

In [69]:
last_num=(pred_xgb+pred_rf+pred_gbm)/3

In [70]:
sum(last_num<0.5)/length(last_num)

In [73]:
sum(last_num<0.5)/length(last_num)

In [72]:
manys=0
for(i in 1:length(last_num)){
    if(last_num[i]<=0.5){
        count=0
        count1=0
        sum=0
        sum1=0
        if(pred_rf[i]>0.35){
            sum=pred_rf[i]+sum+0.155
            count=count+1
        }
        if(pred_gbm[i]>0.35){
            sum=pred_gbm[i]+sum+0.155
            count=count+1
        }
        if(pred_xgb[i]>0.35){
            sum=pred_xgb[i]+sum+0.155
            count=count+1
        }
        else if(pred_rf[i]>0.22 & pred_gbm[i]>0.22){
            sum1=(pred_rf[i]+pred_gbm[i])/2+sum1+0.285  
            count1=count1+1
        } 
        else if(pred_gbm[i]>0.22 & pred_xgb[i]>0.22){
            sum1=(pred_gbm[i]+pred_xgb[i])/2+sum1+0.285 
            count1=count1+1
        } 
        else if(pred_rf[i]>0.22 & pred_xgb[i]>0.22){
            sum1=(pred_rf[i]+pred_xgb[i])/2+sum1+0.285    
            count1=count1+1
        } 
#         else if(pred_rf[i]>0.20 & pred_gbm[i]>0.20 & pred_xgb[i]>0.20){
#             last_num[i]=((pred_rf[i]+pred_gbm[i]+pred_xgb[i])/3)+0.305     
#         } 
        
        if(count>0){
            manys=manys+(count/count)
            last_num[i]=sum/count
        }   
        if(count1>0){
            manys=manys+(count1/count1)
            last_num[i]=sum1/count1
        }  
    }
}

In [77]:
res=data.table(last_exp=last_num)

In [78]:
write.xlsx(res,"res_cont.xlsx")

In [79]:
manys

In [183]:
sum(data_train$y=="No")/nrow(data_train)

In [184]:
sum(last_num<0.5)/length(last_num)

# Submission

In [185]:
send_submission(last_num, token, url=subm_url, submit_now= submit_now)

[1] "Format OK"
$submission
[0.0243,0.1196,0.5796,0.0206,0.7544,0.0983,0.5166,0.5507,0.0586,0.0165,0.1557,0.1847,0.0221,0.982,0.1493,0.0211,0.0138,0.0753,0.1147,0.0192,0.0431,0.5292,0.9907,0.0152,0.6471,0.0383,0.507,0.6167,0.5269,0.2041,0.5684,0.7552,0.0163,0.5845,0.6334,0.1548,0.8487,0.9277,0.974,0.0196,0.0734,0.1079,0.0935,0.7272,0.1599,0.7577,0.1514,0.018,0.1296,0.0409,0.052,0.0182,0.1712,0.5487,0.9472,0.532,0.6582,0.5614,0.0658,0.5269,0.9829,0.0107,0.0263,0.1141,0.0446,0.0303,0.0113,0.017,0.8018,0.0949,0.7306,0.0138,0.0227,0.8138,0.0217,0.1187,0.5215,0.0444,0.5181,0.044,0.9588,0.0367,0.0104,0.6275,0.1356,0.1807,0.0136,0.0586,0.926,0.205,0.0403,0.0323,0.0699,0.8609,0.0983,0.2023,0.0546,0.0485,0.5135,0.6137,0.7175,0.0233,0.6119,0.5313,0.5327,0.577,0.0348,0.2179,0.1184,0.0554,0.0295,0.7963,0.0486,0.5122,0.1848,0.0131,0.0105,0.2348,0.034,0.0231,0.0325,0.0329,0.0192,0.061,0.072,0.0166,0.0229,0.2285,0.6949,0.0278,0.5968,0.2275,0.1644,0.1936,0.58,0.501,0.0401,0.5029,0.052,0.5196,0.547,0.1