# CNV Boosting

In [1]:
library(ada)
library(dplyr)

Loading required package: rpart

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
library(tictoc)

In [3]:
# Load the two serialized inputs used throughout the notebook:
#   gdsc - screening table; filtered by DRUG_ID_lib and read for CL / EFFECT
#   cnv  - copy-number table; its rows are looked up by the CL values of gdsc
gdsc <- readRDS(file = 'data/gdsc_screened.rds')
cnv <- readRDS(file = 'data/cnNEW.rds')

In [4]:
# Rewrite the names of cnv into syntactically valid R names.
# NOTE(review): presumably needed so that names containing characters such
# as "-" can be used by the `EFFECT ~ .` model formula later on - confirm.
names(cnv) <- make.names(names(cnv))

In [5]:
# Absolute two-sample t statistic (unequal variances) for one feature.
#
# Splits `dat` into the observations flagged as effective and the rest,
# then returns |mean(effective) - mean(ineffective)| divided by the standard
# error sqrt(var_e / n_e + var_i / n_i). Only the magnitude of the statistic
# is returned; no degrees of freedom or p-value are computed here.
#
# Args:
#   dat:  numeric vector of feature values, one per observation.
#   labs: labels of the same length as `dat`. Logical, or anything that
#         as.logical() maps to TRUE (effective) / FALSE (ineffective),
#         e.g. numeric 0/1.
#
# Returns:
#   A single non-negative number. The result is NA/NaN when a group has
#   fewer than two observations, when both groups have zero variance and
#   equal means, or when the inputs contain NA.
get.t <- function(dat, labs) {
  # Recycling a shorter label vector would silently mislabel observations.
  stopifnot(length(dat) == length(labs))

  # Force a logical mask: indexing with numeric 0/1 labels would be treated
  # as positions (repeating element 1 and dropping the zeros).
  is_effect <- as.logical(labs)

  # split the data into effective and ineffective
  effect <- dat[is_effect]
  ineffect <- dat[!is_effect]

  # standard error of the difference between the two sample means
  std_err <- sqrt(
    var(effect) / length(effect) + var(ineffect) / length(ineffect)
  )

  # test statistic; the sign is irrelevant for ranking features
  t_stat <- (mean(effect) - mean(ineffect)) / std_err

  abs(t_stat)
}

In [6]:
# One row per column of cnv; the Gene column holds the column names.
outcome <- data.frame(Gene = colnames(cnv))

In [None]:
# Evaluate boosted classifiers (ada) on copy-number features.
#
# For every drug code and every feature-set size, the data are split
# 70/30 into train/test five times (seeds 1..5). On each training split the
# genes are ranked by get.t(), the top-ranked genes are kept, an ada model
# is fitted and its misclassification rate on the test split is recorded.
# resultmat[drug, size] ends up holding the mean test accuracy over the
# splits; a drug without any screening rows keeps NA in its row.
tic()

codes <- c("1006", "1007", "1008", "1011", "1014", "1016",
           "1019", "1026", "1032", "1054", "1060", "1062")
n_features <- c(50, 100, 200, 500, 1000)
n_repeats <- 5         # random splits per configuration; also the seeds used
train_fraction <- 0.7  # share of rows drawn into the training split

# Row/column labels come from the vectors above so they cannot drift apart.
resultmat <- matrix(
  NA_real_,
  nrow = length(codes), ncol = length(n_features),
  dimnames = list(codes, as.character(n_features))
)

for (code in codes) {
  print(code)
  gdsc_sub <- subset(gdsc, DRUG_ID_lib == code)[, c("CL", "EFFECT")]
  if (nrow(gdsc_sub) == 0) {
    # Nothing to split or fit; skip instead of aborting the whole run.
    warning("No rows in gdsc for drug ", code, "; results left as NA",
            call. = FALSE)
    next
  }

  # NOTE(review): a CL value that is not a row name of cnv yields an all-NA
  # row here - confirm every screened cell line is present in cnv.
  cnv_sub <- cnv[as.character(gdsc_sub$CL), ]
  cnv_sub_full <- cbind(cnv_sub, gdsc_sub["EFFECT"])

  n_rows <- nrow(cnv_sub_full)
  n_train <- floor(train_fraction * n_rows)
  effect_col <- ncol(cnv_sub_full)  # EFFECT was bound on as the last column

  # The per-gene statistics depend only on the split (i.e. on the seed), not
  # on how many genes are kept afterwards, so compute them once per seed.
  t_stats_by_seed <- lapply(seq_len(n_repeats), function(j) {
    set.seed(j)
    train_id <- sample(seq_len(n_rows), n_train)
    train_cnv <- cnv_sub_full[train_id, ]
    vapply(
      train_cnv[, -effect_col, drop = FALSE],
      get.t,
      numeric(1),
      labs = train_cnv[["EFFECT"]]
    )
  })

  for (k in seq_along(n_features)) {
    n_feature <- n_features[k]
    print(n_feature)
    test_error <- numeric(n_repeats)

    for (j in seq_len(n_repeats)) {
      # Re-seed and redraw the split so that the random-number stream reaching
      # ada() is exactly the one that follows this draw.
      set.seed(j)
      train_id <- sample(seq_len(n_rows), n_train)
      train_cnv <- cnv_sub_full[train_id, ]
      test_cnv <- cnv_sub_full[-train_id, ]

      # Keep every gene whose statistic ranks within the top n_feature.
      # Ties at the cut-off are all kept (so slightly more than n_feature
      # genes may be selected) and genes with an NA/NaN statistic are
      # dropped; selected genes stay in their original column order.
      t_stats <- t_stats_by_seed[[j]]
      gene_rank <- rank(-t_stats, ties.method = "min", na.last = "keep")
      subset_id <- names(t_stats)[which(gene_rank <= n_feature)]

      train_cnv_reduced <- cbind(
        train_cnv[, subset_id, drop = FALSE],
        train_cnv["EFFECT"]
      )
      test_cnv_reduced <- cbind(
        test_cnv[, subset_id, drop = FALSE],
        test_cnv["EFFECT"]
      )
      train_cnv_reduced$EFFECT <- as.factor(train_cnv_reduced$EFFECT)
      test_cnv_reduced$EFFECT <- as.factor(test_cnv_reduced$EFFECT)

      myboost <- ada(EFFECT ~ ., data = train_cnv_reduced, iter = 5)
      test_pred <- predict(myboost, test_cnv_reduced)

      # Compare labels as text: the two factors are built independently, and
      # `!=` on factors stops with "level sets of factors are different"
      # whenever the test split happens to contain a single class.
      test_error[j] <- mean(
        as.character(test_pred) != as.character(test_cnv_reduced[["EFFECT"]])
      )
    }

    resultmat[code, k] <- 1 - mean(test_error)
  }
}
toc()