
Commit

initial commit
mscsep committed Jul 1, 2021
1 parent a5d29db commit 7c1195d
Showing 12 changed files with 1,431 additions and 0 deletions.
13 changes: 13 additions & 0 deletions OIC_meta.Rproj
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: No
AlwaysSaveHistory: No

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
141 changes: 141 additions & 0 deletions R/Flowchart.R
@@ -0,0 +1,141 @@
# Meta-analysis of the rodent object-in-context task: Get flowchart numbers
# Written by Milou Sep

rm(list=ls())
library(dplyr)
library(osfr) # to interact with Open Science Framework
# instructions: https://github.com/CenterForOpenScience/osfr
# Authenticate to OSF (see http://centerforopenscience.github.io/osfr/articles/auth.html) by calling osf_auth("PAT") in the console (the PAT is a personal access token generated on OSF).
library(readxl)
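
# A minimal sketch of the one-time OSF authentication step (illustrative, not part
# of the original script): create a personal access token (PAT) on osf.io and pass
# it to osf_auth(). The string below is a placeholder, not a real token.
# osf_auth("YOUR_OSF_PAT")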


# retrieve OSF files ------------------------------------------------------
# search hits
osf_retrieve_file("dvwhn") %>% osf_download(path = "data", conflicts="overwrite") # search hits for the thesis search string (MV)
osf_retrieve_file("umz5k") %>% osf_download(path = "data", conflicts="overwrite") # search hits for the new search string (25.5.20)
# Screening
osf_retrieve_file("j7sfm") %>% osf_download(path = "data", conflicts="overwrite") # screening for the thesis search string (MV)
osf_retrieve_file("bz3sj") %>% osf_download(path = "data", conflicts="overwrite") # screening for the new search string
# data extraction file
osf_retrieve_file("qadch") %>% osf_download(path = "data", conflicts="overwrite")

# Load data ---------------------------------------------------------------
# search documents (PMIDs)
search1.thesis <- read.table("data/hits.search.thesis.MV.txt", quote="\"", comment.char="")
search2.meta <- read.table("data/hits.new.search.meta.oic.v25.5.20.txt", quote="\"", comment.char="")
# screening documents
screeing.search1 <-read.csv2("data/Screening S1 thesis search PMIDs.csv", na.strings = c(""))
screeing.search2 <-read.csv2("data/Screening S2 new.in.new.search.PMIDs.csv", na.strings = c(""))
# data extraction file
read_excel("data/280121_Data_Extraction_RoB.xlsx", sheet = "Extraction_converted", na=c("N/A", "#VALUE!"))->data


# Compare PMIDs -----------------------------------------------------------
search1.thesis$V1[!search1.thesis$V1 %in% search2.meta$V1]-> not.in.new
search2.meta$V1[!search2.meta$V1 %in% search1.thesis$V1]->new.in.new
search2.meta$V1[search2.meta$V1 %in% search1.thesis$V1]->old.in.new

# Count hits and screened papers
nrow(search2.meta) #253
nrow(search1.thesis) # 54
nrow(screeing.search1) # 54
length(new.in.new) #219
nrow(screeing.search2) # 219 (so correct)
length(not.in.new) # 20
length(old.in.new) # 34
#new in new + old in new
219+34 # =253
# not in new + old in new
20+34 # = 54


# Counts for Flowchart ----------------------------------------------------
# Total
screeing.search1 %>% filter(PMID %in% old.in.new) %>% nrow()# 34
screeing.search2 %>% filter(PMID %in% new.in.new) %>% nrow() # 219
34+219 #= 253


# Counts search 1 (thesis) ------------------------------------------------
# str(screeing.search1)

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(inclusie_july2020 == "yes") %>% nrow() # 30 included

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(full.text.checked.july2020 == "no") %>% #nrow()# 26
filter(inclusie_july2020 == "yes") %>% nrow()# 26 included without full text check

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(full.text.checked.july2020 == "yes") %>% #nrow()# 8 full text checked
filter(inclusie_july2020 == "yes") # of these, 4 included (so 4 excluded)

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(inclusie_july2020 == "no?" | inclusie_july2020 == "no") # 4 excluded (reason for all: no OIC)


# Counts search 2 (new) ---------------------------------------------------
# str(screeing.search2)

#total
screeing.search2 %>% filter(PMID %in% new.in.new) %>%
filter(final.inclusion.screening == "Yes") %>% nrow() # 10 included

#included without full text check
screeing.search2 %>% filter(PMID %in% new.in.new) %>% #View()
filter(full.text.checked.MS == "no") %>% #nrow()# 140 full text NOT checked
filter(final.inclusion.screening == "Yes") # of these, 10 included

# excluded without full text check in s2 (+ reasons for exclusion)
screeing.search2 %>% filter(PMID %in% new.in.new) %>% #View()
filter(full.text.checked.MS == "no") %>%
filter(final.inclusion.screening == "No") %>% #nrow()# 130
mutate(Reason.for.exclusion = factor(Reason.for.exclusion))%>%
select(Reason.for.exclusion) %>% table()

# included after full text check
screeing.search2 %>% filter(PMID %in% new.in.new) %>%
filter(full.text.checked.MS == "yes"| is.na(full.text.checked.MS)) %>% #nrow()# 79 full text checked
filter(final.inclusion.screening == "Yes") # of these, 0 included

# excluded after full text check in s2 (+ reason for exclusion)
screeing.search2 %>% filter(PMID %in% new.in.new) %>%
filter(full.text.checked.MS == "yes"| is.na(full.text.checked.MS)) %>%
filter(final.inclusion.screening == "No") %>% #nrow() # 79 excluded
mutate(Reason.for.exclusion = factor(Reason.for.exclusion))%>%
select(Reason.for.exclusion) %>% table()



# Unique inclusions thesis search + new search ----------------------------
30 + 10 # total 40 inclusions following screening


# Compare to numbers in data extraction file ------------------------------
data %>% distinct(PMID) %>%nrow() # 41 studies


# Difference inclusions vs data extraction: snowballing -------------------
screeing.search1 %>% filter(inclusie_july2020 == "yes") %>% select(PMID) ->inclusions.S1
screeing.search2 %>% filter(final.inclusion.screening == "Yes") %>% select(PMID)->inclusions.S2
c(inclusions.S1 , inclusions.S2) %>% unlist() -> inclusions

data %>% filter(!PMID %in% inclusions) # 1 identified via snowballing
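
# Reconciliation of the counts above: 40 inclusions from screening + 1 study
# identified via snowballing = 41 studies in the data extraction file
40 + 1 # = 41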

# Check: all inclusions from the screening files should be present in the dataset (expect an empty result)
inclusions[(!inclusions %in% data$PMID)]


# Total inclusions --------------------------------------------------------
# Of these 41 studies, 4 are included in the systematic review but excluded from the meta-analyses due to missing data. Also see the prepare_data.rmd script.
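
# Arithmetic for the flowchart, based on the note above:
41 - 4 # = 37 studies available for the meta-analyses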


# save file local ---------------------------------------------------------
# Note these files are also on OSF (https://osf.io/d9eu4/)
write.table(not.in.new, file = "processed_data/not.in.new.search.csv", sep = ';', row.names = F)
write.table(new.in.new, file = "processed_data/new.in.new.search.csv", sep = ';', row.names = F)
write.table(old.in.new, file = "processed_data/old.in.new.search.csv", sep = ';', row.names = F)
148 changes: 148 additions & 0 deletions R/MetaForest.Rmd
@@ -0,0 +1,148 @@
---
title: "Meta-analysis of the rodent object-in-context task: Random forest-based Meta-analysis (MetaForest)"
author: "Milou Sep"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: html_document
---

# Prepare environment
```{r setup, include=FALSE}
rm(list = ls()) #clean environment
# packages
library(dplyr)
library(caret) #for cross-validation
library(metaforest) #for exploratory analysis
# to check which packages are loaded
# (.packages())
```

# Metaforest
https://www.rdocumentation.org/packages/metaforest/versions/0.1.3

## Data preparation
```{r metaforest preparation}
readRDS("processed_data/data_with_effect_size.RDS") ->data6
str(data6)
# create dataset for MetaForest
data6 %>%
select( -c(PMID, EXP_group,
SmellCntrl, # only 1 level, so non-informative
nC,
# other outcome variables
DR, DR_A, DR_B, SEM, SEM_DR_A, SEM_DR_B, SD, label,
no.Affiliation, # captures mostly the same variation as `PMID` variable (no.Affiliation: 29 levels, PMID: 37 levels)
Program # captures mostly the same variation as in the 'scoring' variable.
)) %>%
droplevels() -> datForest
is.na(datForest) %>% any()
```

## Save data for WS follow-up plots
```{r save data WS plots}
data6 %>% select(nC, ID, PMID)->sample.sizes
full_join(sample.sizes, datForest, by="ID")->datForest_for_WS_Plot
saveRDS(datForest_for_WS_Plot, "processed_data/datForest_for_WS_Plot.RDS")
```

## MetaForest: Tuning
Note that tuning assumes convergence has been reached. For setting up MetaForest with caret, also see: https://www.rdocumentation.org/packages/metaforest/versions/0.1.3/topics/ModelInfo_mf
```{r metaforest tuning}
set.seed(94372)
ModelInfo_mf()$notes # information on method from MetaForest
# Set up 9-fold grouped cross-validation
fit_control <- trainControl(method = "cv", number = 9, index = groupKFold(datForest$ID, k = 9))
# Set up a custom tuning grid for the three tuning parameters of MetaForest
rf_grid <- expand.grid(whichweights = c("random", "fixed", "unif"),
mtry = c(2, 4, 6),
min.node.size = c(2, 4, 6))
```

## MetaForest: Train Model
Cross-validated clustered MetaForest
```{r MetaForest training}
set.seed(94372)
# Train the model
cv.mf.cluster <- train(y = datForest$yi, x = datForest[names(datForest) !="yi"],
study = "ID",
method = ModelInfo_mf(), # the MetaForest method for caret (https://rdrr.io/cran/metaforest/man/MetaForest.html)
trControl = fit_control,
tuneGrid = rf_grid,
num.trees = 600,
data=datForest)
# Save the model
saveRDS(cv.mf.cluster,file="processed_data/fitted.MetaForest.RDS")
```

To load the saved model directly:
```{r eval=FALSE, include=FALSE}
readRDS("processed_data/fitted.MetaForest.RDS")->cv.mf.cluster
```

## MetaForest: Inspect model

### Tuning parameters
#### Show model: The final values used for the model were whichweights = unif, mtry = 4 and min.node.size = 2
```{r inspect model 1}
cv.mf.cluster
```
#### Plot grid parameter models
```{r inspect model 2}
plot(cv.mf.cluster)
```

### Model Summary
```{r inspect model 3}
summary(cv.mf.cluster)
```

### R(oob)
https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710
```{r inspect model 4}
cv.mf.cluster$finalModel
```
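
A hedged sketch (not part of the original analysis) of how the out-of-bag R² could be extracted programmatically, assuming the fitted MetaForest object stores its underlying ranger forest in `$forest` (as in metaforest 0.1.3):
```{r oob r squared, eval=FALSE}
# Assumption: finalModel is a MetaForest object; its $forest element is the
# underlying ranger fit, which stores the out-of-bag R^2 in $r.squared.
cv.mf.cluster$finalModel$forest$r.squared
```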

### Cross validated R2 with SD
Details for the best model: whichweights = unif, mtry = 4, min.node.size = 2.
```{r inspect model 5}
cv.mf.cluster$results[which(cv.mf.cluster$results$whichweights == "unif" &
cv.mf.cluster$results$mtry == 4 &
cv.mf.cluster$results$min.node.size == 2),]
```

### Convergence plot
The convergence plot indicates good model convergence.
```{r convergence plot}
jpeg(file="results/metaforest_convergencePlot.jpeg", height = 3000, width = 3000, res=500)
plot(cv.mf.cluster$finalModel)
dev.off()
```

## Variable importance (based on MetaForest)
```{r variable importance plot}
tiff(file="results/metaforest_varImportance.tiff", height=1500, width=1500, pointsize=8, res=300)
VarImpPlot(cv.mf.cluster$finalModel)+
geom_hline(yintercept=11.5) +
annotate(geom="text", x=0.0017, y=12.5, label="Top 50%", color="darkblue")
dev.off()
```

## Export important variables
```{r metaforest varimportance scores}
varImp(cv.mf.cluster) ->imp.scores
# Export variable importance scores (ordered) to csv
imp <- as.data.frame(imp.scores$importance)
imp <- data.frame(overall = imp$Overall,
names = rownames(imp))
write.csv2(imp[order(imp$overall,decreasing = T),],"results/important_variables_metaforest.csv")
# Select top 50% most important variables
imp[order(imp$overall,decreasing = T),] %>%
slice_head(.,n=nrow(.) / 2) -> important.vars
important.vars$names ->variable
variable
# Save top 50% most important variables
saveRDS(variable,"processed_data/important_variables.RDS")
```
