
Commit

initial commit
mscsep committed Jul 1, 2021
1 parent a5d29db commit 7c1195d
Showing 12 changed files with 1,431 additions and 0 deletions.
13 changes: 13 additions & 0 deletions OIC_meta.Rproj
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: No
AlwaysSaveHistory: No

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
141 changes: 141 additions & 0 deletions R/Flowchart.R
@@ -0,0 +1,141 @@
# Meta-analysis of the rodent object-in-context task: Get flowchart numbers
# Written by Milou Sep

rm(list=ls())
library(dplyr)
library(osfr) # to interact with Open Science Framework
# instructions: https://github.com/CenterForOpenScience/osfr
# Authenticate to OSF (see http://centerforopenscience.github.io/osfr/articles/auth.html) by calling osf_auth("PAT") in the console (the PAT is a personal access token generated on OSF).
library(readxl)
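
# A minimal sketch of the one-time OSF authentication step (illustrative, not part
# of the original script): create a personal access token (PAT) on osf.io and pass
# it to osf_auth(). The string below is a placeholder, not a real token.
# osf_auth("YOUR_OSF_PAT")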


# retrieve OSF files ------------------------------------------------------
# search hits
osf_retrieve_file("dvwhn") %>% osf_download(path = "data", conflicts="overwrite") # search hits for the thesis search string (MV)
osf_retrieve_file("umz5k") %>% osf_download(path = "data", conflicts="overwrite") # search hits for the new search string (25.5.20)
# Screening
osf_retrieve_file("j7sfm") %>% osf_download(path = "data", conflicts="overwrite") # screening for the thesis search string (MV)
osf_retrieve_file("bz3sj") %>% osf_download(path = "data", conflicts="overwrite") # screening for the new search string
# data extraction file
osf_retrieve_file("qadch") %>% osf_download(path = "data", conflicts="overwrite")

# Load data ---------------------------------------------------------------
# search documents (PMIDs)
search1.thesis <- read.table("data/hits.search.thesis.MV.txt", quote="\"", comment.char="")
search2.meta <- read.table("data/hits.new.search.meta.oic.v25.5.20.txt", quote="\"", comment.char="")
# screening documents
screeing.search1 <-read.csv2("data/Screening S1 thesis search PMIDs.csv", na.strings = c(""))
screeing.search2 <-read.csv2("data/Screening S2 new.in.new.search.PMIDs.csv", na.strings = c(""))
# data extraction file
read_excel("data/280121_Data_Extraction_RoB.xlsx", sheet = "Extraction_converted", na=c("N/A", "#VALUE!"))->data


# Compare PMIDs -----------------------------------------------------------
search1.thesis$V1[!search1.thesis$V1 %in% search2.meta$V1]-> not.in.new
search2.meta$V1[!search2.meta$V1 %in% search1.thesis$V1]->new.in.new
search2.meta$V1[search2.meta$V1 %in% search1.thesis$V1]->old.in.new

# Count hits and screened papers
nrow(search2.meta) #253
nrow(search1.thesis) # 54
nrow(screeing.search1) # 54
length(new.in.new) #219
nrow(screeing.search2) # 219 (so correct)
length(not.in.new) # 20
length(old.in.new) # 34
#new in new + old in new
219+34 # =253
# not in new + old in new
20+34 # = 54


# Counts for Flowchart ----------------------------------------------------
# Total
screeing.search1 %>% filter(PMID %in% old.in.new) %>% nrow()# 34
screeing.search2 %>% filter(PMID %in% new.in.new) %>% nrow() # 219
34+219 #= 253


# Counts search 1 (thesis) ------------------------------------------------
# str(screeing.search1)

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(inclusie_july2020 == "yes") %>% nrow() # 30 included

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(full.text.checked.july2020 == "no") %>% #nrow()# 26
filter(inclusie_july2020 == "yes") %>% nrow()# 26 included without full text check

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(full.text.checked.july2020 == "yes") %>% #nrow()# 8 full text checked
filter(inclusie_july2020 == "yes") # of these, 4 included (so 4 excluded)

screeing.search1 %>%
filter(PMID %in% old.in.new) %>%
filter(inclusie_july2020 == "no?" | inclusie_july2020 == "no") # 4 excluded (reason for all: no OIC)


# Counts search 2 (new) ---------------------------------------------------
# str(screeing.search2)

#total
screeing.search2 %>% filter(PMID %in% new.in.new) %>%
filter(final.inclusion.screening == "Yes") %>% nrow() # 10 included

#included without full text check
screeing.search2 %>% filter(PMID %in% new.in.new) %>% #View()
filter(full.text.checked.MS == "no") %>% #nrow()# 140 full text NOT checked
filter(final.inclusion.screening == "Yes") # of these, 10 included

# excluded without full text check in s2 (+ reasons for exclusion)
screeing.search2 %>% filter(PMID %in% new.in.new) %>% #View()
filter(full.text.checked.MS == "no") %>%
filter(final.inclusion.screening == "No") %>% #nrow()# 130
mutate(Reason.for.exclusion = factor(Reason.for.exclusion))%>%
select(Reason.for.exclusion) %>% table()

# included after full text check
screeing.search2 %>% filter(PMID %in% new.in.new) %>%
filter(full.text.checked.MS == "yes"| is.na(full.text.checked.MS)) %>% #nrow()# 79 full text checked
filter(final.inclusion.screening == "Yes") # of these, 0 included

# excluded after full text check in s2 (+ reason for exclusion)
screeing.search2 %>% filter(PMID %in% new.in.new) %>%
filter(full.text.checked.MS == "yes"| is.na(full.text.checked.MS)) %>%
filter(final.inclusion.screening == "No") %>% #nrow() # 79 excluded
mutate(Reason.for.exclusion = factor(Reason.for.exclusion))%>%
select(Reason.for.exclusion) %>% table()



# Unique inclusions thesis search + new search ----------------------------
30 + 10 # total 40 inclusions following screening


# Compare to numbers in data extraction file ------------------------------
data %>% distinct(PMID) %>%nrow() # 41 studies


# Difference inclusions vs data extraction: snowballing -------------------
screeing.search1 %>% filter(inclusie_july2020 == "yes") %>% select(PMID) ->inclusions.S1
screeing.search2 %>% filter(final.inclusion.screening == "Yes") %>% select(PMID)->inclusions.S2
c(inclusions.S1 , inclusions.S2) %>% unlist() -> inclusions

data %>% filter(!PMID %in% inclusions) # 1 identified via snowballing
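
# Reconciliation of the counts above: 40 inclusions from screening + 1 study
# identified via snowballing = 41 studies in the data extraction file
40 + 1 # = 41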

# Check: all inclusions from the screening files should be present in the dataset (expect an empty result)
inclusions[(!inclusions %in% data$PMID)]


# Total inclusions --------------------------------------------------------
# Of these 41 studies, 4 are included in the systematic review but excluded from the meta-analyses due to missing data. Also see the prepare_data.rmd script.
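
# Arithmetic for the flowchart, based on the note above:
41 - 4 # = 37 studies available for the meta-analyses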


# save file local ---------------------------------------------------------
# Note these files are also on OSF (https://osf.io/d9eu4/)
write.table(not.in.new, file = "processed_data/not.in.new.search.csv", sep = ';', row.names = F)
write.table(new.in.new, file = "processed_data/new.in.new.search.csv", sep = ';', row.names = F)
write.table(old.in.new, file = "processed_data/old.in.new.search.csv", sep = ';', row.names = F)
148 changes: 148 additions & 0 deletions R/MetaForest.Rmd
@@ -0,0 +1,148 @@
---
title: "Meta-analysis of the rodent object-in-context task: Random forest-based Meta-analysis (MetaForest)"
author: "Milou Sep"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: html_document
---

# Prepare environment
```{r setup, include=FALSE}
rm(list = ls()) #clean environment
# packages
library(dplyr)
library(caret) #for cross-validation
library(metaforest) #for exploratory analysis
# to check which packages are loaded
# (.packages())
```

# Metaforest
https://www.rdocumentation.org/packages/metaforest/versions/0.1.3

## Data preparation
```{r metaforest preparation}
readRDS("processed_data/data_with_effect_size.RDS") ->data6
str(data6)
# create dataset for MetaForest
data6 %>%
select( -c(PMID, EXP_group,
SmellCntrl, # only 1 level, so non-informative
nC,
# other outcome variables
DR, DR_A, DR_B, SEM, SEM_DR_A, SEM_DR_B, SD, label,
no.Affiliation, # captures mostly the same variation as `PMID` variable (no.Affiliation: 29 levels, PMID: 37 levels)
Program # captures mostly the same variation as in the 'scoring' variable.
)) %>%
droplevels() -> datForest
is.na(datForest) %>% any()
```

## Save data for WS follow-up plots
```{r save data WS plots}
data6 %>% select(nC, ID, PMID)->sample.sizes
full_join(sample.sizes, datForest, by="ID")->datForest_for_WS_Plot
saveRDS(datForest_for_WS_Plot, "processed_data/datForest_for_WS_Plot.RDS")
```

## MetaForest: Tuning
Note that tuning assumes convergence has been reached. For setting up MetaForest with caret, also see: https://www.rdocumentation.org/packages/metaforest/versions/0.1.3/topics/ModelInfo_mf
```{r metaforest tuning}
set.seed(94372)
ModelInfo_mf()$notes # information on method from MetaForest
# Set up 9-fold grouped cross-validation
fit_control <- trainControl(method = "cv", number = 9, index = groupKFold(datForest$ID, k = 9))
# Set up a custom tuning grid for the three tuning parameters of MetaForest
rf_grid <- expand.grid(whichweights = c("random", "fixed", "unif"),
mtry = c(2, 4, 6),
min.node.size = c(2, 4, 6))
```

## MetaForest: Train Model
Cross-validated clustered MetaForest
```{r MetaForest training}
set.seed(94372)
# Train the model
cv.mf.cluster <- train(y = datForest$yi, x = datForest[names(datForest) !="yi"],
study = "ID",
method = ModelInfo_mf(), # the MetaForest method for caret (https://rdrr.io/cran/metaforest/man/MetaForest.html)
trControl = fit_control,
tuneGrid = rf_grid,
num.trees = 600,
data=datForest)
# Save the model
saveRDS(cv.mf.cluster,file="processed_data/fitted.MetaForest.RDS")
```

To load the saved model directly:
```{r eval=FALSE, include=FALSE}
readRDS("processed_data/fitted.MetaForest.RDS")->cv.mf.cluster
```

## MetaForest: Inspect model

### Tuning parameters
#### Show model: The final values used for the model were whichweights = unif, mtry = 4 and min.node.size = 2
```{r inspect model 1}
cv.mf.cluster
```
#### Plot grid parameter models
```{r inspect model 2}
plot(cv.mf.cluster)
```

### Model Summary
```{r inspect model 3}
summary(cv.mf.cluster)
```

### R(oob)
https://towardsdatascience.com/what-is-out-of-bag-oob-score-in-random-forest-a7fa23d710
```{r inspect model 4}
cv.mf.cluster$finalModel
```
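
A hedged sketch (not part of the original analysis) of how the out-of-bag R² could be extracted programmatically, assuming the fitted MetaForest object stores its underlying ranger forest in `$forest` (as in metaforest 0.1.3):
```{r oob r squared, eval=FALSE}
# Assumption: finalModel is a MetaForest object; its $forest element is the
# underlying ranger fit, which stores the out-of-bag R^2 in $r.squared.
cv.mf.cluster$finalModel$forest$r.squared
```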

### Cross validated R2 with SD
Details for the best model: whichweights = unif, mtry = 4, min.node.size = 2.
```{r inspect model 5}
cv.mf.cluster$results[which(cv.mf.cluster$results$whichweights == "unif" &
cv.mf.cluster$results$mtry == 4 &
cv.mf.cluster$results$min.node.size == 2),]
```

### Convergence plot
The convergence plot indicates good model convergence.
```{r convergence plot}
jpeg(file="results/metaforest_convergencePlot.jpeg", height = 3000, width = 3000, res=500)
plot(cv.mf.cluster$finalModel)
dev.off()
```

## Variable importance (based on MetaForest)
```{r variable importance plot}
tiff(file="results/metaforest_varImportance.tiff", height=1500, width=1500, pointsize=8, res=300)
VarImpPlot(cv.mf.cluster$finalModel)+
geom_hline(yintercept=11.5) +
annotate(geom="text", x=0.0017, y=12.5, label="Top 50%", color="darkblue")
dev.off()
```

## Export important variables
```{r metaforest varimportance scores}
varImp(cv.mf.cluster) ->imp.scores
# Export variable importance scores (ordered) to csv
imp <- as.data.frame(imp.scores$importance)
imp <- data.frame(overall = imp$Overall,
names = rownames(imp))
write.csv2(imp[order(imp$overall,decreasing = T),],"results/important_variables_metaforest.csv")
# Select top 50% most important variables
imp[order(imp$overall,decreasing = T),] %>%
slice_head(.,n=nrow(.) / 2) -> important.vars
important.vars$names ->variable
variable
# Save top 50% most important variables
saveRDS(variable,"processed_data/important_variables.RDS")
```
