In [None]:
# Load essential libraries
knitr::opts_chunk$set(echo = TRUE)
library(jsonlite)
library(dplyr)
library(tidyr)
library(ggplot2)
# GGally is installed on demand. requireNamespace() only tests whether the
# package is available (it does not attach it), so the library() call below
# is the single place where GGally gets attached.
if (!requireNamespace("GGally", quietly = TRUE)) {
    install.packages("GGally")
}
library(GGally)


## Get study IDs associated with Bio-Scales sites using API



In [None]:
# Root of the API; reused by the biosample queries further down.
base_url <- "https://api.microbiomedata.org"

# Ask the studies endpoint for entries whose title matches "Bio-Scales"
# (uses the endpoint's `title.search` filter).
studies_url <- paste0(base_url, "/studies?filter=title.search:Bio-Scales")
studies_response <- fromJSON(studies_url)

# Keep only the `id` field of the returned results.
studies_found <- studies_response[["results"]]
study_ids <- studies_found[["id"]]
print(study_ids)


## Using the study IDs, pull out the biosample records

Note that we pull 100 records at a time until we have retrieved all biosamples for the first of the study IDs returned above, and we place the data retrieved for each biosample into a tibble.


In [None]:
# Retrieve every biosample that is part of the first study returned above,
# paging through the /biosamples endpoint, and combine the pages into one
# tibble (`dat_all`).
study_id <- study_ids[1]

# Set up query
per_page <- 100 # number of records to retrieve per page
filt <- paste0("part_of:", study_id) # only biosamples from this study

page_tables <- list() # one data frame per retrieved page
n_retrieved <- 0 # running total of rows retrieved so far
page <- 1 # page number to retrieve
repeat {
    # construct the url for the query
    page_url <- paste0(
        base_url,
        "/biosamples?filter=", filt,
        "&per_page=", per_page,
        "&page=", page
    )
    # get the data
    page_response <- fromJSON(page_url)
    page_results <- as.data.frame(page_response[["results"]])
    page_tables[[page]] <- page_results
    n_retrieved <- n_retrieved + nrow(page_results)

    # `[[` extracts the count as a plain scalar (single-bracket indexing
    # would return a one-element list).
    total_count <- page_response[["meta"]][["count"]]

    # Stop once everything reported by the server has been retrieved. Also
    # stop on an empty page or a missing count, so that a response that does
    # not match the reported total cannot cause an endless loop.
    if (nrow(page_results) == 0 ||
        is.null(total_count) ||
        n_retrieved >= total_count) {
        break
    }
    page <- page + 1
}

# Bind all pages in one step rather than growing the table inside the loop.
dat_all <- as_tibble(bind_rows(page_tables))

glimpse(dat_all)


## Clean up results
Pull out biosample_id and associated chemical metadata; unnest as needed


In [None]:
# Chemical measurement fields to carry forward from the biosample records.
chem_fields <- c(
    "calcium", "magnesium", "potassium",
    "nitrate_nitrogen", "manganese", "zinc"
)

# Keep the biosample id, ecosystem subtype and the chemical fields, unnest
# them (sub-fields are named "<field>_<subfield>"), then keep only the
# numeric-value and unit columns alongside the identifiers.
df <- dat_all %>%
    select(id, ecosystem_subtype, all_of(chem_fields)) %>%
    unnest(
        cols = c(ecosystem_subtype, all_of(chem_fields)),
        names_sep = "_"
    ) %>%
    select(
        id,
        ecosystem_subtype,
        contains("has_numeric_value"),
        contains("has_unit")
    )
glimpse(df)


## Unit check
Check that all units are the same before dropping from dataframe


In [None]:
# List the distinct combinations of values found in the unit columns.
# A single row in the result means each unit column holds one value
# throughout, so the unit columns can safely be dropped afterwards.
unit_check <- distinct(select(df, contains("has_unit")))

glimpse(unit_check)


## More dataframe cleaning
Since all units are the same, drop the unit columns from the dataframe; rename the remaining columns for easier reading and plotting.


In [None]:
# Give the numeric-value columns short names (new name = old name), then
# discard the unit columns.
df <- df %>%
    rename(all_of(c(
        calcium = "calcium_has_numeric_value",
        magnesium = "magnesium_has_numeric_value",
        potassium = "potassium_has_numeric_value",
        nitrate = "nitrate_nitrogen_has_numeric_value",
        manganese = "manganese_has_numeric_value",
        zinc = "zinc_has_numeric_value"
    ))) %>%
    select(-contains("has_unit"))
glimpse(df)


## Plot chemical data in a correlation matrix
Create paired correlation matrix using GGally package's [ggpairs function](https://ggobi.github.io/ggally/articles/ggpairs.html)


In [None]:
# Pairwise correlation matrix of the chemical measurements.
# Columns are selected by name rather than by position so that all six
# chemicals (including zinc, the last column) are plotted and the plot does
# not depend on the column order of `df`.
g <- ggpairs(
    df,
    columns = c(
        "calcium", "magnesium", "potassium",
        "nitrate", "manganese", "zinc"
    ),
    title = "Correlation Matrix of Chemicals in Bio-Scales Data",
    # lower triangle: scatter plots; upper triangle: correlation coefficients
    lower = list(continuous = wrap("points", alpha = 0.5, size = 0.7)),
    upper = list(continuous = wrap("cor", size = 3))
) +
    theme_bw() +
    labs(
        x = "Chemical Concentration (mg/kg)",
        y = "Chemical Concentration (mg/kg)"
    )
g
