# Example 1

Plot soil pH over time for each site in the NEON soil metagenome study.

In [1]:
import requests
import json
import pandas as pd
import folium
import altair as alt
from altair import Chart
from datetime import datetime

In [41]:
# Look up the Study ID for the NEON soil metagenome DP1.10107.001 project
base_url = "https://api.microbiomedata.org"
filt = "name.search:DP1.10107.001"
url = f"{base_url}/studies"
# Let requests build and percent-encode the query string
resp = requests.get(url, params={"filter": filt})
# Stop here on an HTTP error rather than on a confusing KeyError below
resp.raise_for_status()
studies = resp.json()['results']
study_id = [study['id'] for study in studies]
# The cells below filter biosamples by a single study id, so require exactly
# one match instead of silently concatenating several ids (or using an empty
# string when nothing matched).
if len(study_id) != 1:
    raise ValueError(
        f"Expected exactly one study matching '{filt}', found {len(study_id)}: {study_id}"
    )
study = study_id[0]
print(study)

nmdc:sty-11-34xj1150


In [42]:
# Get all biosamples for the study found above using the biosamples endpoint,
# filtered by study id.
# Select only fields of interest (e.g. ph, water_content, env_medium, etc.)
# Print the total number of biosamples
per_page = 2000
fields = "ph,collection_date.has_raw_value,env_medium,geo_loc_name,lat_lon,water_content,soil_horizon,elev"
filt = f"part_of:{study}"
cursor = "*"
all_results = []
url = f"{base_url}/biosamples"

# Use cursor pagination to get results: start with "*", then keep requesting
# with the cursor returned by the previous page until none is returned.
while True:
    # Pass the query as params so requests percent-encodes every value,
    # including the opaque cursor token issued by the server.
    params = {"filter": filt, "per_page": per_page, "cursor": cursor, "fields": fields}
    resp = requests.get(url, params=params)
    # Surface HTTP errors immediately instead of failing on a missing key below
    resp.raise_for_status()
    data = resp.json()
    results = data["results"]
    all_results.extend(results)
    cursor = data["meta"].get("next_cursor")
    if not cursor:
        break

print(f"Total number of biosamples: {len(all_results)}")

Total number of biosamples: 4443


In [43]:
# Count the distinct values found across all biosamples:
#   - coordinates, as "latitude,longitude" strings
#   - geo locations (equivalent to NEON sites)
#   - collection dates (raw values)
coordinates = {
    f"{samp['lat_lon']['latitude']},{samp['lat_lon']['longitude']}"
    for samp in all_results
}
geo_locs = {samp['geo_loc_name']['has_raw_value'] for samp in all_results}
dates = {samp['collection_date']['has_raw_value'] for samp in all_results}

print(f"Number of NEON coordinates: {len(coordinates)}")
print(f"Number of geo locations (sites): {len(geo_locs)}")
print(f"Number of dates: {len(dates)}")

Number of NEON coordinates: 495
Number of geo locations (sites): 47
Number of dates: 4220


In [44]:
# Print the first biosample record to inspect the structure of the returned fields
print(all_results[0])

{'collection_date': {'has_raw_value': '2016-07-26T01:30Z'}, 'elev': 677.6, 'env_medium': {'term': {'id': 'ENVO:00001998', 'name': 'soil'}}, 'id': 'nmdc:bsm-11-002vgm56', 'ph': 6.04, 'soil_horizon': 'O horizon', 'water_content': ['2.667 g of water/g of dry soil'], 'geo_loc_name': {'has_raw_value': 'USA: Alaska, Healy'}, 'lat_lon': {'latitude': 63.875088, 'longitude': -149.210438}}


In [46]:
# Transform the raw biosample records into flat rows and convert them to a data frame
df_inp = []
water_content_units = "g of water/g of dry soil"
for biosamp in all_results:

    # Remove samples where water_content is not given. Without this guard the
    # value parsed for the previous sample would silently be reused for this one
    # (or a NameError would be raised if the very first sample lacked it).
    if 'water_content' not in biosamp:
        continue

    # water_content is a list of strings such as '2.667 g of water/g of dry soil':
    # strip the units and convert the remaining number to float
    water_content = float("".join(biosamp["water_content"]).replace(water_content_units, ""))

    # Get only year, month, and day from collection_date (remove times)
    date = datetime.strptime(biosamp["collection_date"]["has_raw_value"], '%Y-%m-%dT%H:%MZ')
    date = date.strftime('%Y-%m-%d')

    # pH is optional; keep the sample and record a missing value when absent
    ph = float(biosamp['ph']) if 'ph' in biosamp else None

    rec = {"id": biosamp["id"],
           "collection_date": date,
           "soil_horizon": biosamp["soil_horizon"],
           "water_content": water_content,
           "ph": ph,
           "elev": float(biosamp["elev"]),
           "location": biosamp["geo_loc_name"]["has_raw_value"],
           "latitude": biosamp["lat_lon"]["latitude"],
           "longitude": biosamp["lat_lon"]["longitude"]}

    df_inp.append(rec)

df = pd.DataFrame(df_inp)

df

Unnamed: 0,id,collection_date,soil_horizon,water_content,ph,elev,location,latitude,longitude
0,nmdc:bsm-11-002vgm56,2016-07-26,O horizon,2.667,6.04,677.6,"USA: Alaska, Healy",63.875088,-149.210438
1,nmdc:bsm-11-00dkyf35,2019-03-13,M horizon,0.113,6.65,381.8,"USA: California, San Joaquin Experimental Range",37.110011,-119.735218
2,nmdc:bsm-11-00hrxp98,2016-08-03,O horizon,0.992,3.90,199.7,"USA: Massachusetts, Harvard Forest & Quabbin W...",42.427091,-72.229737
3,nmdc:bsm-11-00m15h97,2020-06-23,M horizon,0.032,7.07,1649.3,"USA: Colorado, Central Plains Experimental Range",40.818371,-104.746715
4,nmdc:bsm-11-00yhef97,2016-07-26,M horizon,0.032,6.47,44.8,"USA: Georgia, The Jones Center At Ichauway",31.189774,-84.465861
...,...,...,...,...,...,...,...,...,...
4438,nmdc:bsm-11-zy2p7j24,2018-07-17,M horizon,0.302,7.01,405.7,"USA: Kansas, Konza Prairie Biological Station",39.103068,-96.563925
4439,nmdc:bsm-11-zyh2rm11,2020-07-28,M horizon,0.399,6.24,405.4,"USA: Kansas, Konza Prairie Biological Station",39.102192,-96.561180
4440,nmdc:bsm-11-zyhk8g66,2017-06-21,M horizon,0.306,7.15,272.5,"USA: Tennessee, Oak Ridge",35.957646,-84.261838
4441,nmdc:bsm-11-zzdpcm17,2020-06-03,M horizon,0.283,5.07,260.1,"USA: Tennessee, Oak Ridge",35.965875,-84.230646


In [47]:
# Bounding box of all sample coordinates; its midpoint is used to centre the map
min_lat, max_lat = df["latitude"].min(), df["latitude"].max()
min_lon, max_lon = df["longitude"].min(), df["longitude"].max()

def find_square_midpoint(min_lat, max_lon, max_lat, min_lon):
    """Return the midpoint of a latitude/longitude bounding box.

    Note the argument order: (min_lat, max_lon, max_lat, min_lon).

    Args:
        min_lat: Smallest latitude of the box, in degrees.
        max_lon: Largest longitude of the box, in degrees.
        max_lat: Largest latitude of the box, in degrees.
        min_lon: Smallest longitude of the box, in degrees.

    Returns:
        A ``(mid_lat, mid_lon)`` tuple of ints, each rounded to the nearest
        whole degree.
    """
    # calculate midpoint latitude
    mid_lat = (min_lat + max_lat) / 2

    # calculate midpoint longitude
    if abs(max_lon - min_lon) <= 180:
        mid_lon = (min_lon + max_lon) / 2
    else:
        # The box is treated as crossing the 180 degree meridian: shift the
        # western edge east by 360 degrees, average, then wrap the result back
        # into [-180, 180). Wrapping after the division keeps the result
        # correct when min_lon + max_lon is negative.
        mid_lon = (min_lon + max_lon + 360) / 2
        mid_lon = (mid_lon + 180) % 360 - 180

    return int(round(mid_lat, 0)), int(round(mid_lon, 0))

# Centre of the bounding box, as whole-degree (latitude, longitude)
mid_coords = find_square_midpoint(
    min_lat=min_lat,
    max_lon=max_lon,
    max_lat=max_lat,
    min_lon=min_lon,
)
print(mid_coords)

(45, -112)


In [48]:
# Create map of NEON sites, centred on the midpoint of all sample coordinates
m = folium.Map(location=mid_coords, zoom_start=3)

# Group the data frame by site (e.g. location) and keep one frame per site
grouped = df.groupby('location')
result_dfs = {name: group_df.reset_index(drop=True) for name, group_df in grouped}

# Add one marker per location name (site). Coordinates within a site differ by
# a negligible amount, so the marker is placed at the mean latitude/longitude
# of the site's samples.
for name, site_df in result_dfs.items():
    chosen_lat = site_df["latitude"].mean()
    chosen_lon = site_df["longitude"].mean()

    # Create scatter plot of pH vs. time and add a linear regression.
    # collection_date is stored as 'YYYY-MM-DD' strings, so it is declared
    # temporal (":T") explicitly; otherwise it is treated as a nominal category
    # and the regression has no numeric x values to fit.
    scatter = Chart(site_df).mark_circle().encode(x="collection_date:T", y="ph")
    regression = scatter.transform_regression('collection_date', 'ph').mark_line()
    chart = (scatter + regression).properties(
        width=600, height=400, title=f'{name}: Change in soil pH over time'
    )

    # Add charts as popup for each NEON site on the map
    vega_lite = folium.VegaLite(chart, width="100%", height="100%")
    marker = folium.Marker([chosen_lat, chosen_lon])
    popup = folium.Popup()
    vega_lite.add_to(popup)
    popup.add_to(marker)
    marker.add_to(m)

m