# Example 1

Plot soil pH over time for the National Ecological Observatory Network (NEON) studies.

In [9]:
import requests
import json
import pandas as pd
import folium
import altair as alt
from altair import Chart


In [10]:
# Look up the NEON studies by name and collect their ids for the biosample queries.
base_url = "https://api.microbiomedata.org"
# The filter value is already percent-encoded
# ("name.search:National Ecological Observatory Network").
filt = "name.search%3ANational%20Ecological%20Observatory%20Network"
url = f"{base_url}/studies?filter={filt}"
print(url)
resp = requests.get(url)
# Surface HTTP errors here instead of failing later with a confusing
# KeyError / JSON decoding error on the response body.
resp.raise_for_status()
studies = resp.json()['results']
study_ids = []
for study in studies:
    print(study['id'])
    study_ids.append(study['id'])

https://api.microbiomedata.org/studies?filter=name.search%3ANational%20Ecological%20Observatory%20Network
nmdc:sty-11-34xj1150
nmdc:sty-11-hht5sb92
nmdc:sty-11-pzmd0x14


In [11]:
# Page through the biosamples of every study and keep the ones that report a pH.
per_page = 100
all_results = []
# NOTE(review): the space after "ph," is sent as-is; the recorded output shows the
# API still returns collection_date, so it is left untouched here -- confirm before
# normalizing.
fields = "ph, collection_date.has_raw_value,geo_loc_name,lat_lon"
for study_id in study_ids:
    filt = f"part_of:{study_id}"
    tot = 0   # number of biosamples seen so far for this study (with or without pH)
    page = 1
    while True:
        url = f"{base_url}/biosamples?filter={filt}&per_page={per_page}&page={page}&fields={fields}"
        resp = requests.get(url)
        # Fail fast on HTTP errors rather than on a missing 'results' key.
        resp.raise_for_status()
        data = resp.json()
        results = data['results']
        all_results.extend(samp for samp in results if 'ph' in samp)
        tot += len(results)
        # Stop once every record counted by the API has been seen. The empty-page
        # check prevents an endless loop if the reported count is never reached.
        if not results or tot >= data['meta']['count']:
            break
        page += 1

print(len(all_results))


4259


In [12]:
# Count the distinct coordinates, location names and collection dates
# among the biosamples that carry a pH value.
sites, geo_locs, dates = set(), set(), set()
for samp in all_results:
    coords = samp['lat_lon']
    sites.add(f"{coords['latitude']},{coords['longitude']}")
    geo_locs.add(samp['geo_loc_name']['has_raw_value'])
    dates.add(samp['collection_date']['has_raw_value'])

for label, values in (
    ("NEON sites", sites),
    ("geo locations", geo_locs),
    ("dates", dates),
):
    print(f"Number of {label}: {len(values)}")

Number of NEON sites: 490
Number of geo locations: 47
Number of dates: 4091


In [13]:
# Peek at the first collected biosample record to see which fields it carries.
print(all_results[0])

{'collection_date': {'has_raw_value': '2017-06-05T16:50Z'}, 'id': 'nmdc:bsm-11-06qrej20', 'ph': 5.51, 'geo_loc_name': {'has_raw_value': 'USA: Washington, Abby Road'}, 'lat_lon': {'latitude': 45.76858, 'longitude': -122.298736}}


In [14]:
# Flatten each biosample into one row: raw collection date, pH, site name, and the
# coordinates both as a combined "lat,lon" string and as separate numeric columns.
df_inp = [
    {
        "date": samp["collection_date"]["has_raw_value"],
        "ph": samp["ph"],
        "loc": samp["geo_loc_name"]["has_raw_value"],
        "lat_lon": f"{samp['lat_lon']['latitude']},{samp['lat_lon']['longitude']}",
        "latitude": samp["lat_lon"]["latitude"],
        "longitude": samp["lat_lon"]["longitude"],
    }
    for samp in all_results
]

df = pd.DataFrame(df_inp)
# Parse the raw date strings; no manual year/month/day splitting is needed
# (the previous unused split could raise on dates without three '-' parts).
df['date'] = pd.to_datetime(df['date'])

df


Unnamed: 0,date,ph,loc,lat_lon,latitude,longitude
0,2017-06-05 16:50:00+00:00,5.51,"USA: Washington, Abby Road","45.76858,-122.298736",45.768580,-122.298736
1,2017-06-05 17:07:00+00:00,5.58,"USA: Washington, Abby Road","45.76858,-122.298736",45.768580,-122.298736
2,2017-06-05 17:47:00+00:00,5.53,"USA: Washington, Abby Road","45.76858,-122.298736",45.768580,-122.298736
3,2017-06-05 19:46:00+00:00,5.59,"USA: Washington, Abby Road","45.754053,-122.298512",45.754053,-122.298512
4,2017-06-05 20:09:00+00:00,4.70,"USA: Washington, Abby Road","45.754053,-122.298512",45.754053,-122.298512
...,...,...,...,...,...,...
4254,2021-07-08 19:18:00+00:00,6.32,"USA: Wyoming, Yellowstone National Park","44.954777,-110.533202",44.954777,-110.533202
4255,2021-07-08 18:42:00+00:00,6.52,"USA: Wyoming, Yellowstone National Park","44.954777,-110.533202",44.954777,-110.533202
4256,2021-07-08 21:39:00+00:00,6.41,"USA: Wyoming, Yellowstone National Park","44.948383,-110.631377",44.948383,-110.631377
4257,2021-07-08 20:44:00+00:00,6.48,"USA: Wyoming, Yellowstone National Park","44.948383,-110.631377",44.948383,-110.631377


In [15]:
# Extremes of the sample coordinates; their midpoint is used to center the map.
min_lat, max_lat = min(df["latitude"]), max(df["latitude"])
min_lon, max_lon = min(df["longitude"]), max(df["longitude"])

def find_square_midpoint(min_lat, max_lon, max_lat, min_lon):
    """Return the (latitude, longitude) midpoint of a bounding box, rounded to whole degrees.

    Note the argument order: ``min_lat, max_lon, max_lat, min_lon``.

    A longitude span of more than 180 degrees is treated as a box that crosses
    the 180-degree meridian, and the midpoint is then taken across that meridian.

    Returns:
        tuple[int, int]: ``(mid_lat, mid_lon)`` with ``mid_lon`` in ``[-180, 180)``
        for the meridian-crossing case.
    """
    # calculate midpoint latitude
    mid_lat = (min_lat + max_lat) / 2

    # calculate midpoint longitude
    if abs(max_lon - min_lon) <= 180:
        mid_lon = (min_lon + max_lon) / 2
    else:
        # Crossing the 180 degree meridian: shift the western (negative) edge by
        # +360 so both edges lie on one continuous axis, average them, then wrap
        # the result back into [-180, 180). Taking the modulo before halving (as
        # the earlier formula did) is wrong whenever min_lon + max_lon < 0.
        unwrapped_mid = (min_lon + max_lon + 360) / 2
        mid_lon = (unwrapped_mid + 180) % 360 - 180

    return int(round(mid_lat, 0)), int(round(mid_lon, 0))

# Center point for the map, passed by keyword because the helper's
# parameter order (min_lat, max_lon, max_lat, min_lon) is easy to mix up.
mid_coords = find_square_midpoint(
    min_lat=min_lat,
    max_lon=max_lon,
    max_lat=max_lat,
    min_lon=min_lon,
)
print(mid_coords)

(45, -112)


In [19]:
m = folium.Map(location=mid_coords, zoom_start=3)

# One DataFrame per site name (the 'loc' column), each re-indexed from 0.
grouped = df.groupby('loc')
result_dfs = {name: group_df.reset_index(drop=True) for name, group_df in grouped}

# One marker per site. The marker uses the first coordinates of the site's
# DataFrame, an arbitrary pick since coordinates within a site differ by a
# negligible amount. Its popup holds a pH-over-time chart for that site.
for name, site_df in result_dfs.items():
    chosen_lat = site_df['latitude'].iloc[0]
    chosen_lon = site_df['longitude'].iloc[0]

    # Scatter of pH against collection date, overlaid with a regression line.
    points = Chart(site_df).mark_circle().encode(x="date", y='ph')
    trend_line = points.transform_regression('date', 'ph').mark_line()
    chart = (points + trend_line).properties(
        width=600,
        height=400,
        title=f'{name}: Change in soil pH over time',
    )

    # Nest chart -> popup -> marker -> map.
    marker = folium.Marker([chosen_lat, chosen_lon])
    popup = folium.Popup()
    folium.VegaLite(chart, width="100%", height="100%").add_to(popup)
    popup.add_to(marker)
    marker.add_to(m)

display(m)