In [1]:
import csv
import os
import pprint
import random
import re

import pandas as pd
import requests
import requests_cache
from linkml_runtime.dumpers import yaml_dumper
from nmdc_schema.nmdc import FieldResearchSite, Biosample, ControlledIdentifiedTermValue, OntologyClass, QuantityValue, \
    GeolocationValue, TextValue

- https://data.neonscience.org/api/v0/releases/RELEASE-2023/data/package/DP1.10107.001/GUAN/2019-09?package=expanded
  - obtains a zip archive
- https://data.neonscience.org/api/v0/data/package/DP1.10107.001/GUAN/2019-09?package=expanded&release=RELEASE-2023
  - obtains a zip archive
- https://data.neonscience.org/api/v0/releases/RELEASE-2023/data/DP1.10107.001/GUAN/2019-09?package=expanded
  - obtains a JSON object with links to files
- https://data.neonscience.org/api/v0/releases/RELEASE-2023/data/DP1.10107.001/GUAN/2019-09/NEON.D04.GUAN.DP1.10107.001.mms_rawDataFiles.2019-09.expanded.20230113T225344Z.csv?package=expanded
  - obtains a CSV file,
  - but you have to know the exact name of the file


## Releases
- RELEASE-2023
- RELEASE-2022
- RELEASE-2021

## Some Domain Info
- https://www.geobabble.org/~hnw/neon/withindomainrep/
- https://www.geobabble.org/~hnw/neon/withindomainrep/neondomains2.html
- https://www.neonscience.org/field-sites/about-field-sites

## "File categories" for DP1.10107.001, "Soil microbe metagenome sequences"

```JSON
{'EML',
 'categoricalCodes',
 'mms_metagenomeDnaExtraction',
 'mms_metagenomeSequencing',
 'mms_rawDataFiles',
 'readme',
 'validation',
 'variables'}
```

Soil physical and chemical properties, periodic https://data.neonscience.org/data-products/DP1.10086.001

File naming conventions: https://www.neonscience.org/data-samples/data-management/data-formats-conventions

Named location details can be retrieved from https://data.neonscience.org/api/v0/locations/{namedLocation}, like https://data.neonscience.org/api/v0/locations/ABBY_004.basePlot.bgc

Note the availability of `locationParent` and `locationChildren`... could we just iterate over everything top down?

Another source of info:
- locationPropertyName	"Value for DEIMS-SDR Site ID"
- locationPropertyValue	"https://deims.org/0e39569d-ff81-4b78-9c34-b07309504d80"

https://deims.org/api/sites/0e39569d-ff81-4b78-9c34-b07309504d80

even more geospatially specific:  https://data.neonscience.org/api/v0/locations/ABBY_004.basePlot.bgc.21, but locations with that pattern may not be present in our data

## Sample lookups may be helpful:
- https://data.neonscience.org/api/v0/samples/classes?sampleTag=BART_004-M-3-28-20140819
- https://data.neonscience.org/api/v0/samples/view?sampleTag=BART_004-M-3-28-20140819&sampleClass=sls_soilCoreCollection_in.sampleID
- https://data.neonscience.org/api/v0/samples/download?sampleTag=BART_004-M-3-28-20140819&sampleClass=sls_soilCoreCollection_in.sampleID&degree=999

the neonUtilities R package provides a helpful table_types dataframe

----

consider the sample with a `uid` of	'8af5ccf9-e0bc-466d-bf36-357ba5adc3a0' in the `sls_soilCoreCollection` table

| sls_soilCoreCollection   column | value                                |
|---------------------------------|--------------------------------------|
| uid                             | 8af5ccf9-e0bc-466d-bf36-357ba5adc3a0 |
| domainID                        | D16                                  |
| siteID                          | ABBY                                 |
| plotID                          | ABBY_002                             |
| namedLocation                   | ABBY_002.basePlot.bgc                |
| plotType                        | distributed                          |
| nlcdClass                       | grasslandHerbaceous                  |
| subplotID                       |                                      |
| coreCoordinateX                 | 13                                   |
| coreCoordinateY                 | 4                                    |
| geodeticDatum                   | WGS84                                |
| decimalLatitude                 | 45.739055                            |
| decimalLongitude                | -122.309112                          |
| coordinateUncertainty           | 20.1                                 |
| elevation                       | 638.4                                |
| elevationUncertainty            | 0.1                                  |
| samplingProtocolVersion         | NEON.DOC.014048vG                    |
| startDate                       | 2016-10-04T17:10Z                    |
| collectDate                     | 2016-10-04T17:10Z                    |
| sampleTiming                    | dryWetTransition                     |
| biophysicalCriteria             |                                      |
| eventID                         |                                      |
| standingWaterDepth              | 0                                    |
| nTransBoutType                  | No                                   |
| boutType                        | microbes                             |
| samplingImpractical             |                                      |
| incubationMethod                |                                      |
| incubationCondition             |                                      |
| sampleID                        | ABBY_002-O-13-4-20161004             |
| sampleCode                      |                                      |
| toxicodendronPossible           |                                      |
| horizon                         | O                                    |
| horizonDetails                  |                                      |
| soilTemp                        | 9.8                                  |
| litterDepth                     | 1                                    |
| sampleTopDepth                  | 0                                    |
| sampleBottomDepth               | 8                                    |
| sampleExtent                    | entire                               |
| soilSamplingDevice              | brownie cutter                       |
| soilCoreCount                   | 2                                    |
| geneticSampleID                 | ABBY_002-O-13-4-20161004-GEN         |
| geneticSampleCode               |                                      |
| geneticSampleCondition          | OK                                   |
| geneticSamplePrepMethod         |                                      |
| geneticArchiveSample1ID         | ABBY_002-O-13-4-20161004-GA1         |
| geneticArchiveSample1Code       |                                      |
| geneticArchiveSample2ID         | ABBY_002-O-13-4-20161004-GA2         |
| geneticArchiveSample2Code       |                                      |
| geneticArchiveSample3ID         | ABBY_002-O-13-4-20161004-GA3         |
| geneticArchiveSample3Code       |                                      |
| geneticArchiveSample4ID         | ABBY_002-O-13-4-20161004-GA4         |
| geneticArchiveSample4Code       |                                      |
| geneticArchiveSample5ID         | ABBY_002-O-13-4-20161004-GA5         |
| geneticArchiveSample5Code       |                                      |
| geneticArchiveSamplePrepMethod  |                                      |
| geneticArchiveContainer         |                                      |
| biomassID                       |                                      |
| biomassCode                     |                                      |
| biomassSampleCondition          |                                      |
| remarks                         |                                      |
| collectedBy                     | dcrandall@neoninc.org                |
| dataQF                          |                                      |

The `sampleID` is 'ABBY_002-O-13-4-20161004'

https://data.neonscience.org/api/v0/samples/classes?sampleTag=ABBY_002-O-13-4-20161004

```json
{"data":{"sampleClasses":["sls_soilMoisture_in.moistureSampleID","sls_soilCoreCollection_in.sampleID"]}}
```

https://data.neonscience.org/api/v0/samples/download?sampleTag=ABBY_002-O-13-4-20161004&sampleClass=sls_soilCoreCollection_in.sampleID&degree=999

### omitting "sampleEvents"

```json
{
  "sampleClass": "sls_soilCoreCollection_in.sampleID",
  "sampleTag": "ABBY_002-O-13-4-20161004",
  "barcode": null,
  "archiveGuid": null,
  "sampleUuid": "b59a39c8-14d5-4405-ba4c-bb0d773706e4",
  "parentSampleIdentifiers": null,
  "childSampleIdentifiers": [
    {
      "sampleUuid": "0238c5f7-04c3-43e3-ba08-48bbed2baa23",
      "sampleTag": "ABBY_002-O-13-4-20161004",
      "sampleClass": "sls_soilMoisture_in.moistureSampleID",
      "barcode": null,
      "archiveGuid": null
    },
    {
      "sampleUuid": "7870c58d-138f-47ea-846e-f9de5f336586",
      "sampleTag": "ABBY_002-O-13-4-20161004-GA1",
      "sampleClass": "sls_soilCoreCollection_in.geneticArchiveSample1ID",
      "barcode": null,
      "archiveGuid": null
    },
    {
      "sampleUuid": "17b7ed77-abee-419e-b496-9e684440c8fd",
      "sampleTag": "ABBY_002-O-13-4-20161004-GA2",
      "sampleClass": "sls_soilCoreCollection_in.geneticArchiveSample2ID",
      "barcode": null,
      "archiveGuid": null
    },
    {
      "sampleUuid": "e8f4c6fe-2730-43d1-8034-220207ecd2e7",
      "sampleTag": "ABBY_002-O-13-4-20161004-GA3",
      "sampleClass": "sls_soilCoreCollection_in.geneticArchiveSample3ID",
      "barcode": null,
      "archiveGuid": null
    },
    {
      "sampleUuid": "c464c41f-6d14-4a28-b99b-95758c8fee7a",
      "sampleTag": "ABBY_002-O-13-4-20161004-GA4",
      "sampleClass": "sls_soilCoreCollection_in.geneticArchiveSample4ID",
      "barcode": null,
      "archiveGuid": null
    },
    {
      "sampleUuid": "d2c2bbab-a546-4558-b411-974fd473acc7",
      "sampleTag": "ABBY_002-O-13-4-20161004-GA5",
      "sampleClass": "sls_soilCoreCollection_in.geneticArchiveSample5ID",
      "barcode": null,
      "archiveGuid": null
    },
    {
      "sampleUuid": "4c54c342-2917-4812-bdf0-ff6700e8b7c5",
      "sampleTag": "ABBY_002-O-13-4-20161004-GEN",
      "sampleClass": "sls_soilCoreCollection_in.geneticSampleID",
      "barcode": null,
      "archiveGuid": null
    },
    {
      "sampleUuid": "49be492a-57a1-4300-b7d4-025d96be99b4",
      "sampleTag": "ABBY_002-O-13-4-20161004-PH",
      "sampleClass": "sls_soilpH_in.pHSampleID",
      "barcode": null,
      "archiveGuid": null
    }
  ]
}
```

what can we learn about

| sls_soilCoreCollection   column | value                                |
|---------------------------------|--------------------------------------|
| domainID                        | D16                                  |
| siteID                          | ABBY                                 |
| plotID                          | ABBY_002                             |
| namedLocation                   | ABBY_002.basePlot.bgc                |

domainID, siteID and namedLocation can be found with the location endpoint, https://data.neonscience.org/api/v0/locations/{location}

- domainID: little useful data
- siteID: could use 'USA: {Value for State Abbreviation}, {locationDescription}' for `geo_loc_name`
  - also useful: Value for HABITAT, Value for TypeEco, Value for TypeSoil
- namedLocation: could use locationDecimalLatitude, locationDecimalLongitude for `lat_lon`, locationElevation, Value for National Land Cover Database (2001), Value for Slope aspect, Value for Slope gradient, Value for Soil type order


note there is no `subplotID`... I could swear I've seen those in my research before

we can use `nlcdClass` either as is in some vegetation/land cover/land use slot, or infer one of the MIxS environmental triad slots (`env_broad_scale`, `env_local_scale`, `env_medium`) from it


slots to possibly populate:
- (controlled values) from https://microbiomedata.github.io/nmdc-schema/gold_path_field/
  - ecosystem
  - ecosystem_category
  - ecosystem_type
  - ecosystem_subtype
  - specific_ecosystem
- https://microbiomedata.github.io/nmdc-schema/cur_land_use/ (https://microbiomedata.github.io/nmdc-schema/CurLandUseEnum/)
- https://microbiomedata.github.io/nmdc-schema/cur_vegetation/ (TEXT VALUE)
- https://microbiomedata.github.io/nmdc-schema/depos_env/ https://microbiomedata.github.io/nmdc-schema/DeposEnvEnum/
- https://microbiomedata.github.io/nmdc-schema/env_broad_scale/ (subclasses of biome, http://purl.obolibrary.org/obo/ENVO_00000428)
- https://microbiomedata.github.io/nmdc-schema/env_local_scale/ ("EnvO terms which are of smaller spatial grain than your entry for env_broad_scale")
- https://microbiomedata.github.io/nmdc-schema/env_medium/ (subclass of 'environmental material', http://purl.obolibrary.org/obo/ENVO_00010483)
- https://microbiomedata.github.io/nmdc-schema/fao_class/ (https://microbiomedata.github.io/nmdc-schema/FaoClassEnum/)
- https://microbiomedata.github.io/nmdc-schema/local_class/ (TEXT VALUE)
- https://microbiomedata.github.io/nmdc-schema/soil_type/ (envo terms?)

might also want to look at neon's categorical codes files

still haven't taken any action on the DEIMS data

nlcd xml file:

it looks like nlcd data is present in EnvO

are all of the neon records tied to the same nlcd release?

In [2]:
# Install a SQLite-backed requests cache named 'neon_cache'; cached entries expire after 12 hours.
requests_cache.install_cache('neon_cache', backend='sqlite', expire_after=12 * 60 * 60)


In [3]:
# NEON release that every product/data request below is pinned to.
release_tag = "RELEASE-2023"

# Data product to retrieve.
# Alternative: "DP1.10107.001" (soil microbe metagenome sequences).
product_code = "DP1.10086.001"  # soil physical and chemical properties, periodic

In [4]:
# File categories (NEON table names) whose CSV files are downloaded and merged by the
# retrieval cell; files in any other category are only recorded, not read.
useful_file_categories = [
    f"mms_{table}"
    for table in ("metagenomeDnaExtraction", "metagenomeSequencing", "rawDataFiles")
] + [
    f"sls_{table}"
    for table in (
        "bgcSubsampling",
        "metagenomicsPooling",
        "soilChemistry",
        "soilCoreCollection",
        "soilMoisture",
        "soilpH",
    )
]

{ 'ntr_externalLab',
 'ntr_internalLab',
 'ntr_internalLabBlanks',

  'EML',
 'categoricalCodes',

 'readme',
 'sls_bgcSubsampling',
 'sls_metagenomicsPooling',
 'sls_soilChemistry',
 'sls_soilCoreCollection',
 'sls_soilMoisture',
 'sls_soilpH',
 'validation',
 'variables'}


{'EML',
 'categoricalCodes',
 'ntr_externalLab',
 'ntr_internalLab',
 'ntr_internalLabBlanks',
 'readme',
 'sls_bgcSubsampling',
 'sls_metagenomicsPooling',
 'sls_soilChemistry',
 'sls_soilCoreCollection',
 'sls_soilMoisture',
 'sls_soilpH',
 'validation',
 'variables'}

In [5]:
# neon_dl_dir = "/Users/MAM/Downloads/NEON_seq-metagenomic-microbe-soil"
# neon_dl_dir_tokens = neon_dl_dir.split("/")
# neon_basename = neon_dl_dir_tokens[-1]
# output_file = f"../local/{neon_basename}.tsv"


In [6]:
def write_dicts_to_tsv(data, filename):
    """Write a sequence of dictionaries to a tab-separated file with a header row.

    Args:
        data: Iterable of dictionaries, one per output row. The rows do not need
            to share exactly the same keys.
        filename: Path of the file to create or overwrite (written as UTF-8).

    The header is the union of all keys in first-seen order, so for rows that
    all share the same keys the output is identical to using the first row's
    keys. A key missing from a row is written as an empty cell. When `data` is
    empty (or holds only empty dictionaries) an empty file is written instead
    of raising IndexError.
    """
    rows = list(data)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        if not fieldnames:
            # Nothing to describe a header with; leave the file empty.
            return
        writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter='\t', restval='')
        writer.writeheader()
        writer.writerows(rows)

In [7]:
def remove_unwanted_characters(string):
    """Return `string` with every character outside A-Z, a-z and 0-9 removed."""
    return re.sub(r'[^A-Za-z0-9]', '', string)

In [8]:
# subdirectories = [name for name in os.listdir(neon_dl_dir) if os.path.isdir(os.path.join(neon_dl_dir, name))]
#
# len_list = []
# token_dict_list = []
# for i in subdirectories:
#     tokens = i.split(".")
#     len_list.append(len(tokens))
#     temp = {
#         "data_owner": tokens[0],
#         "geographical_domain": tokens[1],
#         "siteCode": tokens[2],
#         "productCode": f"{tokens[3]}.{tokens[4]}.{tokens[5]}",
#         "year-month": tokens[6],
#         "package": tokens[7],
#         "timestamp": tokens[8],
#         "releaseTag": tokens[9],
#     }
#     token_dict_list.append(temp)


In [9]:
# write_dicts_to_tsv(token_dict_list, output_file)

In [10]:
# first_token_dict = token_dict_list[0]
# pprint.pprint(first_token_dict)


In [11]:
# product_code = first_token_dict['productCode']
# release_tag = first_token_dict['releaseTag']


In [12]:
# Fetch the product record for the configured product and release, then keep its
# per-site entries; the retrieval cell reads 'availableDataUrls' from each of them.
url = (
    "https://data.neonscience.org/api/v0/products/"
    f"{product_code}?release={release_tag}"
)
response = requests.get(url).json()
site_wise = response["data"]["siteCodes"]

In [None]:
# ONLY FOR RETRIEVING TABLES
# is caching making this any faster? it does for the other API requests
#
# Walks every available data URL of every site in `site_wise`, records the file
# category of each listed file, and downloads the CSVs whose category is in
# `useful_file_categories`. One merged, de-duplicated CSV per category is then
# written to ../local/.

# todo add the ability to iterate over products
file_categories = set()
frames_by_category = {}

for i in site_wise:
    for j in i['availableDataUrls']:
        data_files_response = requests.get(j).json()
        if 'data' not in data_files_response or 'files' not in data_files_response['data']:
            print("Skipping", j)
            continue
        for k in data_files_response['data']['files']:
            data_file_name = k['name']
            print(data_file_name)
            data_file_name_tokens = data_file_name.split(".")
            # The file category is token index 6, so at least 7 tokens are required.
            # (The previous `< 6` guard let 6-token names through to an IndexError.)
            if len(data_file_name_tokens) < 7:
                print("Skipping", data_file_name)
                continue
            data_file_name_dict = {
                "data_owner": data_file_name_tokens[0],
                "geographical_domain": data_file_name_tokens[1],
                "siteCode": data_file_name_tokens[2],
                "productCode": f"{data_file_name_tokens[3]}.{data_file_name_tokens[4]}.{data_file_name_tokens[5]}",
                "file_category": data_file_name_tokens[6],
                # "timestamp": data_file_name_tokens[7],
                # "extension": data_file_name_tokens[8],
            }
            file_category = data_file_name_dict['file_category']

            file_categories.add(file_category)
            if file_category in useful_file_categories:
                # Collect the pieces and concatenate once per category afterwards;
                # concatenating inside the loop re-copies the growing frame every time.
                frames_by_category.setdefault(file_category, []).append(pd.read_csv(k['url']))

df_dict = {
    file_category: pd.concat(frames, ignore_index=True)
    for file_category, frames in frames_by_category.items()
}

for k, v in df_dict.items():
    print(k)
    deduped = v.drop_duplicates()
    deduped.to_csv(f"../local/{k}.csv", index=False)

pprint.pprint(file_categories)

In [13]:
# Reload the saved soil core collection table from disk. This is only required when the
# data frames created from the NEON requests (stored in df_dict) aren't still in memory;
# otherwise use:
#   sls_soilCoreCollection = df_dict['sls_soilCoreCollection']
core_collection_csv = "../local/sls_soilCoreCollection.csv"
sls_soilCoreCollection = pd.read_csv(core_collection_csv, low_memory=False)
print(sls_soilCoreCollection.shape)

# should have done deduplication BEFORE creating the dataframes or saving the files
#   WHY are we getting duplicate rows? (exactly 2x?)
# Shape is printed before and after so any rows dropped here are visible.
sls_soilCoreCollection = sls_soilCoreCollection.drop_duplicates()
print(sls_soilCoreCollection.shape)

(28723, 62)
(28723, 62)


In [14]:
# List the column names of the soil core collection table.
print(sls_soilCoreCollection.columns)

Index(['uid', 'domainID', 'siteID', 'plotID', 'namedLocation', 'plotType',
       'nlcdClass', 'subplotID', 'coreCoordinateX', 'coreCoordinateY',
       'geodeticDatum', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertainty', 'elevation', 'elevationUncertainty',
       'samplingProtocolVersion', 'startDate', 'collectDate', 'sampleTiming',
       'biophysicalCriteria', 'eventID', 'standingWaterDepth',
       'nTransBoutType', 'boutType', 'samplingImpractical', 'incubationMethod',
       'incubationCondition', 'sampleID', 'sampleCode',
       'toxicodendronPossible', 'horizon', 'horizonDetails', 'soilTemp',
       'litterDepth', 'sampleTopDepth', 'sampleBottomDepth', 'sampleExtent',
       'soilSamplingDevice', 'soilCoreCount', 'geneticSampleID',
       'geneticSampleCode', 'geneticSampleCondition',
       'geneticSamplePrepMethod', 'geneticArchiveSample1ID',
       'geneticArchiveSample1Code', 'geneticArchiveSample2ID',
       'geneticArchiveSample2Code', 'geneticArchiveS

In [15]:
# Preview the first rows of the soil core collection table.
print(sls_soilCoreCollection.head())

                                    uid domainID siteID    plotID  \
0  8af5ccf9-e0bc-466d-bf36-357ba5adc3a0      D16   ABBY  ABBY_002   
1  ba0fb254-9d65-40fe-bdde-6c9d9d7136c7      D16   ABBY  ABBY_002   
2  ebf65071-8b5e-4f25-8c11-324564274a5b      D16   ABBY  ABBY_002   
3  9d4ff542-12db-411c-be8e-f629c0813b38      D16   ABBY  ABBY_004   
4  f0d08cd0-7a54-4b8d-a0a0-9997dec91381      D16   ABBY  ABBY_004   

           namedLocation     plotType            nlcdClass  subplotID  \
0  ABBY_002.basePlot.bgc  distributed  grasslandHerbaceous        NaN   
1  ABBY_002.basePlot.bgc  distributed  grasslandHerbaceous        NaN   
2  ABBY_002.basePlot.bgc  distributed  grasslandHerbaceous        NaN   
3  ABBY_004.basePlot.bgc  distributed           shrubScrub        NaN   
4  ABBY_004.basePlot.bgc  distributed           shrubScrub        NaN   

   coreCoordinateX  coreCoordinateY  ...           geneticArchiveSample5ID  \
0             13.0              4.0  ...      ABBY_002-O-13-4-201610

In [16]:
# print(sls_soilCoreCollection['domainID'].value_counts())

In [17]:
# The NEON API token is a personal credential and must not be stored in the notebook.
# Export it before starting the kernel, e.g. `export NEON_API_TOKEN=...`.
# NOTE(review): a token was previously committed here in plain text; it should be
# treated as exposed and rotated.
token = os.environ.get("NEON_API_TOKEN", "")

# Without a token the requests are sent with no X-API-Token header at all.
# NOTE(review): presumably unauthenticated requests are still served, at a lower rate limit — confirm.
headers = {'X-API-Token': token} if token else {}

url_base = "https://data.neonscience.org/api/v0/locations"

In [18]:
# Build one FieldResearchSite per NEON domain that occurs in the soil core table,
# using the name and description returned by the locations endpoint.
unique_domains = sls_soilCoreCollection['domainID'].unique()
unique_domains.sort()
domains = []

for domain_code in unique_domains:
    print(domain_code)
    url = f"{url_base}/{domain_code}"
    response = requests.get(url, headers=headers).json()
    domain_record = response['data']
    domain_id = f"nmdc:frsite-99-{domain_record['domainCode']}"
    domains.append(
        FieldResearchSite(
            id=domain_id,
            name=domain_record['locationName'],
            description=domain_record['locationDescription'],
        )
    )
# To inspect the result: print(yaml_dumper.dumps(domains))

D01
D02
D03
D04
D05
D06
D07
D08
D09
D10
D11
D12
D13
D14
D15
D16
D17
D18
D19
D20


In [19]:
# print(yaml_dumper.dumps(sites))

In [20]:
# Build one FieldResearchSite per named location (plot) that occurs in the soil core
# table. Each is linked to its parent site through `part_of` and annotated with the
# land cover and soil order reported in the location's properties, when present.
unique_named_locations = sls_soilCoreCollection['namedLocation'].unique()
unique_named_locations.sort()
named_locations = []
location_count = len(unique_named_locations)
property_prefix = "Value for "

for idx, i in enumerate(unique_named_locations):
    print(f"{i} = {idx + 1} of {location_count}")
    url = f"{url_base}/{i}"
    response = requests.get(url, headers=headers).json()

    # Check for the payload BEFORE reading from it; the id used to be built from
    # response['data'] ahead of this guard, so a missing payload raised KeyError
    # instead of being skipped.
    if 'data' not in response:
        print("Skipping", i)
        continue
    location_data = response['data']

    location_id = f"nmdc:frsite-99-{remove_unwanted_characters(location_data['locationName'])}"
    location = FieldResearchSite(
        id=location_id,
        name=location_data['locationName'],
        description=location_data['locationDescription'],
    )
    if 'siteCode' in location_data:
        site_id = f"nmdc:frsite-99-{location_data['siteCode']}"
        location.part_of = [site_id]
    else:
        print("No siteCode for", i)

    # locationProperties is a list of {locationPropertyName, locationPropertyValue}
    # dicts; flatten it into name -> value, dropping the "Value for " prefix and
    # skipping entries without a usable name or value.
    location_properties_dict = {}
    for prop in location_data['locationProperties']:
        property_name = prop.get('locationPropertyName')
        property_value = prop.get('locationPropertyValue')
        if property_name is None or not property_value:
            continue
        if property_name.startswith(property_prefix):
            succinct_name = property_name[len(property_prefix):]
        else:
            succinct_name = property_name
        location_properties_dict[succinct_name] = property_value

    if "National Land Cover Database (2001)" in location_properties_dict:
        location.cur_vegetation = TextValue(
            has_raw_value=location_properties_dict["National Land Cover Database (2001)"])

    if "Soil type order" in location_properties_dict:
        location.local_class = TextValue(has_raw_value=location_properties_dict["Soil type order"])

    named_locations.append(location)


ABBY_001.basePlot.bgc = 1 of 549
ABBY_002.basePlot.bgc = 2 of 549
ABBY_003.basePlot.bgc = 3 of 549
ABBY_004.basePlot.bgc = 4 of 549
ABBY_006.basePlot.bgc = 5 of 549
ABBY_023.basePlot.bgc = 6 of 549
ABBY_061.basePlot.bgc = 7 of 549
ABBY_062.basePlot.bgc = 8 of 549
ABBY_063.basePlot.bgc = 9 of 549
ABBY_070.basePlot.bgc = 10 of 549
BARR_001.basePlot.bgc = 11 of 549
BARR_002.basePlot.bgc = 12 of 549
BARR_003.basePlot.bgc = 13 of 549
BARR_004.basePlot.bgc = 14 of 549
BARR_005.basePlot.bgc = 15 of 549
BARR_006.basePlot.bgc = 16 of 549
BARR_051.basePlot.bgc = 17 of 549
BARR_052.basePlot.bgc = 18 of 549
BARR_053.basePlot.bgc = 19 of 549
BARR_054.basePlot.bgc = 20 of 549
BART_001.basePlot.bgc = 21 of 549
BART_002.basePlot.bgc = 22 of 549
BART_003.basePlot.bgc = 23 of 549
BART_004.basePlot.bgc = 24 of 549
BART_005.basePlot.bgc = 25 of 549
BART_006.basePlot.bgc = 26 of 549
BART_023.basePlot.bgc = 27 of 549
BART_024.basePlot.bgc = 28 of 549
BART_028.basePlot.bgc = 29 of 549
BART_032.basePlot.bgc =

In [21]:
# print(yaml_dumper.dumps(named_locations))

In [22]:
# Build one FieldResearchSite per NEON site that occurs in the soil core table. Each
# is linked to its domain through `part_of` and annotated with elevation, coordinates
# and the NEON-specific properties reported by the locations endpoint.
unique_sites = sls_soilCoreCollection['siteID'].unique()
unique_sites.sort()
sites = []
property_prefix = "Value for "
deims_api_prefix = "https://deims.org/api/sites"

for i in unique_sites:
    print(i)
    url = f"{url_base}/{i}"
    response = requests.get(url, headers=headers).json()

    # Skip responses without a payload, as the named-location cell does.
    if 'data' not in response:
        print("Skipping", i)
        continue
    site_data = response['data']

    site_id = f"nmdc:frsite-99-{site_data['siteCode']}"
    site = FieldResearchSite(
        id=site_id,
        name=site_data['locationName'],
        description=site_data['locationDescription'],
    )
    if 'domainCode' in site_data:
        domain_id = f"nmdc:frsite-99-{site_data['domainCode']}"
        site.part_of = [domain_id]
    if 'locationElevation' in site_data:
        site.elev = QuantityValue(
            has_numeric_value=site_data['locationElevation'],
            has_unit="meters")
    if 'locationDecimalLatitude' in site_data and 'locationDecimalLongitude' in site_data:
        site.lat_lon = GeolocationValue(
            latitude=site_data['locationDecimalLatitude'],
            longitude=site_data['locationDecimalLongitude'],
        )

    # locationProperties is a list of {locationPropertyName, locationPropertyValue}
    # dicts; flatten it into name -> value, dropping the "Value for " prefix and
    # skipping entries without a usable name or value.
    location_properties_dict = {}
    for prop in site_data['locationProperties']:
        property_name = prop.get('locationPropertyName')
        property_value = prop.get('locationPropertyValue')
        if property_name is None or not property_value:
            continue
        if property_name.startswith(property_prefix):
            succinct_name = property_name[len(property_prefix):]
        else:
            succinct_name = property_name
        location_properties_dict[succinct_name] = property_value

    if 'Country' in location_properties_dict and 'State Abbreviation' in location_properties_dict and 'locationDescription' in \
            site_data:
        geo_loc_name_obj = TextValue(
            has_raw_value=f"{location_properties_dict['Country']}: {location_properties_dict['State Abbreviation']}, {site_data['locationDescription']}")
        site.geo_loc_name = geo_loc_name_obj

    if "DEIMS-SDR Site ID" in location_properties_dict:
        # convert https://deims.org/0e39569d-ff81-4b78-9c34-b07309504d80
        # to https://deims.org/api/sites/0e39569d-ff81-4b78-9c34-b07309504d80
        deims_site_uuid = location_properties_dict["DEIMS-SDR Site ID"].split("/")[-1]
        site.neon_deims_sdr_link = f"{deims_api_prefix}/{deims_site_uuid}"

    if "HABITAT" in location_properties_dict:
        site.habitat = location_properties_dict["HABITAT"]  # gold term... controlled vocabulary?

    if "TypeSoil" in location_properties_dict:
        # Stored as the raw string. A TextValue used to be built here and then discarded.
        # NOTE(review): confirm whether neon_soil_type should hold a TextValue instead.
        site.neon_soil_type = location_properties_dict["TypeSoil"]

    if "TypeEco" in location_properties_dict:
        # avoiding GOLD's ecosystem slot, with a controlled vocab
        site.neon_eco_type = location_properties_dict["TypeEco"]

    sites.append(site)

# print(yaml_dumper.dumps(sites))

ABBY
BARR
BART
BLAN
BONA
CLBJ
CPER
DCFS
DEJU
DELA
DSNY
GRSM
GUAN
HARV
HEAL
JERC
JORN
KONA
KONZ
LAJA
LENO
MLBS
MOAB
NIWO
NOGP
OAES
ONAQ
ORNL
OSBS
PUUM
RMNP
SCBI
SERC
SJER
SOAP
SRER
STEI
STER
TALL
TEAK
TOOL
TREE
UKFS
UNDE
WOOD
WREF
YELL


In [23]:
# sample_class = "sls_soilCoreCollection_in.sampleID"
#
# unique_biosamples = sls_soilCoreCollection['sampleID'].unique()
# unique_biosamples.sort()
#
# # pprint.pprint(unique_biosamples)
#
# # https://data.neonscience.org/api/v0/samples/view?sampleTag=BART_004-M-3-28-20140819&sampleClass=sls_soilCoreCollection_in.sampleID
#
# url_base = "https://data.neonscience.org/api/v0/samples/view"
#
# biosamples = []
#
# for idx, i in enumerate(unique_biosamples[0:33]):
#     print(i)
#     url = f"{url_base}/{i}"
#     params = {
#         "sampleClass": sample_class,
#         "sampleTag": i,
#     }
#     response = requests.get(url_base, headers=headers, params=params).json()
#     pprint.pprint(response)
#     # site_id = f"nmdc:frsite-99-site{idx}"
#     # if 'data' not in response:
#     #     print("Skipping", i)
#     #     continue
#     # domain = FieldResearchSite(id=site_id, name=response['data']['locationName'],
#     #                            description=response['data']['locationDescription'])
#     # named_locations.append(domain)

In [26]:
# Convert the soil core table into a list of dicts (one per row) for the biosample loop.
cores_lod = sls_soilCoreCollection.to_dict(orient='records')


In [27]:
biosamples = []

# if we want to know the "exact" latitude and longitude of the core/biosamples
#   we would have to do arithmetic on the plot's latitude and longitude
#   and the core's offset (in meters?) from the plot (center?)

study_id = "nmdc:sty-99-neon"

# Only a random subset of the cores (about 5%) is converted to Biosamples. A dedicated,
# seeded generator makes that subset the same on every Restart & Run All, without
# touching the global `random` state.
biosample_fraction = 0.05
biosample_rng = random.Random(20230501)

for i in cores_lod:
    if biosample_rng.random() > biosample_fraction:
        continue

    core_id = f"nmdc:bsm-99-{remove_unwanted_characters(i['sampleID'])}"
    print(core_id)
    # Built from the core's namedLocation with the same cleaning that the
    # named-location cell applies to the locationName in the API response.
    # NOTE(review): the two ids only match if those two strings are identical — confirm.
    location_id = f"nmdc:frsite-99-{remove_unwanted_characters(i['namedLocation'])}"
    core = Biosample(
        id=core_id,
        part_of=[study_id],
        # The environmental triad is the same fixed set of terms for every core.
        env_broad_scale=ControlledIdentifiedTermValue(term=OntologyClass(id="ENVO:00000446", name="terrestrial biome")),
        env_local_scale=ControlledIdentifiedTermValue(term=OntologyClass(id="ENVO:01001886", name="landform")),
        env_medium=ControlledIdentifiedTermValue(term=OntologyClass(id="ENVO:00001998", name="soil")),
        collected_from=location_id,
        cur_vegetation=TextValue(has_raw_value=i["nlcdClass"]),
    )
    biosamples.append(core)
# print(yaml_dumper.dumps(biosamples))

nmdc:bsm-99-ABBY061M1330520161004
nmdc:bsm-99-ABBY023O333420161005
nmdc:bsm-99-ABBY062O1535520161005
nmdc:bsm-99-ABBY003M26539520170511
nmdc:bsm-99-ABBY063M112520170606
nmdc:bsm-99-ABBY063M250520170606
nmdc:bsm-99-ABBY063M451520171017
nmdc:bsm-99-ABBY061M55820180419
nmdc:bsm-99-ABBY061M751320180702
nmdc:bsm-99-ABBY002M0320181022
nmdc:bsm-99-ABBY070M16220181023
nmdc:bsm-99-ABBY070M37220190417
nmdc:bsm-99-ABBY023M283320190419
nmdc:bsm-99-ABBY004M3552020190703
nmdc:bsm-99-ABBY062M112520191024
nmdc:bsm-99-ABBY070M352120200730
nmdc:bsm-99-ABBY070O359520200730
nmdc:bsm-99-ABBY062M352520200731
nmdc:bsm-99-ABBY004O11420201026
nmdc:bsm-99-ABBY002O283520201026
nmdc:bsm-99-ABBY061M3151720201027
nmdc:bsm-99-ABBY062O345620201028
nmdc:bsm-99-ABBY001M2535520201028
nmdc:bsm-99-ABBY001M265720210510
nmdc:bsm-99-ABBY002M25220210719
nmdc:bsm-99-ABBY006O2950520211018
nmdc:bsm-99-BARR053O23220170808
nmdc:bsm-99-BARR001O193120170809
nmdc:bsm-99-BARR0023920200727OTHER
nmdc:bsm-99-BARR0062120200727OTHER
nmdc:b

In [28]:
# Domains, sites and named locations are all FieldResearchSite objects, so they are
# combined into one collection and dumped with the biosamples into a single YAML file.
locations_as_frs = [*domains, *sites, *named_locations]
database = dict(
    field_research_site_set=locations_as_frs,
    biosample_set=biosamples,
)

yaml_dumper.dump(database, "../local/neon_in_nmdc.yaml")

In [None]:
# todo add linkml validation and jsonschema validation