In [1]:
import os
import pprint
import csv
import requests
import pandas as pd

- https://data.neonscience.org/api/v0/releases/RELEASE-2023/data/package/DP1.10107.001/GUAN/2019-09?package=expanded
  - obtains a zip archive
- https://data.neonscience.org/api/v0/data/package/DP1.10107.001/GUAN/2019-09?package=expanded&release=RELEASE-2023
  - obtains a zip archive
- https://data.neonscience.org/api/v0/releases/RELEASE-2023/data/DP1.10107.001/GUAN/2019-09?package=expanded
  - obtains a JSON object with links to files
- https://data.neonscience.org/api/v0/releases/RELEASE-2023/data/DP1.10107.001/GUAN/2019-09/NEON.D04.GUAN.DP1.10107.001.mms_rawDataFiles.2019-09.expanded.20230113T225344Z.csv?package=expanded
  - obtains a CSV file,
  - but you have to know the exact name of the file


## Releases
- RELEASE-2023
- RELEASE-2022
- RELEASE-2021

## Some Domain Info
- https://www.geobabble.org/~hnw/neon/withindomainrep/
- https://www.geobabble.org/~hnw/neon/withindomainrep/neondomains2.html
- https://www.neonscience.org/field-sites/about-field-sites

## "File categories" for DP1.10107.001, "Soil microbe metagenome sequences"

```python
{'EML',
 'categoricalCodes',
 'mms_metagenomeDnaExtraction',
 'mms_metagenomeSequencing',
 'mms_rawDataFiles',
 'readme',
 'validation',
 'variables'}
```

Soil physical and chemical properties, periodic https://data.neonscience.org/data-products/DP1.10086.001

File naming conventions: https://www.neonscience.org/data-samples/data-management/data-formats-conventions

Named location details can be retrieved from https://data.neonscience.org/api/v0/locations/{namedLocation}, like https://data.neonscience.org/api/v0/locations/ABBY_004.basePlot.bgc

Even more geospatially specific: https://data.neonscience.org/api/v0/locations/ABBY_004.basePlot.bgc.21, but locations with that pattern may not be present in our data.

## Sample lookups may be helpful:
- https://data.neonscience.org/api/v0/samples/classes?sampleTag=BART_004-M-3-28-20140819
- https://data.neonscience.org/api/v0/samples/view?sampleTag=BART_004-M-3-28-20140819&sampleClass=sls_soilCoreCollection_in.sampleID
- https://data.neonscience.org/api/v0/samples/download?sampleTag=BART_004-M-3-28-20140819&sampleClass=sls_soilCoreCollection_in.sampleID&degree=999

The neonUtilities R package provides a helpful `table_types` data frame.

In [12]:
# NEON release whose data is queried by the API calls below.
release_tag = "RELEASE-2023"

# NEON data product to crawl.
#   DP1.10086.001 -> "Soil physical and chemical properties, periodic" (active)
#   DP1.10107.001 -> "Soil microbe metagenome sequences" (swap the comment to use it)
# product_code = "DP1.10107.001"
product_code = "DP1.10086.001"


In [3]:
# File categories (the 7th dot-separated token of a NEON data file name) whose
# CSV files are downloaded and concatenated; every other category is only
# recorded, not downloaded.
useful_file_categories = [
    # "mms_*" tables: listed among the DP1.10107.001 file categories.
    "mms_metagenomeDnaExtraction",
    "mms_metagenomeSequencing",
    "mms_rawDataFiles",
    # "sls_*" tables: presumably all from DP1.10086.001 — TODO confirm for the
    # ones not yet seen in the crawl output.
    "sls_bgcSubsampling",
    "sls_metagenomicsPooling",
    "sls_soilChemistry",
    "sls_soilCoreCollection",
    "sls_soilMoisture",
    "sls_soilpH",
]


In [4]:
# Disabled legacy setup: derives the output TSV path from the base name of a
# locally downloaded NEON package directory instead of querying the API.
# NOTE(review): the path is absolute and user-specific; make it configurable
# before re-enabling.
# neon_dl_dir = "/Users/MAM/Downloads/NEON_seq-metagenomic-microbe-soil"
# neon_dl_dir_tokens = neon_dl_dir.split("/")
# neon_basename = neon_dl_dir_tokens[-1]
# output_file = f"../local/{neon_basename}.tsv"


In [5]:
def write_dicts_to_tsv(data, filename):
    """Write a sequence of dictionaries to a tab-separated file with a header row.

    Args:
        data: Iterable of dictionaries, one per output row.
        filename: Path of the TSV file to create (overwritten if it exists).

    The header is the union of all keys across the rows, in first-seen order,
    so rows do not need identical keys; a key missing from a row is written as
    an empty field. If ``data`` is empty, an empty file is created instead of
    raising ``IndexError``.
    """
    rows = list(data)  # materialise once: the rows are scanned twice below
    # dict.fromkeys de-duplicates while preserving first-seen key order.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        if not fieldnames:
            # Nothing to write: leave the (truncated) file empty.
            return
        writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(rows)

In [6]:

# Disabled legacy approach: lists the sub-directories of neon_dl_dir and splits
# each directory name on "." into a dict of named tokens (token_dict_list),
# recording the token count of each name in len_list.
# NOTE(review): depends on neon_dl_dir, which is itself commented out in an
# earlier cell, and indexes tokens[9] without checking that the name has at
# least 10 dot-separated tokens.
# subdirectories = [name for name in os.listdir(neon_dl_dir) if os.path.isdir(os.path.join(neon_dl_dir, name))]
#
# len_list = []
# token_dict_list = []
# for i in subdirectories:
#     tokens = i.split(".")
#     len_list.append(len(tokens))
#     temp = {
#         "data_owner": tokens[0],
#         "geographical_domain": tokens[1],
#         "siteCode": tokens[2],
#         "productCode": f"{tokens[3]}.{tokens[4]}.{tokens[5]}",
#         "year-month": tokens[6],
#         "package": tokens[7],
#         "timestamp": tokens[8],
#         "releaseTag": tokens[9],
#     }
#     token_dict_list.append(temp)


In [7]:
# Disabled: would write the directory-name token dicts to output_file.
# NOTE(review): token_dict_list and output_file are only assigned in cells that
# are currently commented out.
# write_dicts_to_tsv(token_dict_list, output_file)

In [8]:
# Disabled: would pretty-print the first parsed directory-name token dict.
# NOTE(review): token_dict_list is only built in a cell that is commented out.
# first_token_dict = token_dict_list[0]
# pprint.pprint(first_token_dict)


In [9]:
# Disabled: would take product_code and release_tag from the first parsed
# directory name.
# NOTE(review): if re-enabled, these assignments replace any earlier values of
# product_code / release_tag in the kernel.
# product_code = first_token_dict['productCode']
# release_tag = first_token_dict['releaseTag']



In [10]:
# Fetch the product record for the chosen product and release, then keep its
# per-site entries (each is later read for its 'availableDataUrls').
url = f"https://data.neonscience.org/api/v0/products/{product_code}?release={release_tag}"
product_http_response = requests.get(url, timeout=60)  # do not hang forever on a stalled connection
# Surface HTTP failures (bad product code, unknown release, server error) here
# instead of as a confusing KeyError / JSON decoding error on the lines below.
product_http_response.raise_for_status()
response = product_http_response.json()  # parsed JSON body, not the HTTP response object
site_wise = response['data']['siteCodes']

In [11]:

# Crawl every site/month data URL of the product, record which file categories
# exist, download the CSVs of the useful categories and write one combined CSV
# per category to ../local/.

# Every file category seen in the listings, whether or not it is downloaded.
file_categories = set()
# file category -> per-file DataFrames, concatenated once after the crawl
# (calling pd.concat inside the loop re-copies the accumulated rows each time).
frames_by_category = {}

# Create the output directory up front so a missing folder cannot fail the
# final write after the long download loop.
os.makedirs("../local", exist_ok=True)

for site_entry in site_wise:
    for data_url in site_entry['availableDataUrls']:
        data_files_response = requests.get(data_url).json()
        # Best effort: skip listings that carry no file information.
        if 'data' not in data_files_response or 'files' not in data_files_response['data']:
            print("Skipping", data_url)
            continue
        for data_file in data_files_response['data']['files']:
            data_file_name = data_file['name']
            print(data_file_name)
            data_file_name_tokens = data_file_name.split(".")
            # Index 6 (the file category) is read below, so at least 7 tokens
            # are required; the previous "< 6" guard let 6-token names through
            # and raised IndexError.
            if len(data_file_name_tokens) < 7:
                print("Skipping", data_file_name)
                continue
            data_file_name_dict = {
                "data_owner": data_file_name_tokens[0],
                "geographical_domain": data_file_name_tokens[1],
                "siteCode": data_file_name_tokens[2],
                "productCode": f"{data_file_name_tokens[3]}.{data_file_name_tokens[4]}.{data_file_name_tokens[5]}",
                "file_category": data_file_name_tokens[6],
                # "timestamp": data_file_name_tokens[7],
                # "extension": data_file_name_tokens[8],
            }

            file_category = data_file_name_dict['file_category']
            file_categories.add(file_category)
            # NOTE(review): the listings contain both ".basic." and ".expanded."
            # variants of the same table/month, and both are downloaded and
            # combined here — confirm that this duplication is intended.
            if file_category in useful_file_categories:
                frames_by_category.setdefault(file_category, []).append(pd.read_csv(data_file['url']))

# One combined frame per category, in first-seen category order.
df_dict = {
    category: pd.concat(frames, ignore_index=True)
    for category, frames in frames_by_category.items()
}

for category, combined_frame in df_dict.items():
    print(category)
    combined_frame.to_csv(f"../local/{category}.csv", index=False)

pprint.pprint(file_categories)

NEON.D16.ABBY.DP1.10086.001.sls_soilMoisture.2016-10.expanded.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.sls_soilpH.2016-10.expanded.20221122T191800Z.csv
NEON.D16.ABBY.DP0.10086.001.validation.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.readme.20230127T120753Z.txt
NEON.D16.ABBY.DP1.10086.001.variables.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.sls_soilCoreCollection.2016-10.expanded.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.EML.20161004-20161006.20230127T120753Z.xml
NEON.D16.ABBY.DP0.10086.001.categoricalCodes.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.sls_soilMoisture.2016-10.basic.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.readme.20230127T120753Z.txt
NEON.D16.ABBY.DP1.10086.001.sls_soilpH.2016-10.basic.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.EML.20161004-20161006.20230127T120753Z.xml
NEON.D16.ABBY.DP0.10086.001.validation.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.variables.20221122T191800Z.csv
NEON.D16.ABBY.DP1.10086.001.sls_soilCor

In [None]:
# Disabled: would write filename-token dicts to a TSV under ../local/.
# NOTE(review): `lod` is not assigned in any cell visible here (presumably a
# list of dicts) — define it before re-enabling this call.
# write_dicts_to_tsv(lod, "../local/soil_metagenome_filename_tokens.tsv")