In [1]:
import pandas as pd

# use openpyxl for reading xlsx spreadsheet
# https://github.com/ontodev/cogs for google sheets? may assume single tab
import json
from math import isnan
import re
from collections import Counter
import uuid

from Bio import Entrez
import xmltodict

In [2]:
# could use entrez to retrieve addtional project details based on NCBI IDs
# initialize some default parameters
Entrez.email = "MAM@lbl.gov"  # provide your email address

# db = 'bioproject'
db = "dbvar"

paramEutils = {"usehistory": "Y"}  # Use Entrez search history to cache results

In [3]:
# list Entrez databases
record = Entrez.read(Entrez.einfo())
# record

## Provenance

saved https://docs.google.com/spreadsheets/d/1GZayIFIrY2jdoxRIpk9KDTBLiE71VVtb7YAd5ZSYGR0/edit#gid=0
to `input/Example-Soil_NMDC_SampleMetadata.xlsx` on 2021-07-13

Also 
- `Example-Soil_NMDC_SampleMetadata_202107201146.xlsx`


In [4]:
example_file = "../input/Example-Soil_NMDC_SampleMetadata_202107201146.xlsx"
mixs_spreadsheet_url = "https://github.com/GenomicsStandardsConsortium/mixs-legacy/blob/master/mixs5/mixs_v5.xlsx?raw=true"

# could also get MIxS knoweldge from NMDC schema... have been menaing to parse that anyway

In [5]:
example_frame_structure = pd.read_excel(example_file, sheet_name=None)
example_sheet_names = example_frame_structure.keys()
example_sheet_names

  warn(msg)


dict_keys(['ReadMe', 'ProjectInformation', 'AssociatedResearchers', 'Metadata', 'EnvironmentalMetadata', 'EcosystemTerms', 'MenuTerms'])

The "Metadata" sheet in the NMDC Sample Metadata template has four header rows and color coded blocks of columns. The green and red sections have fixed columns. Here, the second though fourth rows provide elaboration on the expected content. _Remember, Pandas interprets the very first row as a dataframe's headers by default._

These tabs will have to be parsed, too. `EnvironmentalMetadata` isn't populated in the example file yet.
- ProjectInformation
- AssociatedResearchers
- EnvironmentalMetadata

In [6]:
header_rows = 4

# column/color block realtions for `Metadata` tab

# "Sample Identification"
green_id_col = ["ID"]
additional_green_col_names = [
    "sample_name",
    "investigation_type",
    "package",
    "source_mat_ID",
]
green_cols = green_id_col + additional_green_col_names

# "Required for all samples"
red_col_names = [
    "growth_facil",
    "geo_loc_name",
    "lat_lon",
    "collection_date",
    "samp_mat_process",
    "store_cond",
    "samp_store_temp",
    "samp_size",
    "env_broad_scale",
    "env_local_scale",
    "env_medium",
    "gold_ecosystem",
    "microbiome_taxonomy",
]

## ProjectInformation

The project data are are organized in a single column

Transforming them here to a single-row dataframe, based on prior experience with metadata tab

In [7]:
example_projinf_frame = pd.read_excel(example_file, sheet_name="ProjectInformation")
example_projinf_frame = example_projinf_frame.transpose()
example_projinf_frame.columns = example_projinf_frame.iloc[0]
example_projinf_frame = example_projinf_frame.reset_index(drop=True)

# drop field names once they've become column names
example_projinf_frame = example_projinf_frame.drop(0)

# # drop blank left-most column
column_numbers = [
    x for x in range(example_projinf_frame.shape[1])
]  # list of columns' integer indices
column_numbers.remove(0)  # removing column integer index 0
example_projinf_frame = example_projinf_frame.iloc[
    :, column_numbers
]  # return all columns except the 0th column

example_projinf_frame

Study Metadata,Project/Study Name,Principal Investigator Name,Principal Investigator Email,Principal Investigator ORCiD,LinkOut Webpage,Project/Study Description,Associated Publications,Dataset DOIs,Type of samples,Alternative Names,EMSL Proposal/Study Number,GOLD Study ID,JGI Proposal ID,Umbrella Bio Project Name,Umbrella Bio Project ID
1,"""Soil microbial response to elevated temperatu...",Montana Smith,montana.smith@pnnl.gov,0000-0002-8683-0050,https://microbiomedata.org/ | https://github.c...,,https://doi.org/10.1016/j.soilbio.2019.107561 ...,https://doi.org/10.25585/1487765,soil,,EMSL:29728,GOLD:Gs01971387,JGI:1781,NCBI Accession: PRJNA594403,NCBI ID: 594403


In [8]:
proj_row_list = example_projinf_frame.to_dict("records")
proj_row_list
proj_obj = proj_row_list[0]
proj_obj

{'Project/Study Name': '"Soil microbial response to elevated temperatures and increased carbon availability"',
 'Principal Investigator Name': 'Montana Smith',
 'Principal Investigator Email': 'montana.smith@pnnl.gov',
 'Principal Investigator ORCiD': '0000-0002-8683-0050',
 'LinkOut Webpage': 'https://microbiomedata.org/ | https://github.com/microbiomedata',
 'Project/Study Description': nan,
 'Associated Publications': 'https://doi.org/10.1016/j.soilbio.2019.107561 | Keiser, Ashley D., et al. "Peatland microbial community response to altered climate tempered by nutrient availability." Soil Biology and Biochemistry 137 (2019): 107561.',
 'Dataset DOIs': 'https://doi.org/10.25585/1487765',
 'Type of samples': 'soil',
 'Alternative Names': nan,
 'EMSL Proposal/Study Number': 'EMSL:29728',
 'GOLD Study ID': 'GOLD:Gs01971387',
 'JGI Proposal ID': 'JGI:1781',
 'Umbrella Bio Project Name': 'NCBI Accession: PRJNA594403',
 'Umbrella Bio Project ID': 'NCBI ID: 594403'}

In [9]:
# value of having both BioProject ID and accession "name"
proj_bioproj_id = proj_obj["Umbrella Bio Project ID"]
handle = Entrez.esummary(db="bioproject", id="594403")
record = Entrez.read(handle)
handle.close()
# record

bioproj_record_json = json.dumps(record, indent=4, sort_keys=False, default=str)
# print(bioproj_record_json)

In [10]:
proj_uuid = uuid.uuid4().hex
proj_nmdc = {
    # which ID to use?
    # using a UUID as a placeholder
    "id": proj_uuid,
    # name, titel, dewcription, etc.?
    "name": proj_obj["Project/Study Name"],
    "description": proj_obj["Project/Study Description"],
    "principal_investigator": {
        "has raw value": proj_obj["Principal Investigator Name"],
        "orcid": proj_obj["Principal Investigator ORCiD"],
    },
    "type": "Study",
    "websites": re.split(" *\| *", proj_obj["LinkOut Webpage"]),
    "publications": re.split(" *\| *", proj_obj["Associated Publications"]),
    # does the template expect multiple DOIs?
    "doi": {
        "has raw value": proj_obj["Dataset DOIs"],
    },
}
proj_nmdc

{'id': 'f505e81e3d3b4e66bb5541ada3af77fa',
 'name': '"Soil microbial response to elevated temperatures and increased carbon availability"',
 'description': nan,
 'principal_investigator': {'has raw value': 'Montana Smith',
  'orcid': '0000-0002-8683-0050'},
 'type': 'Study',
 'websites': ['https://microbiomedata.org/',
  'https://github.com/microbiomedata'],
 'publications': ['https://doi.org/10.1016/j.soilbio.2019.107561',
  'Keiser, Ashley D., et al. "Peatland microbial community response to altered climate tempered by nutrient availability." Soil Biology and Biochemistry 137 (2019): 107561.'],
 'doi': {'has raw value': 'https://doi.org/10.25585/1487765'}}

### Project fields in template

- Project/Study Name
- Project/Study Description
- Alternative Names
- Principal Investigator Name
- Principal Investigator Email
- Principal Investigator ORCiD
- LinkOut Webpage
- Associated Publications
- Dataset DOIs
- EMSL Proposal/Study Number
- GOLD Study ID
- JGI Proposal ID
- Umbrella Bio Project Name
- Umbrella Bio Project ID
- Type of samples

Might have to use APIs like from Entrez to look up details for some of those IDs?

### Sample data from mongodb

```JSON
{
  "_id": {
    "$oid": "60e840cbe9822b255ad93845"
  },
  "id": "gold:Gs0103573",
  "name": "Populus root and rhizosphere microbial communities from Tennessee, USA",
  "description": "This study is part of the Plant-Microbe Interfaces Science Focus Area, which aims to gain a deeper understanding of the diversity and functioning of mutually beneficial interactions between plants and microbes in the rhizosphere. Ongoing efforts focus on characterizing and interpreting such interfaces using systems comprising plants and microbes, in particular the poplar tree (Populus) and its microbial community in the context of favorable plant microbe interactions.",
  "ecosystem": "Host-associated",
  "ecosystem_category": "Plants",
  "ecosystem_type": "Unclassified",
  "ecosystem_subtype": "Unclassified",
  "specific_ecosystem": "Unclassified",
  "principal_investigator": {
    "has_raw_value": "Dale Pelletier"
  },
  "type": "nmdc:Study",
  "websites": [
    "https://pmiweb.ornl.gov/pmi-project-aims/"
  ],
  "title": "Defining the functional diversity of the Populus root microbiome",
  "doi": {
    "has_raw_value": "https://doi.org/10.25585/1488096"
  },
  "publications": [
    "https://doi.org/10.1128/mSystems.00045-18"
  ]
}
```

### Alignment with schema

https://microbiomedata.github.io/nmdc-schema/Study.html

- which proposal, study, project ID to use in `id` slot?
- Project/Study Name -> `title`
- Project/Study Description -> `description`
- ? -> `name`
- `principal_investigator`
    - Principal Investigator Name -> `has_raw_value`
    - Principal Investigator ORCiD -> `orcid`
    - https://microbiomedata.github.io/nmdc-schema/PersonValue.html
- all `ecosystem...` fields missing from template?
- "Biosample" -> `type`



## AssociatedResearchers

In [11]:
example_assocs_frame = pd.read_excel(example_file, sheet_name="AssociatedResearchers")
example_assocs_frame

  warn(msg)


Unnamed: 0,Name,email,ORCiD,Role- CRediT
0,Pajau Vangay,pvangay@lbl.gov,0000-0002-9231-0692,Project administration


> /Users/MAM/Documents/gitrepos/nmdc-metadata/venv/lib/python3.9/site-packages/openpyxl/worksheet/_reader.py:312: UserWarning: Data Validation extension is not supported and will be removed
  warn(msg)

## `Metadata` tab

In [12]:
# example_sheet_names

In [13]:
example_metadata_frame = pd.read_excel(example_file, sheet_name="Metadata")

  warn(msg)


In [14]:
example_metadata_headers = example_metadata_frame.loc[
    0 : (header_rows - 2),
]
example_metadata_headers

Unnamed: 0,ID,sample_name,investigation_type,package,source_mat_ID,growth_facil,geo_loc_name,lat_lon,collection_date,samp_mat_process,...,treatment.3,samp_collect_device,size_fract,select,select.1,select.2,select.3,select.4,select.5,select.6
0,"Universally unique ID (ex: IGSN, UUID)",Sample name/Laboratory ID,Analysis/ Data Type,Sample Type,Source Material ID,Growth Facility,Geographical Location Name,latitude;longitude,Collection Date and Time,sample material processing,...,watering_regm,slide_hammer_core,sieving,select,select,select,select,select,select,select
1,Field REQUIRED for ALL sample submission. Opti...,Human readable ID. This is the ID that will be...,This field is constrained to contain only a se...,This field contains the type of sample type as...,A unique identifier assigned to an original ma...,Type of facility where the sample was collecte...,Detailed geographic location of sampling site,The geographical origin of the sample as defin...,"The time of sampling, either as an instance (s...",Any processing applied to the sample during or...,...,treatment- watering regimen/schedule,sample collection device-slide hammer corer,size fraction-sieve size,,,,,,,
2,{text},{text},{text};{text};{text},drop down selection,,drop down selection,{text},{float};{float},{YYYY-MM-DDTHH:MM:SS} or {YYYY-MM-DD},{text},...,{float} {unit};{Rn/start_time/end_time/duration},{text},{value}{units},,,,,,,


In [15]:
example_metadata_headers_cols = pd.Series(example_metadata_headers.columns)

In [16]:
# discard any column whose Pandas header starts with "select" possibly followed by a period and some digits
# these go up to column AH now (#34 when starting at 1)
# could that increase?

select_flag = example_metadata_headers_cols.str.match(
    "^select(\\.\\d+)?$", case=True, flags=0, na=None
)
select_cols = example_metadata_headers_cols[select_flag]

In [17]:
# select_cols

In [18]:
# all column names except for those that start with "select..."
keeper_cols = example_metadata_headers_cols[~select_flag]
# keeper_cols

In [19]:
example_metadata_content = example_metadata_frame.loc[
    3:,
]
example_metadata_content = example_metadata_content[list(keeper_cols)]

green_content = example_metadata_content[green_cols]

green_content

Unnamed: 0,ID,sample_name,investigation_type,package,source_mat_ID
3,UUID:472894-473947-847398,P4_-0-10_4C,genome; metabolome,plant_associated,
4,UUID:684267-410686-971057,L10_-20-30_13C-Gluc,transcriptome; metabolome,soil,UUID:472894-473947-847398
5,UUID:472894-473947-847396,6_J2_75-WHC,organic matter,soil,UUID:472894-473947-847398
6,UUID:516394-970067-847398,T4-35,lipidome; 16S-Amplicon; ITS-Amplicon,soil,
7,UUID:472894-473947-642384,2020-05-21_S19,proteome; genome,soil,UUID:516394-970067-847398


### Assumptions:
- lowercased column header of select.* means no data?
- **also check for non-NAs?**

In [20]:
# make sure no sample metadata slipped in here

select_content = example_metadata_frame[select_cols]
select_content

Unnamed: 0,select,select.1,select.2,select.3,select.4,select.5,select.6
0,select,select,select,select,select,select,select
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,
5,,,,,,,
6,,,,,,,
7,,,,,,,


### Be prepared to check template assumptions against MIxS expectations

In [21]:
mixs_spreadsheet = pd.read_excel(
    mixs_spreadsheet_url, sheet_name="MIxS", engine="openpyxl"
)
mixs_spreadsheet

investigation_type_row = mixs_spreadsheet[
    mixs_spreadsheet["Structured comment name"] == "investigation_type"
]

In [22]:
investigation_type_row["Expected value"].values

array(['eukaryote, bacteria_archaea, plasmid, virus, organelle, metagenome,mimarks-survey, mimarks-specimen, metatranscriptome, single amplified genome, metagenome-assembled genome, or uncultivated viral genomes'],
      dtype=object)

In [23]:
investigation_type_row["Value syntax"].values

array(['[eukaryote|bacteria_archaea|plasmid|virus|organelle|metagenome|metatranscriptome|mimarks-survey|mimarks-specimen|misag|mimag|miuvig]'],
      dtype=object)

In [24]:
# array?!
investigation_type_row["Occurence"].values

array([1], dtype=object)

In [25]:
green_row_list = green_content.to_dict("records")

# I made the column/slot associations by hand (search feature in nmdc schema docs web page)
#   would it been possible to do it by programtaticaly examining the schema?
#   or at least account for which populated columns hadn't been mapped
#   I'd like to open nmdc.yaml as linkedml and automatically include all imports
green_nmdc = [
    {
        "id": item["ID"],
        "name": item["sample_name"],
        "env_package": item["package"],
        "source_mat_id": item["source_mat_ID"],
        # investigation_type is not multi-valued according to mixs yet
        #   https://github.com/GenomicsStandardsConsortium/mixs-legacy/blob/master/mixs5/mixs_v5.xlsx
        #   see Occurence column
        #   eukaryote, bacteria_archaea, plasmid, virus, organelle, metagenome,mimarks-survey,
        #   mimarks-specimen, metatranscriptome, single amplified genome, metagenome-assembled genome, or uncultivated viral genomes
        # and has a smaller set of permissibles that are found in Montana's example data
        # todo: replace "; ?" with "|" but otherwise leave as a flast string
        # "investigation_type": re.split("; ?", item["investigation_type"]),
        # choices according to the template
        #   investigation_type
        #   Select Analyses
        #   genome
        #   transcriptome
        #   16S-Amplicon
        #   ITS-Amplicon
        #   18S-Amplicon
        #   proteome
        #   metabolome
        #   lipidome
        #   organic matter
        #   imaging- light
        #   imaging- electron
        #   imaging- ion
        #   chemical speciation/mapping
        #   molecular structure
        "investigation_type": re.sub(r"; *", "|", item["investigation_type"]),
        "type": "Biosample",
        "part_of": proj_uuid,
    }
    for item in green_row_list
]

In [26]:
red_content = example_metadata_content[green_id_col + red_col_names]

# red_content

In [27]:
red_row_list = red_content.to_dict("records")

red_nmdc = [
    {
        "id": item["ID"],
        # template's use of a nmdc/mixs recognized name
        #   doesn't mean that the data won't need any tidying
        "growth_facil": item["growth_facil"],
        "geo_loc_name": item["geo_loc_name"],
        "samp_mat_process": item["samp_mat_process"],
        "store_cond": item["store_cond"],
        "samp_store_temp": item["samp_store_temp"],
        "samp_size": item["samp_size"],
        "env_broad_scale": item["env_broad_scale"],
        "env_local_scale": item["env_local_scale"],
        "env_medium": item["env_medium"],
        "lat_lon": item["lat_lon"],
        # gold_ecosystem -> gold_path_field ?
        "gold_path_field": item["gold_ecosystem"],
        "collection_date": item["collection_date"],
        "ncbi_taxonomy_name": item["microbiome_taxonomy"],
    }
    for item in red_row_list
]

In [28]:
green_red = green_nmdc.copy()
for i in range(len(green_nmdc)):
    # get rid of NaNs
    green_red[i] = {k: v for k, v in green_red[i].items() if v == v}
    red_temp = red_nmdc[i]
    clean_dict = {k: v for k, v in red_temp.items() if v == v}
    green_red[i].update(clean_dict)

In [29]:
# default=str for Object of type datetime is not JSON serializable on collection_date
green_red_json = json.dumps(green_red, indent=4, sort_keys=False, default=str)

# print(green_red_json)

The columns and semantics of the blue "Required where applicable" section are variable. Users are supposed to fill in data required by the environmental packages describing their samples? All four of the header rows may need to be parsed/interpreted in order to convert the data rows into NMDC JSON data objects and record corresponding units.

The blue section is made up of all non "select" columns minus the green and red cols, or all columns at postion S (20, starting from 1) or greater?

In [30]:
blue_cols = list((set(keeper_cols) - set(green_cols)) - set(red_col_names))
blue_content = example_metadata_content[green_id_col + blue_cols]
blue_content

Unnamed: 0,ID,treatment.1,samp_collect_device,elev,depth.1,depth,treatment.2,treatment,treatment.3,size_fract
3,UUID:472894-473947-847398,,"11/2' x 6""; stainless stee",500,,0-1,,,,4mm
4,UUID:684267-410686-971057,1000µg of C (glucose)/g soil,"11/2' x 6""; stainless stee",500,,0-1,13C Glucose,,,2mm
5,UUID:472894-473947-847396,1000µg of C (glucose)/g soil,"11/2' x 6""; stainless stee",500,,0-1,,,75% water holding capacity; moisture maintaine...,2mm
6,UUID:516394-970067-847398,,"11/2' x 6""; stainless stee",1000,-20-30,,,35C,,0.5-.25mm
7,UUID:472894-473947-642384,,"11/2' x 6""; stainless stee",1000,-20-30,,,,,1-.05mm


In [31]:
blue_headers = example_metadata_headers[blue_cols]
blue_headers

Unnamed: 0,treatment.1,samp_collect_device,elev,depth.1,depth,treatment.2,treatment,treatment.3,size_fract
0,chem_administration,slide_hammer_core,m,m,cm,isotope_exposure,air_temp_regm,watering_regm,sieving
1,treatment-chemical administration/addition,sample collection device-slide hammer corer,elevation-meters,depth-meters,depth-centimeters,treatment-isotope exposure/addition,treatment-air temperature regimen,treatment- watering regimen/schedule,size fraction-sieve size
2,{termLabel} {[termID]}; {timestamp},{text},{value} meters,{value} meters,{value} centimeters,{text},{float} {unit};{Rn/start_time/end_time/duration},{float} {unit};{Rn/start_time/end_time/duration},{value}{units}


## template's `size_fract` = schema's `size_frac`

For domain and range, NMDC schema says `None →` _OPT_ `QuantityValue`

mongodb query `{ "size_frac": { $exists: true }}` against the `biosample_set` collection returns 0 documents

for reference, `{ "depth": { $exists: true }}` **does** return hits

The template says that size_fract can? must? be used with
- sediment
- soil
- water

The size_fract details must? be one of 
- filter_size	{value}{units}
- filter_type	{text}
- sieving	{value}{units}

The following columns are also allowed? by the template for water package samples

```
size_frac_low	size-fraction lower threshold	{float} {unit}
size_frac_up	size-fraction upper threshold	{float} {unit}
```

## `treatment`

There can be multiple treatment columns. Google Sheets allows multiple column headers with the same value, but Pandas adds numeric suffixes to make them unique.

The schema doesn't have a treatment class or slot. Rather, schema defined terms can be found in row 0 of the headers frame.



## `elev`

`alt` also allowed? Intended usage? Neither appears in mongodb yet

template specifies meters

## `samp_collect_device`

range = (optionally language-typed) TextValue

No mongodb hits

- sediment
- soil
- water

```
shovel	{text}
slide_hammer_core	{text}
deep_corer	{text}
russian_corer	{text}
sipper	{text}
other-samp_collect_device	{text}
```

In [32]:
# todo: illustrate use of annotator

In [33]:
# what if there are multiple depth columns with different units?

# which slots can be multivalued and therefore take multiple columns?
# they may or may not have suffixes like treatment, treatment.1, treatment.2

blue_row_list = blue_content.to_dict("records")

# any one of these columns could be missing
# how to avoid KeyErrors?

blue_nmdc = [
    {
        "id": item["ID"],
        ## size_frac, not size_fract
        # "size_frac": item["size_fract"],
        # "samp_collect_device": item["samp_collect_device"],
        ## plant_struc, host_taxid, host_common_name are valid for XXX package/investigation type
        ## but doesn't appear in this soil sample data
        # "plant_struc": item["plant_struc"],
        # "host_taxid": item["host_taxid"],
        # "host_common_name": item["host_common_name"],
        # https://microbiomedata.github.io/nmdc-schema/QuantityValue.html
        "depth": {
            "has_raw_value": item["depth"],
            "has_unit": example_metadata_headers.loc[0, "depth"],
        },
        "elev": {
            "has_raw_value": item["elev"],
            "has_unit": example_metadata_headers.loc[0, "elev"],
        },
    }
    for item in blue_row_list
]

# blue_nmdc

# blue_json = json.dumps(blue_nmdc, indent=4, sort_keys=False, default=str)

# print(blue_json)

In [34]:
#  REFACTOR

green_red_blue = green_red.copy()
for i in range(len(green_nmdc)):
    ## NaNs should have already been removed
    ##   but only for atomic values. Didn't do any deep search.
    # green_red[i] = {k: v for k, v in green_red[i].items() if v == v}
    blue_temp = blue_nmdc[i]
    clean_dict = {k: v for k, v in blue_temp.items() if v == v}
    green_red_blue[i].update(clean_dict)

green_red_blue_json = json.dumps(green_red_blue, indent=4, sort_keys=False, default=str)

In [35]:
proj_json = json.dumps(proj_nmdc, indent=4, sort_keys=False, default=str)
print(proj_json)

{
    "id": "f505e81e3d3b4e66bb5541ada3af77fa",
    "name": "\"Soil microbial response to elevated temperatures and increased carbon availability\"",
    "description": NaN,
    "principal_investigator": {
        "has raw value": "Montana Smith",
        "orcid": "0000-0002-8683-0050"
    },
    "type": "Study",
    "websites": [
        "https://microbiomedata.org/",
        "https://github.com/microbiomedata"
    ],
    "publications": [
        "https://doi.org/10.1016/j.soilbio.2019.107561",
        "Keiser, Ashley D., et al. \"Peatland microbial community response to altered climate tempered by nutrient availability.\" Soil Biology and Biochemistry 137 (2019): 107561."
    ],
    "doi": {
        "has raw value": "https://doi.org/10.25585/1487765"
    }
}


In [36]:
print(green_red_blue_json)

[
    {
        "id": "UUID:472894-473947-847398",
        "name": "P4_-0-10_4C",
        "env_package": "plant_associated",
        "investigation_type": "genome|metabolome",
        "type": "Biosample",
        "part_of": "f505e81e3d3b4e66bb5541ada3af77fa",
        "growth_facil": "field",
        "geo_loc_name": "USA; Prosser, Washington; Washington State University-Irrigated Agriculture Research and Extension Center",
        "samp_mat_process": "snap freeze in liquid N",
        "store_cond": "fresh",
        "samp_store_temp": "4 degree Celsius",
        "env_broad_scale": "arid biome",
        "env_local_scale": "agricultural field",
        "env_medium": "agricultural soil",
        "lat_lon": "46.251709, -119.728663",
        "gold_path_field": "5424:Environmental:Terrestrial:Soil:Bulk soil:Agricultural land",
        "collection_date": "2020-05-21T12:00:00",
        "ncbi_taxonomy_name": "soil metagenome",
        "depth": {
            "has_raw_value": "0-1",
            "ha