## Translate GOLD study, project, and biosample data into json.
This notebook demonstrates how to translate study, project, and biosample data from the GOLD database into JSON that conforms with the [NMDC schema](https://github.com/microbiomedata/nmdc-metadata/blob/schema-draft/README.md).  
Before doing the translation it is important that you have an up to date `nmdc.py` file in the `lib` directory.  

The python modules for running the notebook are in the `requirements.txt` file.

In [1]:
import json
import pandas as pds
import jsonasobj
import lib.nmdc as nmdc
import lib.data_operations as dop
from pandasql import sqldf

def pysqldf(q):
    """Run the SQL query ``q`` through pandasql's ``sqldf``.

    The module-level globals are handed to ``sqldf`` as its environment, so
    names used in the query are looked up among the notebook's global
    variables.
    """
    namespace = globals()
    return sqldf(q, namespace)

## Load tables (i.e., tab delimited files) from nmdc zip file
The NMDC data is currently stored in a zip file. Instead of unzipping the file, simply use the `zipfile` library to load the `study`, `project`, `project_biosample`, and `biosample` tables (stored as tab-delimited files). The `project_biosample` table is needed as a cross-linking table between `project` and `biosample`.  

The code for unzipping and creating the dataframe is found in the `make_dataframe` function. As part of the dataframe creation process, the column names are lower cased and spaces are replaced with underscores. I find it helpful to have some standardization on column names when doing data wrangling. This behavior can be overridden if you wish.

In [2]:
# All four GOLD tables are read from the same zip archive. Naming the path once
# means a new data release only requires a single edit here.
NMDC_ARCHIVE = "data/nmdc-version2.zip"

study = dop.make_dataframe("export.sql/STUDY_DATA_TABLE.dsv", file_archive_name=NMDC_ARCHIVE)
project = dop.make_dataframe("export.sql/PROJECT_DATA_TABLE.dsv", file_archive_name=NMDC_ARCHIVE)
# Cross-linking table between project and biosample.
project_biosample = dop.make_dataframe("export.sql/PROJECT_BIOSAMPLE_DATA_TABLE.dsv", file_archive_name=NMDC_ARCHIVE)
biosample = dop.make_dataframe("export.sql/BIOSAMPLE_DATA_TABLE.dsv", file_archive_name=NMDC_ARCHIVE)

In [3]:
# biosample.head() # peek at data

## Build study json

In [6]:
# One dict per study row, keyed by column name.
study_dictdf = study.to_dict("records")

In [7]:
## print out a single record for viewing
# for record in study_dictdf:
#     print(json.dumps(record, indent=4)); break

In [8]:
# Study columns handed to make_json_string_list as `characteristic_fields`.
characteristics = [
    'gold_study_name',
    'ecosystem',
    'ecosystem_category',
    'ecosystem_type',
    'ecosystem_subtype',
    'specific_ecosystem',
    'ecosystem_path_id',
    'add_date',
    'mod_date',
]

# Build the list of study JSON strings from the study records.
study_json_list = dop.make_json_string_list(
    study_dictdf,
    nmdc.Study,
    id_key='gold_id',
    name_key='study_name',
    description_key="description",
    characteristic_fields=characteristics,
)

In [19]:
# print(study_json_list[0]) ## peek at data

## Build project json

In [10]:
# Attach the GOLD id of each project's master study to the project row.
#
# The GROUP BY lists every selected non-aggregate column, starting with
# project.gold_id. Grouping only on the date / NCBI-name columns would merge
# distinct projects that happen to share those values into a single row, and
# SQLite would then report an arbitrary gold_id, project_name and description
# for the merged group, silently dropping the other projects.
q = """
select
    project.gold_id,
    project.project_name,
    project.description,
    project.add_date,
    project.mod_date,
    project.completion_date,
    project.ncbi_project_name,
    group_concat(study.gold_id) as study_gold_ids
from
    project
inner join study
    on study.study_id = project.master_study_id
group by
    project.gold_id,
    project.project_name,
    project.description,
    project.add_date,
    project.mod_date,
    project.completion_date,
    project.ncbi_project_name
"""
projectdf = sqldf(q)

In [11]:
# One dict per project row, keyed by column name.
project_dictdf = projectdf.to_dict("records")

In [12]:
# Project columns handed to make_json_string_list as `characteristic_fields`.
characteristics = [
    'add_date',
    'mod_date',
    'completion_date',
    'ncbi_project_name',
]

# Build the list of project JSON strings; `study_gold_ids` (produced by the
# project query) is passed as the part-of key.
project_json_list = dop.make_json_string_list(
    project_dictdf,
    nmdc.SequencingProject,
    id_key='gold_id',
    name_key='project_name',
    part_of_key="study_gold_ids",
    description_key="description",
    characteristic_fields=characteristics,
)

In [20]:
# print(project_json_list[0]) ## peek at data

## Build biosample json

In [14]:
# Build one row per biosample together with the GOLD ids of its projects.
#
# biosample is joined to project through the project_biosample cross-linking
# table, and group_concat gathers the linked project GOLD ids into the
# `project_gold_ids` column. The GROUP BY is keyed on biosample.biosample_id
# and repeats the other selected biosample columns.
# NOTE(review): the inner joins mean a biosample with no matching
# project_biosample / project row does not appear in the result -- confirm
# that this is intended.
q = """
select
    biosample.gold_id,
    biosample.biosample_name,
    biosample.description,
    biosample.add_date,
    biosample.mod_date,
    biosample.ecosystem_path_id,
    biosample.ecosystem,
    biosample.ecosystem_category,
    biosample.ecosystem_type,
    biosample.ecosystem_subtype,
    biosample.specific_ecosystem,
    biosample.habitat,
    biosample.location,
    biosample.community,
    biosample.ncbi_taxonomy_name,
    biosample.geographic_location,
    biosample.latitude,
    biosample.longitude,
    biosample.sample_collection_site,
    biosample.identifier,
    biosample.sample_collection_year,
    biosample.sample_collection_month,
    biosample.sample_collection_day,
    biosample.sample_collection_hour,
    biosample.sample_collection_minute,
    biosample.host_name,
    biosample.depth,
    biosample.subsurface_depth,
    biosample.altitude,
    biosample.temperature_range,
    biosample.proport_woa_temperature,
    biosample.biogas_temperature,
    biosample.growth_temperature,
    biosample.soil_annual_season_temp,
    biosample.water_samp_store_temp,
    biosample.biogas_retention_time,
    biosample.salinity,
    biosample.pressure,
    biosample.ph,
    biosample.chlorophyll_concentration,
    biosample.nitrate_concentration,
    biosample.oxygen_concentration,
    biosample.salinity_concentration,
    group_concat(project.gold_id) as project_gold_ids
from
    biosample
inner join project_biosample
    on biosample.biosample_id = project_biosample.biosample_id
inner join project
    on project.project_id = project_biosample.project_id
group by
    biosample.biosample_id,
    biosample.biosample_name,
    biosample.description,
    biosample.add_date,
    biosample.mod_date,
    biosample.ecosystem_path_id,
    biosample.ecosystem,
    biosample.ecosystem_category,
    biosample.ecosystem_type,
    biosample.ecosystem_subtype,
    biosample.specific_ecosystem,
    biosample.habitat,
    biosample.location,
    biosample.community,
    biosample.ncbi_taxonomy_name,
    biosample.geographic_location,
    biosample.latitude,
    biosample.longitude,
    biosample.sample_collection_site,
    biosample.identifier,
    biosample.sample_collection_year,
    biosample.sample_collection_month,
    biosample.sample_collection_day,
    biosample.sample_collection_hour,
    biosample.sample_collection_minute,
    biosample.host_name,
    biosample.depth,
    biosample.subsurface_depth,
    biosample.altitude,
    biosample.temperature_range,
    biosample.proport_woa_temperature,
    biosample.biogas_temperature,
    biosample.growth_temperature,
    biosample.soil_annual_season_temp,
    biosample.water_samp_store_temp,
    biosample.biogas_retention_time,
    biosample.salinity,
    biosample.pressure,
    biosample.ph,
    biosample.chlorophyll_concentration,
    biosample.nitrate_concentration,
    biosample.oxygen_concentration,
    biosample.salinity_concentration
"""
biosampledf = sqldf(q)

In [15]:
# One dict per biosample row, keyed by column name.
biosample_dictdf = biosampledf.to_dict("records")

In [16]:
# Biosample columns used as characteristic fields, kept in the same order as
# the biosample query's select list (minus the id, name and description).
characteristics = [
    # record bookkeeping
    'add_date', 'mod_date',
    # ecosystem classification
    'ecosystem_path_id', 'ecosystem', 'ecosystem_category', 'ecosystem_type',
    'ecosystem_subtype', 'specific_ecosystem',
    # sample context and location
    'habitat', 'location', 'community', 'ncbi_taxonomy_name',
    'geographic_location', 'latitude', 'longitude',
    'sample_collection_site', 'identifier',
    # collection time
    'sample_collection_year', 'sample_collection_month',
    'sample_collection_day', 'sample_collection_hour',
    'sample_collection_minute',
    # host and position
    'host_name', 'depth', 'subsurface_depth', 'altitude',
    # temperature-related fields
    'temperature_range', 'proport_woa_temperature', 'biogas_temperature',
    'growth_temperature', 'soil_annual_season_temp', 'water_samp_store_temp',
    # remaining measurements
    'biogas_retention_time', 'salinity', 'pressure', 'ph',
    'chlorophyll_concentration', 'nitrate_concentration',
    'oxygen_concentration', 'salinity_concentration',
]

In [17]:
# Build the list of biosample JSON strings; `project_gold_ids` (produced by the
# biosample query) is passed as the part-of key.
biosample_json_list = dop.make_json_string_list(
    biosample_dictdf,
    nmdc.Biosample,
    id_key='gold_id',
    name_key='biosample_name',
    part_of_key="project_gold_ids",
    description_key="description",
    characteristic_fields=characteristics,
)

In [21]:
# print(biosample_json_list[0]) # peek at data