In [1]:
import os, sys
# Make the project-local modules importable:
#   ../src/bin/lib/ -> path to data_operations.py
#   ../../schema/   -> path to nmdc.py
for module_dir in ('../src/bin/lib/', '../../schema/'):
    sys.path.append(os.path.abspath(module_dir))

In [2]:
import yaml
import json
from yaml import CLoader as Loader, CDumper as Dumper
import data_operations as dop
from dotted_dict import DottedDict
from collections import namedtuple
import nmdc
import data_operations as dop
from pandasql import sqldf
from pprint import pprint
import pandas as pds
import jsonasobj

from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query ``q`` with pandasql, using this notebook's globals as the environment."""
    namespace = globals()
    return sqldf(q, namespace)

## Load yaml spec for data sources

In [3]:
spec_file = "../src/bin/lib/nmdc_data_source.yaml"
# NOTE(review): Loader here is PyYAML's CLoader, i.e. the full (non-safe) loader.
# Fine for this repo-local spec file; do not point spec_file at untrusted YAML.
with open(spec_file, 'r') as input_file:
    spec_mapping = yaml.load(input_file, Loader=Loader)
# Wrap the parsed mapping so entries can be read with attribute (dotted) access.
spec = DottedDict(spec_mapping)

## Create merged dataframe of all data sources

In [4]:
## build merged dataframe from data sources specified in the spec file
# mdf = dop.make_dataframe_from_spec_file (spec_file)
# mdf.to_csv('../src/data/nmdc_merged_data.tsv', sep='\t', index=False) # save merged data

## Load data from merged tsv (this can only be done after merged data tsv has been created)

In [5]:
# read data from saved file; dtype=str reads every column as strings
merged_data_path = '../src/data/nmdc_merged_data.tsv.zip'
mdf = pds.read_csv(merged_data_path, sep='\t', dtype=str)
# mdf = pds.read_csv(merged_data_path, sep='\t', dtype=str, nrows=100)

In [6]:
mdf['nmdc_data_source'].unique()  ## distinct data sources present in the merged dataframe

array(['study_table', 'contact_table', 'proposals_table', 'project_table',
       'project_biosample_table', 'biosample_table', 'ficus_faa_table',
       'ficus_fna_table', 'ficus_fastq_table', 'ficus_jgi_emsl',
       'ficus_emsl'], dtype=object)

## Extract tables from merged dataset

In [7]:
def _extract(source_name):
    """Thin wrapper around dop.extract_table(mdf, source_name)."""
    return dop.extract_table(mdf, source_name)


study_table = _extract('study_table')
contact_table = _extract('contact_table')
proposals_table = _extract('proposals_table')
project_table = _extract('project_table')
jgi_emsl_table = _extract('ficus_jgi_emsl')
emsl_table = _extract('ficus_emsl')
faa_table = _extract('ficus_faa_table')
fna_table = _extract('ficus_fna_table')
fastq_table = _extract('ficus_fastq_table')
project_biosample_table = _extract('project_biosample_table')
biosample_table = _extract('biosample_table')
# biosample_table.columns

## Test building study json

In [8]:
# Build the study dataframe from the study, contact and proposals tables.
study = dop.make_study_dataframe(
    study_table,
    contact_table,
    proposals_table,
)
# orient="records" yields a list with one dict per dataframe row.
study_dictdf = study.to_dict(orient="records")
# study.gold_id

In [9]:
## specify attributes
attributes = [
    'gold_study_name',
    'principal_investigator_name',
    'add_date',
    'mod_date',
    'doi',
    'ecosystem',
    'ecosystem_category',
    'ecosystem_type',
    'ecosystem_subtype',
    'specific_ecosystem',
]

## constructor argument name -> dataframe column supplying its value
constructor = dict(id='gold_id', name='study_name', description='description')

study_json_list = dop.make_json_string_list(
    study_dictdf,
    nmdc.Study,
    constructor_map=constructor,
    attribute_fields=attributes,
)

In [10]:
# print(json.dumps(json.loads(study_json_list[0]), indent=4)) ## peek at data
# print(nmdc.Study.class_class_curie)
# len(study)

## Save study output

In [11]:
# save json string list to file
dop.save_json_string_list(
    "output/test-pipeline/gold_study.json",
    study_json_list,
)

## Test building EMSL omics processing projects

In [12]:
# Build the EMSL dataframe from the EMSL, JGI-EMSL and study tables.
emsl = dop.make_emsl_dataframe(
    emsl_table,
    jgi_emsl_table,
    study_table,
)
# orient="records" yields a list with one dict per dataframe row.
emsl_dictdf = emsl.to_dict(orient="records")

In [13]:
# emsl.gold_study_id

In [14]:
# len(emsl)
# emsl.head()
# emsl.columns
# len(emsl_table)

In [15]:
attributes = [
    # 'file_size_bytes',
    # {'part_of': ({'id': 'gold_study_id'}, nmdc.Study)},
    # {'has_output': ({'id': 'data_object_id'}, nmdc.DataObject)}
    dict(part_of='gold_study_id'),
    dict(has_output='data_object_id'),
]

## constructor argument name -> dataframe column supplying its value
constructor = dict(
    id='dataset_id',
    name='dataset_name',
    description='dataset_type_description',
)

emsl_project_json_list = dop.make_json_string_list(
    emsl_dictdf,
    nmdc.OmicsProcessing,
    constructor_map=constructor,
    attribute_fields=attributes,
)

In [16]:
# print(json.dumps(json.loads(emsl_project_json_list[0]), indent=4)) ## peek at data

## Save EMSL omics processing projects

In [17]:
# save json string list to file
dop.save_json_string_list(
    "output/test-pipeline/emsl_omics_processing.json",
    emsl_project_json_list,
)

## Test build EMSL data objects

In [18]:
attributes = ['file_size_bytes']

## constructor argument name -> dataframe column supplying its value
constructor = dict(
    id='data_object_id',
    name='data_object_name',
    description='dataset_type_description',
)

emsl_data_object_json_list = dop.make_json_string_list(
    emsl_dictdf,
    nmdc.DataObject,
    constructor_map=constructor,
    attribute_fields=attributes,
)

In [19]:
# print(json.dumps(json.loads(emsl_data_object_json_list[0]), indent=4)) ## peek at data

## Save EMSL data objects

In [20]:
# save json string list to file
dop.save_json_string_list(
    "output/test-pipeline/emsl_data_objects.json",
    emsl_data_object_json_list,
)

## Test building data objects (faa, fna, fastq)

In [21]:
# fastq_table.head() # peek at data

In [22]:
# Build the data-objects dataframe from the faa, fna, fastq and project tables.
data_objects = dop.make_data_objects_dataframe(
    faa_table,
    fna_table,
    fastq_table,
    project_table,
)
# orient="records" yields a list with one dict per dataframe row.
data_objects_dictdf = data_objects.to_dict(orient="records")

In [23]:
data_objects.shape[0]  # number of rows in the data-objects dataframe

3001

In [24]:
attributes = ['file_size_bytes']

## constructor argument name -> dataframe column supplying its value
constructor = dict(
    id='file_id',
    name='file_name',
    description='file_type_description',
)

data_objects_json_list = dop.make_json_string_list(
    data_objects_dictdf,
    nmdc.DataObject,
    constructor_map=constructor,
    attribute_fields=attributes,
)

In [25]:
# print(json.dumps(json.loads(data_objects_json_list[0]), indent=4)) ## peek at data

## Save faa, fna, fastq data objects

In [26]:
# save json string list to file
dop.save_json_string_list(
    "output/test-pipeline/faa_fna_fastq_data_objects.json",
    data_objects_json_list,
)

## Test building GOLD project json

In [27]:
# data_objects.head()

In [28]:
# Build the project dataframe from the project, study and contact tables plus
# the data-objects dataframe built earlier.
project = dop.make_project_dataframe(
    project_table,
    study_table,
    contact_table,
    data_objects,
)
# project[pds.isnull(project.output_file_ids)]
# project = project[project.nmdc_record_id == "115128"] # test if output_file_ids is null
# project.output_file_ids.unique()
# project.output_file_ids

In [29]:
# orient="records" yields a list with one dict per dataframe row.
project_dictdf = project.to_dict(
    orient="records",
)
# project.columns

In [30]:
## specify characteristics
attributes = [
    # {'part_of': ({'id': 'study_gold_id'}, nmdc.Study)},
    # {'has_output': ({'id': 'output_file_ids'}, nmdc.DataObject)},
    dict(part_of='study_gold_id'),
    dict(has_output='output_file_ids'),
    'add_date',
    'mod_date',
    'completion_date',
    'ncbi_project_name',
    'omics_type',
    'principal_investigator_name',
    'processing_institution',
]

## constructor argument name -> dataframe column supplying its value
constructor = dict(id='gold_id', name='project_name', description='description')

project_json_list = dop.make_json_string_list(
    project_dictdf,
    nmdc.OmicsProcessing,
    constructor_map=constructor,
    attribute_fields=attributes,
)

In [31]:
# print(json.dumps(json.loads(project_json_list[0]), indent=4)) ## peek at data

## Save output

In [32]:
# save json string list to file
dop.save_json_string_list(
    "output/test-pipeline/gold_omics_processing.json",
    project_json_list,
)

## Test building biosample json

In [33]:
# Build the biosample dataframe from the biosample, project-biosample and project tables.
biosample = dop.make_biosample_dataframe(
    biosample_table,
    project_biosample_table,
    project_table,
)

In [34]:
# orient="records" yields a list with one dict per dataframe row.
biosample_dictdf = biosample.to_dict(
    orient="records",
)
# biosample_dictdf[0] ## peek at dict data

In [35]:
## specify attributes
## Biosample fields passed as attribute_fields when building the biosample json.
## Plain strings are field names; the trailing dict maps the 'part_of' slot to
## the 'project_gold_ids' field.
## Fix: 'ecosystem_dcategory' was a typo for 'ecosystem_category' (the study
## attribute list uses 'ecosystem_category').
attributes = \
  [
    'add_date',
    'mod_date',
    'collection_date',
    'ecosystem',
    'ecosystem_category',
    'ecosystem_type',
    'ecosystem_subtype',
    'specific_ecosystem',
    'habitat',
    'location',
    'community',
    'ncbi_taxonomy_name',
    'geographic_location',
    'sample_collection_site',
    'identifier',
    'host_name',
    'depth',
    'subsurface_depth',
    'altitude',
    'proport_woa_temperature',
    'biogas_temperature',
    'growth_temperature',
    'water_samp_store_temp',
    'biogas_retention_time',
    'salinity',
    'pressure',
    'ph',
    'chlorophyll_concentration',
    'nitrate_concentration',
    'oxygen_concentration',
    'salinity_concentration',
    'sample_volume',
    'sample_weight_dna_ext',
    'sampling_strategy',
    'soil_link_climate_info',
    'soil_misc_param',
    # NOTE(review): the next entry repeats 'soil_misc_param' with a trailing space.
    # Kept as-is in case the source data has such a column -- TODO confirm or remove.
    'soil_misc_param ',
    'soil_water_content',
    'soluble_iron_micromol',
    'subsurface_depth2',
    'tot_nitrogen',
    'tot_org_carbon',
    'water_alkalinity',
    'water_alkalinity_method',
    'water_alkyl_diethers',
    'water_aminopept_act',
    'water_ammonium',
    'water_bacterial_carbon_prod',
    'water_bishomohopanol',
    'water_bromide',
    'water_calcium',
    'water_carbon_nitrog_ratio',
    'water_chem_administration',
    'water_chloride',
    'water_density',
    'water_diether_lipids',
    'water_diss_carbon_dioxide',
    'water_diss_hydrogen',
    'water_diss_inorg_carbon',
    'water_diss_inorg_phosphorus',
    'water_diss_org_carbon',
    'water_diss_org_nitrogen',
    'water_glucosidase_activity',
    'water_magnesium',
    'water_mean_frict_vel',
    'water_mean_peak_frict_vel',
    'water_misc_parameter',
    'water_n_alkanes',
    'water_nitrite',
    'water_org_matter',
    'water_org_nitrogen',
    'water_organism_count',
    'water_oxy_stat_sample',
    'water_part_org_carbon',
    'water_perturbation',
    'water_petroleum_hydrocarbon',
    'water_phaeopigments',
    'water_phosplipid_fatt_acid',
    'water_potassium',
    'water_redox_potential',
    'water_samp_store_dur',
    'water_samp_store_loc',
    'water_size_frac_low',
    'water_size_frac_up',
    'water_sodium',
    'water_sulfate',
    'water_sulfide',
    'water_tidal_stage',
    'water_tot_depth_water_col',
    'water_tot_diss_nitro',
    'water_tot_phosphorus',
    'water_turbidity',
    {'part_of': 'project_gold_ids'}
    # {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}
  ]

# removed in version 5: 'temperature_range', 'soil_annual_season_temp'

In [36]:
# os.chdir('../..')
# os.getcwd()

In [37]:
## create map between gold fields and mixs terms
gold_to_mixs_path = "../src/data/GOLD-to-mixs-map.tsv"
mapping_df = dop.make_dataframe(gold_to_mixs_path)
attr_map = dop.make_gold_to_mixs_map(
    attributes,
    mapping_df,
    'biosample',
)

In [38]:
## create dict of constructor args
constructor = {
    'id': 'gold_id',
    'name': 'biosample_name',
    'description': 'description',
}
# Each env_* slot is built as an nmdc.ControlledTermValue whose has_raw_value
# comes from the field of the same name.
for env_field in ('env_broad_scale', 'env_local_scale', 'env_medium'):
    constructor[env_field] = [{'has_raw_value': env_field}, nmdc.ControlledTermValue]
constructor['lat_lon'] = [
    {'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon'},
    nmdc.GeolocationValue,
]

In [39]:
## create list of json string objects
biosample_json_list = dop.make_json_string_list(
    biosample_dictdf,
    nmdc.Biosample,
    constructor_map=constructor,
    attribute_fields=attributes,
    attribute_map=attr_map,
)

In [40]:
# print(json.dumps(json.loads(biosample_json_list[0]), indent=4)) ## peek at data

## Save output

In [41]:
# save json string list to file
dop.save_json_string_list(
    "output/test-pipeline/biosample.json",
    biosample_json_list,
)

## Test subset of output

In [42]:
## navigate to test output directory
# Guarded so that re-running this cell is a no-op: a second relative chdir from
# inside output/test-pipeline would raise FileNotFoundError.
test_output_dir = os.path.join('output', 'test-pipeline')
if not os.getcwd().endswith(test_output_dir):
    os.chdir(test_output_dir)

In [43]:
# display the current working directory (the jq cells below use paths relative to it)
os.getcwd()

'/Users/wdduncan/repos/NMDC/nmdc-metadata/metadata-translation/notebooks/output/test-pipeline'

In [44]:
## grab first five biosamples
!jq '.[0:4]' biosample.json > '../test-five-biosamples/biosample.json'

In [45]:
# Load the biosample subset written by the jq cell into a Python object.
biosample_set = None
with open('../test-five-biosamples/biosample.json', 'r') as biosample_file:
    biosample_set = json.loads(biosample_file.read())

In [46]:
## find first 5 project ids of biosamples:
## print every part_of entry of every biosample, one per line, and keep the first five lines
# !jq -c '.[] | {biosample:.id, project:.part_of[]}' biosample.json | head -n5
!jq '.[] | .part_of[]' biosample.json | head -n5

"gold:Gp0108335"
"gold:Gp0108340"
"gold:Gp0108341"
"gold:Gp0108342"
"gold:Gp0108344"


In [47]:
## create project subset
## select(...) keeps the omics-processing records whose id equals one of the five
## hard-coded project ids; the second jq (--slurp) collects the selected objects
## into a single JSON array before writing project.json
!jq '.[] | select(.id == ("gold:Gp0108335", "gold:Gp0108340", "gold:Gp0108341", "gold:Gp0108342", "gold:Gp0108344"))' \
gold_omics_processing.json \
| jq --slurp '.' \
> '../test-five-biosamples/project.json'

In [48]:
# Load the project subset written by the jq cell into a Python object.
project_set = None
with open('../test-five-biosamples/project.json', 'r') as project_file:
    project_set = json.loads(project_file.read())

In [49]:
## get the study ids: print every part_of entry of each project in the subset
!jq '.[] | .part_of[]' ../test-five-biosamples/project.json

"gold:Gs0112340"
"gold:Gs0112340"
"gold:Gs0112340"
"gold:Gs0112340"
"gold:Gs0112340"


In [50]:
## create study subset
## keep the study record whose id is the hard-coded "gold:Gs0112340"; --slurp
## collects the selected objects into a JSON array before writing study.json
!jq '.[] | select(.id == "gold:Gs0112340")' \
gold_study.json \
| jq --slurp '.' \
> '../test-five-biosamples/study.json'

In [51]:
# Load the study subset written by the jq cell into a Python object.
study_set = None
with open('../test-five-biosamples/study.json', 'r') as study_file:
    study_set = json.loads(study_file.read())

In [52]:
## get outputs of projects: print every has_output entry of each project in the subset
!jq '.[] | .has_output[]' ../test-five-biosamples/project.json

"nmdc:5af44fd364d0b33747747ddb"
"nmdc:5af44fd264d0b33747747dd9"
"jgi:551a20d30d878525404e90d5"
"nmdc:5af0d91764d0b3374773e07a"
"nmdc:5af0d91764d0b3374773e078"
"jgi:551a20d50d878525404e90d7"
"nmdc:5af6f6bd64d0b3374774f9a7"
"nmdc:5af6f6bc64d0b3374774f9a5"
"jgi:551a20d90d878525404e90e1"
"nmdc:5af0d80364d0b3374773e066"
"nmdc:5af0d80264d0b3374773e064"
"jgi:551a20d60d878525404e90d9"
"nmdc:5af65c0864d0b3374774e587"
"nmdc:5af65c0764d0b3374774e559"
"jgi:551a20da0d878525404e90e4"


In [53]:
## create data objects subset
## keep the data objects whose id equals one of the fifteen hard-coded ids;
## --slurp collects the selected objects into a JSON array before writing data_object.json
!jq '.[] | select(.id == ("nmdc:5af44fd364d0b33747747ddb", "nmdc:5af44fd264d0b33747747dd9", "jgi:551a20d30d878525404e90d5", "nmdc:5af0d91764d0b3374773e07a", "nmdc:5af0d91764d0b3374773e078", "jgi:551a20d50d878525404e90d7", "nmdc:5af6f6bd64d0b3374774f9a7", "nmdc:5af6f6bc64d0b3374774f9a5", "jgi:551a20d90d878525404e90e1", "nmdc:5af0d80364d0b3374773e066", "nmdc:5af0d80264d0b3374773e064", "jgi:551a20d60d878525404e90d9", "nmdc:5af65c0864d0b3374774e587", "nmdc:5af65c0764d0b3374774e559", "jgi:551a20da0d878525404e90e4"))' \
faa_fna_fastq_data_objects.json \
| jq --slurp '.' \
> '../test-five-biosamples/data_object.json'

In [54]:
# Load the data-object subset written by the jq cell into a Python object.
data_object_set = None
with open('../test-five-biosamples/data_object.json', 'r') as data_object_file:
    data_object_set = json.loads(data_object_file.read())

In [55]:
# list the files in the current working directory
!ls

biosample.json                  gold_omics_processing.json
emsl_data_objects.json          gold_study.json
emsl_omics_processing.json      nmdc-02.json
faa_fna_fastq_data_objects.json


In [56]:
## emsl projects
# !jq '.[0]' emsl_omics_processing.json 

In [57]:
# !jq '.[0]' emsl_data_objects.json

In [58]:
# Assemble the four loaded subsets into one dict; each value is a fresh list copy.
database = dict(
    study_set=list(study_set),
    omics_processing_set=list(project_set),
    biosample_set=list(biosample_set),
    data_object_set=list(data_object_set),
)

In [59]:
# Serialize the assembled database dict to nmdc-02.json in the current directory.
with open('nmdc-02.json', 'w') as fp:
    fp.write(json.dumps(database))

In [60]:
# list the current working directory again, after writing nmdc-02.json
!ls

biosample.json                  gold_omics_processing.json
emsl_data_objects.json          gold_study.json
emsl_omics_processing.json      nmdc-02.json
faa_fna_fastq_data_objects.json


In [61]:
# print the shell's current working directory
!pwd

/Users/wdduncan/repos/NMDC/nmdc-metadata/metadata-translation/notebooks/output/test-pipeline


In [62]:
# !jq '.' nmdc-02.json | head -n100