In [None]:
import os, sys
sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib

In [2]:
import yaml
import json
from yaml import CLoader as Loader, CDumper as Dumper
import data_operations as dop
from dotted_dict import DottedDict
from collections import namedtuple
import nmdc
import data_operations as dop
from pandasql import sqldf
from pprint import pprint
import pandas as pds
def pysqldf(q):
    return sqldf(q, globals())

In [3]:
spec_file = "../src/bin/lib/nmdc_data_source.yaml"
with open(spec_file, 'r') as input_file:
    spec = DottedDict(yaml.load(input_file, Loader=Loader))

In [4]:
## build merged dataframe from data sources specified in the spec file
# mdf = dop.make_dataframe_from_spec_file (spec_file, nrows=20) #  for testing grab 20 rows
# mdf = dop.make_dataframe_from_spec_file (spec_file, nrows=5) #  for testing grab 5 rows
# mdf = dop.make_dataframe_from_spec_file (spec_file)

## read data from saved file
mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str)
# mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str, nrows=100)

In [5]:
# mdf.to_csv('../src/data/nmdc_merged_data.tsv', sep='\t', index=False) # save mergd data

## Test building study json

In [18]:
study_table = dop.extract_table(mdf, 'study_table')
contact_table = dop.extract_table(mdf, 'contact_table')
proposals_table = dop.extract_table(mdf, 'proposals_table')
project_table = dop.extract_table(mdf, 'project_table')
project_biosample_table = dop.extract_table(mdf, 'project_biosample_table')
biosample_table = dop.extract_table(mdf, 'biosample_table')

In [7]:
study = dop.make_study_dataframe(study_table, contact_table, proposals_table)
study_dictdf = study.to_dict(orient="records") # transorm dataframe to dictionary

In [8]:
## specify attributes
attributes = \
    ['gold_study_name', 'principal_investigator_name', 'add_date', 'mod_date', 'doi',
      'ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem', 'ecosystem_path_id']

constructor = \
    {
        'id': 'gold_id',
        'name': 'study_name',
        'description': 'description'
    }

study_json_list = dop.make_json_string_list\
    (study_dictdf, nmdc.Study, constructor_map=constructor, attribute_fields=attributes)

In [9]:
# print(json.dumps(json.loads(study_json_list[0]), indent=4)) ## peek at data
# print(nmdc.Study.class_class_curie)

## Test building project json

In [10]:
project = dop.make_project_dataframe(project_table, study_table, contact_table)
project_dictdf = project.to_dict(orient="records") # transorm dataframe to dictionary
# project.columns

In [11]:
## specify characteristics
attributes = \
    ['add_date', 'mod_date', 'completion_date', 'ncbi_project_name', 'omics_type', 'principal_investigator_name', 'processing_institution']

## if the constructor references an object (e.g., study)
## put object info in a tuple with cl, and class type second
constructor = \
    {
        'id': 'gold_id',
        'name': 'project_name',
        'description': 'description',
        'part_of': ({'id': 'study_gold_id'}, nmdc.Study)
    }

project_json_list = dop.make_json_string_list\
    (project_dictdf, nmdc.OmicsProcessing, constructor_map=constructor, attribute_fields=attributes)

## create list of json string objects
# project_json_list = dop.make_json_string_list\
#     (project_dictdf, nmdc.OmicsProcessing, id_key='gold_id', name_key='project_name', 
#      part_of_key="study_gold_id", description_key="description", attribute_fields=attributes)

In [12]:
print(json.dumps(json.loads(project_json_list[1]), indent=4)) ## peek at data

{
    "id": "Gp0108340",
    "name": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 612S3M",
    "description": NaN,
    "part_of": [
        {
            "id": "Gs0112340",
            "nmdc_entity_type": "nmdc:Study"
        }
    ],
    "nmdc_entity_type": "nmdc:OmicsProcessing",
    "add_date": {
        "has_raw_value": "30-OCT-14 12.00.00.000000000 AM"
    },
    "mod_date": {
        "has_raw_value": "19-NOV-19 02.18.17.168000000 AM"
    },
    "ncbi_project_name": {
        "has_raw_value": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 612S3M"
    },
    "omics_type": {
        "has_raw_value": "Metagenome"
    },
    "principal_investigator_name": {
        "has_raw_value": "Virginia Rich"
    }
}


## Test building biosample json

In [19]:
biosample = dop.make_biosample_dataframe(biosample_table, project_biosample_table, project_table)
biosample['lat_lon'] = biosample.apply(lambda row: dop.make_lat_lon(row.latitude, row.longitude), axis=1)
# biosample.lat_lon
biosample_dictdf = biosample.to_dict(orient="records") # transorm dataframe to dictionary
# biosample_dictdf[0] ## peek at data

{'nmdc_record_id': '108335',
 'add_date': '30-OCT-14 12.00.00.000000000 AM',
 'altitude': '0.0',
 'biogas_retention_time': nan,
 'biogas_temperature': nan,
 'biosample_id': '108335',
 'biosample_name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D',
 'chlorophyll_concentration': nan,
 'community': 'microbial communities',
 'depth': '0.0',
 'description': nan,
 'ecosystem': 'Environmental',
 'ecosystem_category': 'Terrestrial',
 'ecosystem_path_id': '4206',
 'ecosystem_subtype': 'Unclassified',
 'ecosystem_type': 'Soil',
 'geographic_location': 'Sweden: Kiruna',
 'gold_id': 'Gb0108335',
 'growth_temperature': nan,
 'habitat': 'Thawing permafrost',
 'host_name': nan,
 'identifier': 'studying carbon transformations',
 'latitude': '68.3534',
 'location': 'from the Arctic',
 'longitude': '19.0472',
 'mod_date': '20-APR-18 03.15.48.000000000 PM',
 'ncbi_taxonomy_name': 'permafrost metagenome',
 'nitrate_concentration': nan,
 'o

In [20]:
## specify attributes
attributes = \
    ['lat_lon',
     'add_date',
     'mod_date',
     'ecosystem_path_id',
     'ecosystem',
     'ecosystem_category',
     'ecosystem_type',
     'ecosystem_subtype',
     'specific_ecosystem',
     'habitat',
     'location',
     'community',
     'ncbi_taxonomy_name',
     'geographic_location',
     'latitude',
     'longitude',
     'sample_collection_site',
     'identifier',
     'sample_collection_year',
     'sample_collection_month',
     'sample_collection_day',
     'sample_collection_hour',
     'sample_collection_minute',
     'host_name',
     'depth',
     'subsurface_depth',
     'altitude',
     'temperature_range',
     'proport_woa_temperature',
     'biogas_temperature',
     'growth_temperature',
     'soil_annual_season_temp',
     'water_samp_store_temp',
     'biogas_retention_time',
     'salinity',
     'pressure',
     'ph',
     'chlorophyll_concentration',
     'nitrate_concentration',
     'oxygen_concentration',
     'salinity_concentration'
    ]
## create dict of constructor args
constructor_map = \
    {
        'id': 'gold_id',
        'name': 'biosample_name',
        'description': 'description',
        'lat_lon': 'lat_lon',
        'env_broad_scale': 'env_broad_scale',
        'env_local_scale': 'env_local_scale',
        'env_medium': 'env_medium'
    }

In [21]:
# len(biosample)

In [22]:
## create list of json string objects
biosample_json_list = dop.make_json_string_list\
    (biosample_dictdf, nmdc.Biosample, id_key='gold_id', name_key='biosample_name', 
     part_of_key="project_gold_ids", description_key="description", attribute_fields=attributes)

TypeError: make_json_string_list() got an unexpected keyword argument 'id_key'

In [None]:
# print(json.dumps(json.loads(biosample_json_list[0]), indent=4)) ## peek at data

## Test building EMSL data