In [1]:
import os, sys
sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib

In [2]:
import yaml
import json
from yaml import CLoader as Loader, CDumper as Dumper
import data_operations as dop
from dotted_dict import DottedDict
from collections import namedtuple
import nmdc
import data_operations as dop
from pandasql import sqldf
from pprint import pprint
import pandas as pds
import jsonasobj
def pysqldf(q):
    """Run query ``q`` with pandasql's ``sqldf``, passing this module's globals as the environment."""
    namespace = globals()  # looked up here, so it is this notebook's own global table
    return sqldf(q, namespace)

In [3]:
# Load the data-source specification file and wrap the parsed result in a DottedDict.
# NOTE(review): Loader is yaml.CLoader (see the imports cell), which is a full loader, not a
# safe one. That is fine only while the spec file is trusted; consider CSafeLoader otherwise.
spec_file = "../src/bin/lib/nmdc_data_source.yaml"
with open(spec_file, 'r') as input_file:
    spec = DottedDict(yaml.load(input_file, Loader=Loader))

In [4]:
## Build the merged dataframe from the data sources specified in the spec file.
## (These calls are disabled; the merged data is read from the saved file below instead.)
# mdf = dop.make_dataframe_from_spec_file (spec_file, nrows=20) #  for testing grab 20 rows
# mdf = dop.make_dataframe_from_spec_file (spec_file, nrows=5) #  for testing grab 5 rows
# mdf = dop.make_dataframe_from_spec_file (spec_file)

# Read the merged data from the saved, zipped TSV file. dtype=str reads values as strings;
# missing cells still come back as NaN (see the record peeks further down).
mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str)
# mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str, nrows=100)

In [5]:
# mdf.to_csv('../src/data/nmdc_merged_data.tsv', sep='\t', index=False) # save merged data

## Test building study json

In [6]:
# Extract each named table from the merged dataframe (one extract_table call per name,
# evaluated in the order listed).
(
    study_table,
    contact_table,
    proposals_table,
    project_table,
    project_biosample_table,
    biosample_table,
) = (
    dop.extract_table(mdf, table_name)
    for table_name in (
        'study_table',
        'contact_table',
        'proposals_table',
        'project_table',
        'project_biosample_table',
        'biosample_table',
    )
)
# biosample_table.columns

In [7]:
# Build the study dataframe from the study, contact and proposals tables.
study = dop.make_study_dataframe(study_table, contact_table, proposals_table)
study_dictdf = study.to_dict(orient="records") # transform dataframe to a list of per-row dicts

In [8]:
## Study record fields passed to make_json_string_list as attribute fields
attributes = [
    'gold_study_name',
    'principal_investigator_name',
    'add_date',
    'mod_date',
    'doi',
    'ecosystem',
    'ecosystem_category',
    'ecosystem_type',
    'ecosystem_subtype',
    'specific_ecosystem',
    'ecosystem_path_id',
]

## Constructor argument name -> record field that supplies its value
constructor = {
    'id': 'gold_id',
    'name': 'study_name',
    'description': 'description',
}

study_json_list = dop.make_json_string_list(
    study_dictdf,
    nmdc.Study,
    constructor_map=constructor,
    attribute_fields=attributes,
)

In [9]:
# print(json.dumps(json.loads(study_json_list[0]), indent=4)) ## peek at data
# print(nmdc.Study.class_class_curie)

## Test building project json

In [10]:
# Build the project dataframe from the project, study and contact tables.
project = dop.make_project_dataframe(project_table, study_table, contact_table)
project_dictdf = project.to_dict(orient="records") # transform dataframe to a list of per-row dicts
# project.columns

In [11]:
## Project record fields passed to make_json_string_list as attribute fields
attributes = [
    # {'part_of': ({'id': 'study_gold_id'}, nmdc.Study)},
    'add_date',
    'mod_date',
    'completion_date',
    'ncbi_project_name',
    'omics_type',
    'principal_investigator_name',
    'processing_institution',
]

## If a constructor argument references an object (e.g., a study), give it as a tuple:
## the parameter dict first and the class second.
constructor = {
    'id': 'gold_id',
    'name': 'project_name',
    'description': 'description',
}

project_json_list = dop.make_json_string_list(
    project_dictdf,
    nmdc.OmicsProcessing,
    constructor_map=constructor,
    attribute_fields=attributes,
)

## Earlier keyword-based call, kept for reference:
# project_json_list = dop.make_json_string_list\
#     (project_dictdf, nmdc.OmicsProcessing, id_key='gold_id', name_key='project_name', 
#      part_of_key="study_gold_id", description_key="description", attribute_fields=attributes)

In [12]:
# NOTE(review): the recorded output below shows "description": NaN, which is not valid strict
# JSON. Missing descriptions apparently reach the JSON as NaN; confirm in dop.make_json_string_list.
print(json.dumps(json.loads(project_json_list[1]), indent=4)) ## peek at data

{
    "id": "Gp0108340",
    "name": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 612S3M",
    "description": NaN,
    "type": "nmdc:OmicsProcessing",
    "add_date": {
        "has_raw_value": "30-OCT-14 12.00.00.000000000 AM"
    },
    "mod_date": {
        "has_raw_value": "22-MAY-20 06.10.59.590000000 PM"
    },
    "ncbi_project_name": {
        "has_raw_value": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 612S3M"
    },
    "omics_type": {
        "has_raw_value": "Metagenome"
    },
    "principal_investigator_name": {
        "has_raw_value": "Virginia Rich"
    }
}


## Test building biosample json

In [13]:
# Build the biosample dataframe from the biosample, project-biosample and project tables.
biosample = dop.make_biosample_dataframe(biosample_table, project_biosample_table, project_table)
# Add a lat_lon column by calling dop.make_lat_lon on each row's latitude and longitude.
biosample['lat_lon'] = biosample.apply(lambda row: dop.make_lat_lon(row.latitude, row.longitude), axis=1)
# biosample.lat_lon
biosample_dictdf = biosample.to_dict(orient="records") # transform dataframe to a list of per-row dicts
biosample_dictdf[0] ## peek at data

{'nmdc_record_id': '108335',
 'add_date': '30-OCT-14 12.00.00.000000000 AM',
 'altitude': nan,
 'biogas_retention_time': nan,
 'biogas_temperature': nan,
 'biosample_id': '108335',
 'biosample_name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D',
 'chlorophyll_concentration': nan,
 'community': 'microbial communities',
 'depth': '0.0',
 'description': nan,
 'ecosystem': 'Environmental',
 'ecosystem_category': 'Terrestrial',
 'ecosystem_path_id': '4234',
 'ecosystem_subtype': 'Wetlands',
 'ecosystem_type': 'Soil',
 'env_broad_scale': 'ENVO_00000446',
 'env_local_scale': 'ENVO_00000489',
 'env_medium': 'ENVO_00000134',
 'geographic_location': 'Sweden: Kiruna',
 'gold_id': 'Gb0108335',
 'growth_temperature': nan,
 'habitat': 'Thawing permafrost',
 'host_name': nan,
 'identifier': 'studying carbon transformations',
 'latitude': '68.3534',
 'location': 'from the Arctic',
 'longitude': '19.0472',
 'mod_date': '15-MAY-20 10.04.

In [14]:
## Biosample record fields passed to make_json_string_list as attribute fields.
## The last entry is a slot mapping: slot name -> (parameter dict, class).
attributes = [
    'lat_lon', 'add_date', 'mod_date',
    'ecosystem_path_id', 'ecosystem', 'ecosystem_category', 'ecosystem_type',
    'ecosystem_subtype', 'specific_ecosystem',
    'habitat', 'location', 'community', 'ncbi_taxonomy_name',
    'geographic_location', 'latitude', 'longitude',
    'sample_collection_site', 'identifier',
    'sample_collection_year', 'sample_collection_month', 'sample_collection_day',
    'sample_collection_hour', 'sample_collection_minute',
    'host_name', 'depth', 'subsurface_depth', 'altitude',
    'proport_woa_temperature', 'biogas_temperature', 'growth_temperature',
    'water_samp_store_temp', 'biogas_retention_time',
    'salinity', 'pressure', 'ph',
    'chlorophyll_concentration', 'nitrate_concentration', 'oxygen_concentration',
    'salinity_concentration',
    {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)},
]

# removed in version 5: 'temperature_range', 'soil_annual_season_temp'

## Constructor argument name -> record field, or -> (parameter dict, class) when the
## argument is itself an object built from record fields.
constructor = {
    'id': 'gold_id',
    'name': 'biosample_name',
    'description': 'description',
    # plain-field alternatives for the env_* arguments (disabled):
    #'env_broad_scale': 'env_broad_scale',
    #'env_local_scale': 'env_local_scale',
    #'env_medium': 'env_medium',
    'env_broad_scale': ({'has_raw_value': 'env_broad_scale'}, nmdc.ControlledTermValue),
    'env_local_scale': ({'has_raw_value': 'env_local_scale'}, nmdc.ControlledTermValue),
    'env_medium': ({'has_raw_value': 'env_medium'}, nmdc.ControlledTermValue),
    'lat_lon': (
        {'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon'},
        nmdc.GeolocationValue,
    ),
    #'part_of': {'id': 'project_gold_ids', 'nmdc_entity_type': nmdc.OmicsProcessing.class_class_curie}
}

In [15]:
len(biosample)
# biosample.columns
# biosample.lat_lon

964

In [16]:
## Create the list of JSON string objects for the biosample records
biosample_json_list = dop.make_json_string_list(
    biosample_dictdf,
    nmdc.Biosample,
    constructor_map=constructor,
    attribute_fields=attributes,
)

## Earlier keyword-based call, kept for reference:
# biosample_json_list = dop.make_json_string_list\
#     (biosample_dictdf, nmdc.Biosample, id_key='gold_id', name_key='biosample_name', 
#      part_of_key="project_gold_ids", description_key="description", attribute_fields=attributes)

In [17]:
print(json.dumps(json.loads(biosample_json_list[0]), indent=4)) ## peek at data

{
    "id": "Gb0108335",
    "name": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D",
    "description": NaN,
    "lat_lon": {
        "has_raw_value": "68.3534 19.0472",
        "latitude": "68.3534",
        "longitude": "19.0472",
        "type": "nmdc:GeolocationValue"
    },
    "env_broad_scale": {
        "has_raw_value": "ENVO_00000446",
        "type": "nmdc:ControlledTermValue"
    },
    "env_local_scale": {
        "has_raw_value": "ENVO_00000489",
        "type": "nmdc:ControlledTermValue"
    },
    "env_medium": {
        "has_raw_value": "ENVO_00000134",
        "type": "nmdc:ControlledTermValue"
    },
    "ecosystem": {
        "has_raw_value": "Environmental"
    },
    "ecosystem_category": {
        "has_raw_value": "Terrestrial"
    },
    "ecosystem_type": {
        "has_raw_value": "Soil"
    },
    "ecosystem_subtype": {
        "has_raw_value": "Wetlands"
    },
    "specific_ecosystem": {
       

## Test building EMSL data

In [18]:
# tmap = \
# {
#  'nmdc_record_id': '108335',
#  'add_date': '30-OCT-14 12.00.00.000000000 AM',
#  'altitude': None,
#  'biogas_retention_time': None,
#  'biogas_temperature': None,
#  'biosample_id': '108335',
#  'biosample_name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D',
#  'chlorophyll_concentration': None,
#  'community': 'microbial communities',
#  'depth': '0.0',
#  'description': None,
#  'ecosystem': 'Environmental',
#  'ecosystem_category': 'Terrestrial',
#  'ecosystem_path_id': '4234',
#  'ecosystem_subtype': 'Wetlands',
#  'ecosystem_type': 'Soil',
#  'env_broad_scale': 'ENVO_00000446',
#  'env_local_scale': 'ENVO_00000489',
#  'env_medium': 'ENVO_00000134',
#  'geographic_location': 'Sweden: Kiruna',
#  'gold_id': 'Gb0108335',
#  'growth_temperature': None,
#  'habitat': 'Thawing permafrost',
#  'host_name': None,
#  'identifier': 'studying carbon transformations',
#  'latitude': '68.3534',
#  'location': 'from the Arctic',
#  'longitude': '19.0472',
#  'mod_date': '15-MAY-20 10.04.19.473000000 AM',
#  'ncbi_taxonomy_name': 'permafrost metagenome',
#  'nitrate_concentration': None,
#  'oxygen_concentration': None,
#  'ph': None,
#  'pressure': None,
#  'proport_woa_temperature': None,
#  'salinity': None,
#  'salinity_concentration': None,
#  'sample_collection_day': '20.0',
#  'sample_collection_hour': None,
#  'sample_collection_minute': None,
#  'sample_collection_month': '7.0',
#  'sample_collection_site': 'Palsa',
#  'sample_collection_year': '2012.0',
#  'specific_ecosystem': 'Permafrost',
#  'subsurface_depth': None,
#  'water_samp_store_temp': None,
#  'project_id': '108335',
#  'project_gold_ids': 'Gp0108335',
#  'lat_lon': '68.3534 19.0472'
# }



In [19]:
## Constructor argument name -> record field, or -> (parameter dict, class) when the
## argument is an object built from record fields. Consumed by make_param_args.
constructor_map = {
    'id': 'gold_id',
    'name': 'biosample_name',
    'description': 'description',
    'env_broad_scale': ({'has_raw_value': 'env_broad_scale'}, nmdc.ControlledTermValue),
    'env_local_scale': ({'has_raw_value': 'env_local_scale'}, nmdc.ControlledTermValue),
    'env_medium': ({'has_raw_value': 'env_medium'}, nmdc.ControlledTermValue),
    #'lat_lon': ({'has_raw_value': 'lat_lon'}, nmdc.GeolocationValue),
    'lat_lon': (
        {'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon'},
        nmdc.GeolocationValue,
    ),
    # 'part_of': {'id': 'project_gold_ids', 'nmdc_entity_type': nmdc.OmicsProcessing.class_class_curie}
}


In [20]:
# Two hand-written biosample-style records (a list of dicts, despite the name) used to
# exercise the prototype functions below without the full dataset.
# NOTE(review): both records carry the same gold_id ('Gb0108335') although their
# nmdc_record_id values differ; confirm this is intended, since constructor_map uses
# gold_id for the 'id' argument.
dictionary = \
[
    {
     'nmdc_record_id': '108335',
     'depth': '0.0',
     'biosample_name': 'record 1',
     'description': None,
     'ecosystem': 'Environmental',
     'ecosystem_category': 'Terrestrial',
     'ecosystem_path_id': '4234',
     'ecosystem_subtype': 'Wetlands',
     'ecosystem_type': 'Soil',
     'env_broad_scale': 'ENVO_00000446',
     'env_local_scale': 'ENVO_00000489',
     'env_medium': 'ENVO_00000134',
     'gold_id': 'Gb0108335',
     'habitat': 'Thawing permafrost',
     'host_name': None,
     'latitude': '68.3534',
     'longitude': '19.0472',
     'ph': None,
     'project_id': '108335',
     'project_gold_ids': 'Gp0108335',
     'lat_lon': '68.3534 19.0472'
    },
    {
     'nmdc_record_id': '108999',
     'depth': '10.0',
     'biosample_name': 'record 2',
     'description': None,
     'ecosystem': 'Environmental',
     'ecosystem_category': 'Soil',
     'ecosystem_path_id': '4234',
     'ecosystem_subtype': 'Wetlands',
     'ecosystem_type': 'Soil',
     'env_broad_scale': 'ENVO_00000446',
     'env_local_scale': 'ENVO_00000489',
     'env_medium': 'ENVO_00000134',
     'gold_id': 'Gb0108335',
     'habitat': 'Thawing permafrost',
     'host_name': None,
     'latitude': '68.3534',
     'longitude': '19.0472',
     'ph': None,
     'project_id': '108335',
     'project_gold_ids': 'Gp0108335',
     'lat_lon': '68.3534 19.0472'
    }
]



In [21]:
# A one-element list holding a single hand-written biosample-style record, for trying
# the prototype code on exactly one input.
single = \
[
  {
     'nmdc_record_id': '108335',
     'depth': '0.0',
     'biosample_name': 'record single',
     'description': None,
     'ecosystem': 'Environmental',
     'ecosystem_category': 'Terrestrial',
     'ecosystem_path_id': '4234',
     'ecosystem_subtype': 'Wetlands',
     'ecosystem_type': 'Soil',
     'env_broad_scale': 'ENVO_00000446',
     'env_local_scale': 'ENVO_00000489',
     'env_medium': 'ENVO_00000134',
     'gold_id': 'Gb0108335',
     'habitat': 'Thawing permafrost',
     'host_name': None,
     'latitude': '68.3534',
     'longitude': '19.0472',
     'ph': None,
     'project_id': '108335',
     'project_gold_ids': 'Gp0108335',
     'lat_lon': '68.3534 19.0472'
    }
]

In [22]:
## Attribute fields for the prototype below: plain record field names, plus one slot
## mapping of the form slot name -> (parameter dict, class).
attributes = [
    'lat_lon',
    'ecosystem_path_id', 'ecosystem', 'ecosystem_category', 'ecosystem_type',
    'ecosystem_subtype', 'specific_ecosystem',
    'latitude', 'longitude',
    'depth', 'altitude', 'pressure', 'ph',
    {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)},
]

In [23]:
def make_attribute_value (data_value):
    """
    Create a new nmdc.AttributeValue and set its has_raw_value to data_value.

    Returns the AttributeValue object.
    """
    attribute_value = nmdc.AttributeValue()
    setattr(attribute_value, 'has_raw_value', data_value)
    return attribute_value

# Slots whose id field may hold several comma-separated ids; these map to a list of entities.
_MULTIVALUED_SLOTS = ('part_of', 'has_input', 'has_output')


def _is_missing_id(value):
    """Return True for None and for a float NaN (how missing cells appear in the records)."""
    # NaN is the only float that is not equal to itself.
    return value is None or (isinstance(value, float) and value != value)


def _map_slot_to_entity(slot_map, record):
    """
    Shared implementation behind map_slot_to_entity_1 and map_slot_to_entity_2.

    slot_map is a one-entry dict of the form
        {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}
    i.e. slot name -> (parameter dict, class). Only the first entry is used.
    """
    slot_name, slot_value = list(slot_map.items())[0]
    param_dict, nmdc_class = slot_value[0], slot_value[1]
    raw_id = record[param_dict['id']]

    if slot_name in _MULTIVALUED_SLOTS:
        # A missing id yields no references instead of failing on .split().
        if _is_missing_id(raw_id):
            return []
        # Strip whitespace around each id and drop empty pieces (e.g. from a trailing comma).
        id_values = (id_val.strip() for id_val in raw_id.split(','))
        return [nmdc_class(**{'id': id_val}) for id_val in id_values if id_val]

    return nmdc_class(**{'id': raw_id})


def map_slot_to_entity_2(slot_map, record):
    """
    Build the entity (or entities) referenced by a slot mapping for one record.

    Parameters:
        slot_map: one-entry dict, slot name -> (parameter dict, class); the parameter
            dict's 'id' value names the record field that holds the id(s).
        record: mapping from field name to value.

    Returns:
        For the slots 'part_of', 'has_input' and 'has_output': a list with one class
        instance per comma-separated id (whitespace stripped, empty pieces dropped;
        an empty list when the field is None or NaN).
        For any other slot: a single class instance built from the field value.

    Raises:
        IndexError if slot_map is empty; KeyError if the parameter dict has no 'id'
        or a dict record lacks the id field.
    """
    return _map_slot_to_entity(slot_map, record)


def map_slot_to_entity_1(slot_map, record):
    """
    Build the entity (or entities) referenced by a slot mapping for one record.

    Behaves exactly like map_slot_to_entity_2; both names are kept so existing calls
    keep working. See that function for parameters, return value and errors.
    """
    return _map_slot_to_entity(slot_map, record)

def _none_if_nan(value):
    """Return None in place of a float NaN; every other value is returned unchanged."""
    if isinstance(value, float) and value != value:  # NaN is the only float unequal to itself
        return None
    return value


def make_param_args (param_map, record):
    """
    Build the keyword arguments needed to instantiate a class from one record.

    Parameters:
        param_map: dict mapping a constructor parameter name to one of
            * a record field name: the field's value becomes the argument;
            * a tuple (params, cls): params maps cls's parameter names to record
              fields, and the argument becomes cls(**values), e.g.
              'lat_lon': ({'has_raw_value': 'lat_lon'}, nmdc.GeolocationValue);
            * a dict: skipped here, no argument is produced for it.
        record: mapping from field name to value.

    Returns:
        dict of parameter name -> argument value. Float NaN values (missing cells in
        records that come from a dataframe) are replaced by None so they do not end
        up as NaN in the serialized JSON.

    Raises:
        KeyError if a dict record lacks a referenced field.
    """
    result_dict = {}
    for param_key, param_value in param_map.items():
        # Dict-valued entries are not constructor arguments for this function.
        if isinstance(param_value, dict):
            continue

        if isinstance(param_value, tuple):
            # Index 0 is the parameter dict, index 1 is the class to instantiate.
            field_map, value_class = param_value[0], param_value[1]
            params = {key: _none_if_nan(record[field]) for key, field in field_map.items()}
            result_dict[param_key] = value_class(**params)
        else:
            result_dict[param_key] = _none_if_nan(record[param_value])

    return result_dict

In [24]:
# for record_key, record_value in tmap.items():
#     print(record_value)
# Choose which hand-written record list to convert (a list of dicts, despite the name).
records_dict = dictionary
# records_dict = single
# print(len(records_dict))

# For each record: build the constructor arguments, instantiate a Biosample, then keep
# its JSON form as a parsed Python structure (as_json output is passed to json.loads).
obj_list = []
for record in records_dict:
    constructor_args = make_param_args(constructor_map, record)
    obj = nmdc.Biosample(**constructor_args)
    obj_list.append(json.loads(jsonasobj.as_json(obj)))
#     obj_list.append(obj)
#     obj_list.append(jsonasobj.as_json(obj))
# json.dumps(obj_list[0], indent=4)

In [25]:
json_list = []  # list to hold one JSON string per object
    
## iterate over the parsed objects in obj_list and serialize each back to a JSON string
for d in obj_list:
    json_list.append(json.dumps(d))

# Peek at the first entry: parse the string again so it can be re-dumped with indentation.
print(json.dumps(json.loads(json_list[0]), indent=4))
# print(json.dumps(json_list[0], indent=4))

{
    "id": "Gb0108335",
    "name": "record 1",
    "lat_lon": {
        "has_raw_value": "68.3534 19.0472",
        "latitude": "68.3534",
        "longitude": "19.0472"
    },
    "env_broad_scale": {
        "has_raw_value": "ENVO_00000446"
    },
    "env_local_scale": {
        "has_raw_value": "ENVO_00000489"
    },
    "env_medium": {
        "has_raw_value": "ENVO_00000134"
    }
}


In [26]:
# Prototype: attach attribute values and part_of references to plain NamedThing objects.
# records_dict = dictionary
records_dict = single
nmdc_class = nmdc.NamedThing

obj_list = []
# Make sure the class has an attribute for every requested slot name.
# NOTE(review): setattr on the class itself modifies nmdc.NamedThing for the rest of the
# kernel session, not only for the objects built below.
for af in attributes:
    if type({}) == type(af): af = list(af.keys())[0]  # dict entry: use its first key as the slot name (author's note: add this)
    if not hasattr(nmdc_class, af): setattr(nmdc_class, af, None)

# print(nmdc_class)   

for record in records_dict:
    obj = nmdc_class(id='foobar')  # placeholder id used for every object in this prototype
    
    # Wrap every non-missing, non-empty field that is listed in attributes.
    for key, item in record.items():
        if (not pds.isnull(item)) and ('' != item) and (not (item is None)) and (key in attributes):
            av = make_attribute_value(item)
            setattr(obj, key, av)
            
    # Dict entries in attributes are slot mappings; only 'part_of' is handled here.
    for af in attributes:
        if type({}) == type(af):
            slot_name = list(af.keys())[0]
            
            if 'part_of' == slot_name:
                obj.part_of = map_slot_to_entity_1(af, record)
            
    # Store the object's JSON form as a parsed Python structure.
    obj_list.append(json.loads(jsonasobj.as_json(obj)))

print(json.dumps(obj_list[0], indent=4))

{
    "id": "foobar",
    "depth": {
        "has_raw_value": "0.0"
    },
    "ecosystem": {
        "has_raw_value": "Environmental"
    },
    "ecosystem_category": {
        "has_raw_value": "Terrestrial"
    },
    "ecosystem_path_id": {
        "has_raw_value": "4234"
    },
    "ecosystem_subtype": {
        "has_raw_value": "Wetlands"
    },
    "ecosystem_type": {
        "has_raw_value": "Soil"
    },
    "latitude": {
        "has_raw_value": "68.3534"
    },
    "longitude": {
        "has_raw_value": "19.0472"
    },
    "lat_lon": {
        "has_raw_value": "68.3534 19.0472"
    },
    "part_of": [
        {
            "id": "Gp0108335"
        }
    ]
}


In [27]:
x = {'a': 1, 'b': 2, 'c': 3}

In [28]:
for z_key, z_value in x.items():
    print(x[z_key])

1
2
3


In [29]:
len(dictionary)

2

In [30]:
x.keys()

dict_keys(['a', 'b', 'c'])

In [31]:
d = {'a':1}

In [32]:
list(d.keys())[0]

'a'

In [33]:
d

{'a': 1}

In [34]:
list(d.values())[0]

1

In [35]:
study_obj = nmdc.Study(**{'id': 'Gs1001'})
study_obj

Study(id='Gs1001', name=None, description=None, alternate_identifiers=[], submitted_to_insdc=None, investigation_type=None, project_name=None, experimental_factor=None)

In [36]:
json.loads(jsonasobj.as_json(study_obj))

{'id': 'Gs1001'}