In [1]:
import os, sys
sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib

In [2]:
import yaml
import json
from yaml import CLoader as Loader, CDumper as Dumper
import data_operations as dop
from dotted_dict import DottedDict
from collections import namedtuple
import nmdc
import data_operations as dop
from pandasql import sqldf
from pprint import pprint
import pandas as pds
import jsonasobj

from pandasql import sqldf
def pysqldf(q):
    return sqldf(q, globals())

In [3]:
spec_file = "../src/bin/lib/nmdc_data_source.yaml"
with open(spec_file, 'r') as input_file:
    spec = DottedDict(yaml.load(input_file, Loader=Loader))

In [4]:
## build merged dataframe from data sources specified in the spec file
# mdf = dop.make_dataframe_from_spec_file (spec_file)
# mdf.to_csv('../src/data/nmdc_merged_data.tsv', sep='\t', index=False) # save mergd data

In [5]:
# read data from saved file
mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str)
# mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str, nrows=100)

In [6]:
mdf.nmdc_data_source.unique() ## list of the data sources in merged

array(['study_table', 'contact_table', 'proposals_table', 'project_table',
       'project_biosample_table', 'biosample_table', 'ficus_faa_table',
       'ficus_fna_table', 'ficus_fastq_table', 'ficus_jgi_emsl',
       'ficus_emsl'], dtype=object)

## Test building study json

In [7]:
study_table = dop.extract_table(mdf, 'study_table')
contact_table = dop.extract_table(mdf, 'contact_table')
proposals_table = dop.extract_table(mdf, 'proposals_table')
project_table = dop.extract_table(mdf, 'project_table')
jgi_emsl_table = dop.extract_table(mdf, 'ficus_jgi_emsl')
emsl_table = dop.extract_table(mdf, 'ficus_emsl')
faa_table = dop.extract_table(mdf, 'ficus_faa_table')
fna_table = dop.extract_table(mdf, 'ficus_fna_table')
fastq_table = dop.extract_table(mdf, 'ficus_fasq_table')
project_biosample_table = dop.extract_table(mdf, 'project_biosample_table')
biosample_table = dop.extract_table(mdf, 'biosample_table')
# biosample_table.columns

In [8]:
study = dop.make_study_dataframe(study_table, contact_table, proposals_table)
study_dictdf = study.to_dict(orient="records") # transorm dataframe to dictionary

In [9]:
## specify attributes
attributes = \
    ['gold_study_name', 'principal_investigator_name', 'add_date', 'mod_date', 'doi',
      'ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem', 'ecosystem_path_id']

constructor = \
    {
        'id': 'gold_id',
        'name': 'study_name',
        'description': 'description'
    }

study_json_list = dop.make_json_string_list\
    (study_dictdf, nmdc.Study, constructor_map=constructor, attribute_fields=attributes)

In [10]:
# print(json.dumps(json.loads(study_json_list[0]), indent=4)) ## peek at data
# print(nmdc.Study.class_class_curie)

## Save study output

In [53]:
# dop.save_json_string_list("output/test-pipeline/gold_study.json", study_json_list) # save json string list to file

## Test building EMSL omics processing projects

In [12]:
emsl = dop.make_emsl_datafame(emsl_table, jgi_emsl_table, study_table)
emsl_dictdf = emsl.to_dict(orient="records") # transorm dataframe to dictionary

In [13]:
len(emsl)
# emsl.head()
# emsl.columns

6490

In [14]:
attributes = \
    [
      #'file_size',
      {'part_of': ({'id': 'gold_study_id'}, nmdc.Study)},
      {'has_output': ({'id': 'data_object_id'}, nmdc.DataObject)}
    ]

constructor = \
    {
        'id': 'dataset_id',
        'name': 'dataset_name',
        'description': 'dataset_type_description'
    }

emsl_project_json_list = dop.make_json_string_list\
    (emsl_dictdf, nmdc.OmicsProcessing, constructor_map=constructor, attribute_fields=attributes)

In [15]:
print(json.dumps(json.loads(emsl_project_json_list[0]), indent=4)) ## peek at data

{
    "id": "359123",
    "name": "GCMS_Blank_01_29Jan14",
    "description": "Full scan GC-MS (but not GC QExactive, which is EI-HMS)",
    "part_of": [
        {
            "id": "Gs0110132",
            "type": "nmdc:Study"
        }
    ],
    "type": "nmdc:OmicsProcessing",
    "has_ouput": [
        {
            "id": "output_359123",
            "type": "nmdc:DataObject"
        }
    ]
}


## Save EMSL omics processing projects

In [55]:
# dop.save_json_string_list("output/test-pipeline/emsl_omics_processing.json", emsl_project_json_list) # save json string list to file

## Test build EMSL data objects

In [16]:
attributes = \
    [
      'file_size'
    ]

constructor = \
    {
        'id': 'data_object_id',
        'name': 'data_object_name',
        'description': 'dataset_type_description'
    }

emsl_data_object_json_list = dop.make_json_string_list\
    (emsl_dictdf, nmdc.DataObject, constructor_map=constructor, attribute_fields=attributes)

In [17]:
print(json.dumps(json.loads(emsl_data_object_json_list[0]), indent=4)) ## peek at data

{
    "id": "output_359123",
    "name": "output: GCMS_Blank_01_29Jan14",
    "description": "Full scan GC-MS (but not GC QExactive, which is EI-HMS)",
    "file_size": {
        "has_raw_value": "1547264"
    },
    "type": "nmdc:DataObject"
}


## Save EMSL data objects

In [54]:
# dop.save_json_string_list("output/test-pipeline/emsl_data_objects.json", emsl_data_object_json_list) # save json string list to file

## Test building data obects (faa, fna, fastq)

In [18]:
# fasq_table.head() # peek at data

In [19]:
data_objects = dop.make_data_objects_datafame(faa_table, fna_table, fastq_table, project_table)
data_objects_dictdf = data_objects.to_dict(orient="records") # transorm dataframe to dictionary

In [20]:
len(data_objects)

2046

In [21]:
attributes = \
    [
      'file_size'
    ]

constructor = \
    {
        'id': 'file_id',
        'name': 'file_name',
        'description': 'file_type_description'
    }

data_objects_json_list = dop.make_json_string_list\
    (data_objects_dictdf, nmdc.DataObject, constructor_map=constructor, attribute_fields=attributes)

In [22]:
print(json.dumps(json.loads(data_objects_json_list[0]), indent=4)) ## peek at data

{
    "id": "53d880b90d87856ba82affcd",
    "name": "43182.assembled.faa",
    "description": "FASTA amino acid sequence",
    "file_size": {
        "has_raw_value": "1337578"
    },
    "type": "nmdc:DataObject"
}


## Save faa, fna, fastq output

In [51]:
# dop.save_json_string_list("output/test-pipeline/faa_fna_fastq_data_objects.json", data_objects_json_list) # save json string list to file

## Test building GOLD project json

In [24]:
# data_objects.head()

In [25]:
project = dop.make_project_dataframe(project_table, study_table, contact_table, data_objects)
# project[pds.isnull(project.output_file_ids)]
# project = project[project.nmdc_record_id == "115128"] # test if output_file_ids is null

In [26]:
project_dictdf = project.to_dict(orient="records") # transorm dataframe to dictionary
# project.columns

In [27]:
## specify characteristics
attributes = \
    [
      {'part_of': ({'id': 'study_gold_id'}, nmdc.Study)},
      {'has_output': ({'id': 'output_file_ids'}, nmdc.DataObject)},
      'add_date', 
      'mod_date', 
      'completion_date', 
      'ncbi_project_name', 
      'omics_type', 
      'principal_investigator_name',
      'processing_institution'
    ]


constructor = \
    {
        'id': 'gold_id',
        'name': 'project_name',
        'description': 'description'
    }

project_json_list = dop.make_json_string_list\
    (project_dictdf, nmdc.OmicsProcessing, constructor_map=constructor, attribute_fields=attributes)

In [28]:
print(json.dumps(json.loads(project_json_list[0]), indent=4)) ## peek at data

{
    "id": "Gp0108335",
    "name": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D",
    "description": NaN,
    "part_of": [
        {
            "id": "Gs0112340",
            "type": "nmdc:Study"
        }
    ],
    "type": "nmdc:OmicsProcessing",
    "add_date": {
        "has_raw_value": "30-OCT-14 12.00.00.000000000 AM"
    },
    "mod_date": {
        "has_raw_value": "22-MAY-20 06.13.12.927000000 PM"
    },
    "ncbi_project_name": {
        "has_raw_value": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D"
    },
    "omics_type": {
        "has_raw_value": "Metagenome"
    },
    "principal_investigator_name": {
        "has_raw_value": "Virginia Rich"
    },
    "has_ouput": [
        {
            "id": "5af44fd364d0b33747747ddb",
            "type": "nmdc:DataObject"
        },
        {
            "id": "5af44fd264d0b33747747dd9",
            "

## Save output

In [49]:
# dop.save_json_string_list("output/test-pipeline/gold_omics_processing.json", project_json_list) # save json string list to file

## Test building biosample json

In [30]:
biosample = dop.make_biosample_dataframe(biosample_table, project_biosample_table, project_table)
biosample['lat_lon'] = biosample.apply(lambda row: dop.make_lat_lon(row.latitude, row.longitude), axis=1) # add lat_lon column
biosample_dictdf = biosample.to_dict(orient="records") # transorm dataframe to dictionary
# biosample_dictdf[0] ## peek at dict data

In [31]:
## specify attributes
attributes = \
    ['lat_lon',
     'add_date',
     'mod_date',
     'ecosystem_path_id',
     'ecosystem',
     'ecosystem_category',
     'ecosystem_type',
     'ecosystem_subtype',
     'specific_ecosystem',
     'habitat',
     'location',
     'community',
     'ncbi_taxonomy_name',
     'geographic_location',
     'latitude',
     'longitude',
     'sample_collection_site',
     'identifier',
     'sample_collection_year',
     'sample_collection_month',
     'sample_collection_day',
     'sample_collection_hour',
     'sample_collection_minute',
     'host_name',
     'depth',
     'subsurface_depth',
     'altitude',
     'proport_woa_temperature',
     'biogas_temperature',
     'growth_temperature',
     'water_samp_store_temp',
     'biogas_retention_time',
     'salinity',
     'pressure',
     'ph',
     'chlorophyll_concentration',
     'nitrate_concentration',
     'oxygen_concentration',
     'salinity_concentration',
     {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}
    ]

# removed in version 5: 'temperature_range', 'soil_annual_season_temp'

## create dict of constructor args
constructor = \
    {
        'id': 'gold_id',
        'name': 'biosample_name',
        'description': 'description',
        'env_broad_scale': ({'has_raw_value':'env_broad_scale'}, nmdc.ControlledTermValue),
        'env_local_scale': ({'has_raw_value':'env_local_scale'}, nmdc.ControlledTermValue),
        'env_medium': ({'has_raw_value': 'env_medium'}, nmdc.ControlledTermValue),
        'lat_lon': ({'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon'}, nmdc.GeolocationValue),
    }

In [32]:
len(biosample)
# biosample.columns
# biosample.lat_lon

964

In [33]:
## create list of json string objects
biosample_json_list = dop.make_json_string_list\
    (biosample_dictdf, nmdc.Biosample, constructor_map=constructor, attribute_fields=attributes)

In [34]:
print(json.dumps(json.loads(biosample_json_list[0]), indent=4)) ## peek at data

{
    "id": "Gb0108335",
    "name": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D",
    "description": NaN,
    "lat_lon": {
        "has_raw_value": "68.3534 19.0472",
        "latitude": "68.3534",
        "longitude": "19.0472",
        "type": "nmdc:GeolocationValue"
    },
    "env_broad_scale": {
        "has_raw_value": "ENVO_00000446",
        "type": "nmdc:ControlledTermValue"
    },
    "env_local_scale": {
        "has_raw_value": "ENVO_00000489",
        "type": "nmdc:ControlledTermValue"
    },
    "env_medium": {
        "has_raw_value": "ENVO_00000134",
        "type": "nmdc:ControlledTermValue"
    },
    "ecosystem": {
        "has_raw_value": "Environmental"
    },
    "ecosystem_category": {
        "has_raw_value": "Terrestrial"
    },
    "ecosystem_type": {
        "has_raw_value": "Soil"
    },
    "ecosystem_subtype": {
        "has_raw_value": "Wetlands"
    },
    "specific_ecosystem": {
       

## Save output

In [35]:
# dop.save_json_string_list("output/test-pipeline//biosample.json", biosample_json_list) # save json string list to file

In [36]:
constructor_map = \
    {
        'id': 'gold_id',
        'name': 'biosample_name',
        'description': 'description',
        'env_broad_scale': ({'has_raw_value':'env_broad_scale'}, nmdc.ControlledTermValue),
        'env_local_scale': ({'has_raw_value':'env_local_scale'}, nmdc.ControlledTermValue),
        'env_medium': ({'has_raw_value': 'env_medium'}, nmdc.ControlledTermValue),
        #'lat_lon': ({'has_raw_value': 'lat_lon'}, nmdc.GeolocationValue),
        'lat_lon': ({'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon'}, nmdc.GeolocationValue)
#         'part_of': {'id': 'project_gold_ids', 'nmdc_entity_type': nmdc.OmicsProcessing.class_class_curie}
    }


In [37]:
dictionary = \
[
    {
     'nmdc_record_id': '108335',
     'depth': '0.0',
     'biosample_name': 'record 1',
     'description': None,
     'ecosystem': 'Environmental',
     'ecosystem_category': 'Terrestrial',
     'ecosystem_path_id': '4234',
     'ecosystem_subtype': 'Wetlands',
     'ecosystem_type': 'Soil',
     'env_broad_scale': 'ENVO_00000446',
     'env_local_scale': 'ENVO_00000489',
     'env_medium': 'ENVO_00000134',
     'gold_id': 'Gb0108335',
     'habitat': 'Thawing permafrost',
     'host_name': None,
     'latitude': '68.3534',
     'longitude': '19.0472',
     'ph': None,
     'project_id': '108335',
     'project_gold_ids': 'Gp0108335',
     'lat_lon': '68.3534 19.0472'
    },
    {
     'nmdc_record_id': '108999',
     'depth': '10.0',
     'biosample_name': 'record 2',
     'description': None,
     'ecosystem': 'Environmental',
     'ecosystem_category': 'Soil',
     'ecosystem_path_id': '4234',
     'ecosystem_subtype': 'Wetlands',
     'ecosystem_type': 'Soil',
     'env_broad_scale': 'ENVO_00000446',
     'env_local_scale': 'ENVO_00000489',
     'env_medium': 'ENVO_00000134',
     'gold_id': 'Gb0108335',
     'habitat': 'Thawing permafrost',
     'host_name': None,
     'latitude': '68.3534',
     'longitude': '19.0472',
     'ph': None,
     'project_id': '108335',
     'project_gold_ids': 'Gp0108335',
     'lat_lon': '68.3534 19.0472'
    }
]



In [38]:
single = \
[
  {
     'nmdc_record_id': '108335',
     'depth': '0.0',
     'biosample_name': 'record single',
     'description': None,
     'ecosystem': 'Environmental',
     'ecosystem_category': 'Terrestrial',
     'ecosystem_path_id': '4234',
     'ecosystem_subtype': 'Wetlands',
     'ecosystem_type': 'Soil',
     'env_broad_scale': 'ENVO_00000446',
     'env_local_scale': 'ENVO_00000489',
     'env_medium': 'ENVO_00000134',
     'gold_id': 'Gb0108335',
     'habitat': 'Thawing permafrost',
     'host_name': None,
     'latitude': '68.3534',
     'longitude': '19.0472',
     'ph': None,
     'project_id': '108335',
     'project_gold_ids': 'Gp0108335',
     'lat_lon': '68.3534 19.0472'
    }
]

In [39]:
## specify attributes
attributes = \
    ['lat_lon',
     'ecosystem_path_id',
     'ecosystem',
     'ecosystem_category',
     'ecosystem_type',
     'ecosystem_subtype',
     'specific_ecosystem',
     'latitude',
     'longitude',
     'depth',
     'altitude',
     'pressure',
     'ph',
     {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}
    ]

In [40]:
def make_attribute_value (data_value):
    """
    Local function used to create attribute_value object linked the the raw value.
    """
    av = nmdc.AttributeValue()
    av.has_raw_value = data_value

    return av

def map_slot_to_entity_2(slot_map, record):
#   {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}
    slot_name = list(slot_map.keys())[0]
    slot_value = list(slot_map.values())[0]
    param_dict = slot_value[0]
    nmdc_class = slot_value[1]
    id_field = param_dict['id']

    if slot_name in ['part_of', 'has_input', 'has_output']:
        id_values = record[id_field].split(',')
        referenced_obj = [nmdc_class(**{'id':id_val}) for id_val in id_values]
    else:
        referenced_obj = nmdc_class(**{'id':record[id_field]})
    
    return referenced_obj

def map_slot_to_entity_1(slot_map, record):
#   {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}
    slot_name = list(slot_map.keys())[0]
    slot_value = list(slot_map.values())[0]
    param_dict = slot_value[0]
    nmdc_class = slot_value[1]
    id_field = param_dict['id']

    if slot_name in ['part_of', 'has_input', 'has_output']:
        id_values = record[id_field].split(',')
        referenced_entity = [nmdc_class(**{'id':id_val}) for id_val in id_values]
    else:
        referenced_entity = nmdc_class(**{'id':record[id_field]})
    
    return referenced_entity

def make_param_args (param_map, record):
    ## for every mapping between a key and data field create a dict
    ## of the parameters needed to instantiate the class
    result_dict = {}
    for param_key, param_value in param_map.items():
        ## check if param value is a dict; dicts are handled differently
        if type({}) != type(param_value):
            ## if the value is a tuple, index 0 is param dict, index 1 is the class
            ## e.g., lat_lon': ('lat_lon', nmdc.GeolocationValue)
            if type(()) == type(param_value):
                params = { key: record[value] for key, value in param_value[0].items()}
                result_dict[param_key] = param_value[1](**params)
            else:
                result_dict[param_key] = record[param_value]

    return result_dict

In [41]:
# for record_key, record_value in tmap.items():
#     print(record_value)
records_dict = dictionary
# records_dict = single
# print(len(records_dict))

obj_list = []
for record in records_dict:
    constructor_args = make_param_args(constructor_map, record)
    obj = nmdc.Biosample(**constructor_args)
    obj_list.append(json.loads(jsonasobj.as_json(obj)))
#     obj_list.append(obj)
#     obj_list.append(jsonasobj.as_json(obj))
# json.dumps(obj_list[0], indent=4)

In [42]:
json_list = []  # list to hold json
    
## iterate over dict list
for d in obj_list:
    json_list.append(json.dumps(d))

print(json.dumps(json.loads(json_list[0]), indent=4))
# print(json.dumps(json_list[0], indent=4))

{
    "id": "Gb0108335",
    "name": "record 1",
    "lat_lon": {
        "has_raw_value": "68.3534 19.0472",
        "latitude": "68.3534",
        "longitude": "19.0472"
    },
    "env_broad_scale": {
        "has_raw_value": "ENVO_00000446"
    },
    "env_local_scale": {
        "has_raw_value": "ENVO_00000489"
    },
    "env_medium": {
        "has_raw_value": "ENVO_00000134"
    }
}


In [43]:
# records_dict = dictionary
records_dict = single
nmdc_class = nmdc.NamedThing

obj_list = []
for af in attributes:
    if type({}) == type(af): af = list(af.keys())[0]  ##### add this !!
    if not hasattr(nmdc_class, af): setattr(nmdc_class, af, None)

# print(nmdc_class)   

for record in records_dict:
    obj = nmdc_class(id='foobar')
    
    for key, item in record.items():
        if (not pds.isnull(item)) and ('' != item) and (not (item is None)) and (key in attributes):
            av = make_attribute_value(item)
            setattr(obj, key, av)
            
    for af in attributes:
        if type({}) == type(af):
            slot_name = list(af.keys())[0]
            
            if 'part_of' == slot_name:
                obj.part_of = map_slot_to_entity_1(af, record)
            
    obj_list.append(json.loads(jsonasobj.as_json(obj)))

# print(json.dumps(obj_list[0], indent=4))

In [44]:
study_obj = nmdc.Study(**{'id': 'Gs1001'})
# study_obj.type = "foo bar"
setattr(study_obj, 'type', 'foo bar')
study_obj

Study(id='Gs1001', name=None, description=None, alternate_identifiers=[], submitted_to_insdc=None, investigation_type=None, project_name=None, experimental_factor=None)

In [45]:
json.loads(jsonasobj.as_json(study_obj))

{'id': 'Gs1001', 'type': 'foo bar'}