In [1]:
import os, sys
sys.path.append(os.path.abspath('../src/bin/lib/')) # add path to lib

In [2]:
import yaml
import json
from yaml import CLoader as Loader, CDumper as Dumper
import data_operations as dop
from dotted_dict import DottedDict
from collections import namedtuple
import nmdc
import data_operations as dop
from pandasql import sqldf
from pprint import pprint
import pandas as pds
import jsonasobj

from pandasql import sqldf
def pysqldf(q):
    return sqldf(q, globals())

## Load yaml spec for data sources

In [3]:
spec_file = "../src/bin/lib/nmdc_data_source.yaml"
with open(spec_file, 'r') as input_file:
    spec = DottedDict(yaml.load(input_file, Loader=Loader))

## Create merged dataframe of all data sources

In [4]:
## build merged dataframe from data sources specified in the spec file
# mdf = dop.make_dataframe_from_spec_file (spec_file)
# mdf.to_csv('../src/data/nmdc_merged_data.tsv', sep='\t', index=False) # save mergd data

## Load data from merged tsv (this can only be done after merged data tsv has been created)

In [5]:
# read data from saved file
mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str)
# mdf = pds.read_csv('../src/data/nmdc_merged_data.tsv.zip', sep='\t', dtype=str, nrows=100)

In [6]:
mdf.nmdc_data_source.unique() ## list of the data sources in merged

array(['study_table', 'contact_table', 'proposals_table', 'project_table',
       'project_biosample_table', 'biosample_table', 'ficus_faa_table',
       'ficus_fna_table', 'ficus_fastq_table', 'ficus_jgi_emsl',
       'ficus_emsl'], dtype=object)

## Extract tables from merged dataset

In [7]:
study_table = dop.extract_table(mdf, 'study_table')
contact_table = dop.extract_table(mdf, 'contact_table')
proposals_table = dop.extract_table(mdf, 'proposals_table')
project_table = dop.extract_table(mdf, 'project_table')
jgi_emsl_table = dop.extract_table(mdf, 'ficus_jgi_emsl')
emsl_table = dop.extract_table(mdf, 'ficus_emsl')
faa_table = dop.extract_table(mdf, 'ficus_faa_table')
fna_table = dop.extract_table(mdf, 'ficus_fna_table')
fastq_table = dop.extract_table(mdf, 'ficus_fasq_table')
project_biosample_table = dop.extract_table(mdf, 'project_biosample_table')
biosample_table = dop.extract_table(mdf, 'biosample_table')
# biosample_table.columns

## Add prefixes to id fields

In [8]:
# study_table.gold_id = "gold:" + study_table.gold_id
# proposals_table.gold_study = "gold:" + proposals_table.gold_study
# project_table.gold_id = "gold:" + project_table.gold_id
# biosample_table.gold_id = "gold" + biosample_table.gold_id
# jgi_emsl_table.gold_study_id = "gold:" + jgi_emsl_table.gold_study_id
# emsl_table.dataset_id = "emsl:" + emsl_table.dataset_id

## Test building study json

In [9]:
study = dop.make_study_dataframe(study_table, contact_table, proposals_table)
study_dictdf = study.to_dict(orient="records") # transorm dataframe to dictionary
# study.gold_id

In [10]:
## specify attributes
attributes = \
    ['gold_study_name', 'principal_investigator_name', 'add_date', 'mod_date', 'doi',
      'ecosystem', 'ecosystem_category', 'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem']

constructor = \
    {
        'id': 'gold_id',
        'name': 'study_name',
        'description': 'description'
    }

study_json_list = dop.make_json_string_list\
    (study_dictdf, nmdc.Study, constructor_map=constructor, attribute_fields=attributes)

In [11]:
# print(json.dumps(json.loads(study_json_list[0]), indent=4)) ## peek at data
# print(nmdc.Study.class_class_curie)
# len(study)

## Save study output

In [12]:
# dop.save_json_string_list("output/test-pipeline/gold_study.json", study_json_list) # save json string list to file

## Test building EMSL omics processing projects

In [13]:
emsl = dop.make_emsl_dataframe(emsl_table, jgi_emsl_table, study_table)
emsl_dictdf = emsl.to_dict(orient="records") # transorm dataframe to dictionary

In [14]:
# emsl.gold_study_id

In [15]:
# len(emsl)
# emsl.head()
# emsl.columns
# len(emsl_table)

In [16]:
attributes = \
    [
      #'file_size_bytes',
      {'part_of': ({'id': 'gold_study_id'}, nmdc.Study)},
      {'has_output': ({'id': 'data_object_id'}, nmdc.DataObject)}
    ]

constructor = \
    {
        'id': 'dataset_id',
        'name': 'dataset_name',
        'description': 'dataset_type_description'
    }

emsl_project_json_list = dop.make_json_string_list\
    (emsl_dictdf, nmdc.OmicsProcessing, constructor_map=constructor, attribute_fields=attributes)

In [17]:
print(json.dumps(json.loads(emsl_project_json_list[0]), indent=4)) ## peek at data

{
    "id": "emsl:359123",
    "name": "GCMS_Blank_01_29Jan14",
    "description": "Full scan GC-MS (but not GC QExactive, which is EI-HMS)",
    "part_of": [
        {
            "id": "gold:Gs0110132",
            "type": "nmdc:Study"
        }
    ],
    "type": "nmdc:OmicsProcessing",
    "has_ouput": [
        {
            "id": "emsl:output_359123",
            "type": "nmdc:DataObject"
        }
    ]
}


## Save EMSL omics processing projects

In [18]:
# dop.save_json_string_list("output/test-pipeline/emsl_omics_processing.json", emsl_project_json_list) # save json string list to file

## Test build EMSL data objects

In [19]:
attributes = \
    [
      'file_size_bytes'
    ]

constructor = \
    {
        'id': 'data_object_id',
        'name': 'data_object_name',
        'description': 'dataset_type_description'
    }

emsl_data_object_json_list = dop.make_json_string_list\
    (emsl_dictdf, nmdc.DataObject, constructor_map=constructor, attribute_fields=attributes)

In [20]:
print(json.dumps(json.loads(emsl_data_object_json_list[0]), indent=4)) ## peek at data

{
    "id": "emsl:output_359123",
    "name": "output: GCMS_Blank_01_29Jan14",
    "description": "Full scan GC-MS (but not GC QExactive, which is EI-HMS)",
    "type": "nmdc:DataObject",
    "file_size_bytes": {
        "has_raw_value": "1547264"
    }
}


## Save EMSL data objects

In [21]:
# dop.save_json_string_list("output/test-pipeline/emsl_data_objects.json", emsl_data_object_json_list) # save json string list to file

## Test building data obects (faa, fna, fastq)

In [22]:
# fasq_table.head() # peek at data

In [23]:
data_objects = dop.make_data_objects_dataframe(faa_table, fna_table, fastq_table, project_table)
data_objects_dictdf = data_objects.to_dict(orient="records") # transorm dataframe to dictionary

In [24]:
len(data_objects)

2046

In [25]:
attributes = \
    [
      'file_size_bytes'
    ]

constructor = \
    {
        'id': 'file_id',
        'name': 'file_name',
        'description': 'file_type_description'
    }

data_objects_json_list = dop.make_json_string_list\
    (data_objects_dictdf, nmdc.DataObject, constructor_map=constructor, attribute_fields=attributes)

In [26]:
print(json.dumps(json.loads(data_objects_json_list[0]), indent=4)) ## peek at data

{
    "id": "emsl:53d880b90d87856ba82affcd",
    "name": "43182.assembled.faa",
    "description": "FASTA amino acid sequence",
    "type": "nmdc:DataObject",
    "file_size_bytes": {
        "has_raw_value": "1337578"
    }
}


## Save faa, fna, fastq data objects

In [27]:
# dop.save_json_string_list("output/test-pipeline/faa_fna_fastq_data_objects.json", data_objects_json_list) # save json string list to file

## Test building GOLD project json

In [28]:
# data_objects.head()

In [29]:
project = dop.make_project_dataframe(project_table, study_table, contact_table, data_objects)
# project[pds.isnull(project.output_file_ids)]
# project = project[project.nmdc_record_id == "115128"] # test if output_file_ids is null
# project.output_file_ids.unique()
# project.output_file_ids

In [30]:
project_dictdf = project.to_dict(orient="records") # transorm dataframe to dictionary
# project.columns

In [31]:
## specify characteristics
attributes = \
    [
      {'part_of': ({'id': 'study_gold_id'}, nmdc.Study)},
      {'has_output': ({'id': 'output_file_ids'}, nmdc.DataObject)},
      'add_date', 
      'mod_date', 
      'completion_date', 
      'ncbi_project_name', 
      'omics_type', 
      'principal_investigator_name',
      'processing_institution'
    ]


constructor = \
    {
        'id': 'gold_id',
        'name': 'project_name',
        'description': 'description'
    }

project_json_list = dop.make_json_string_list\
    (project_dictdf, nmdc.OmicsProcessing, constructor_map=constructor, attribute_fields=attributes)

In [32]:
print(json.dumps(json.loads(project_json_list[0]), indent=4)) ## peek at data

{
    "id": "gold:Gp0108335",
    "name": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D",
    "description": NaN,
    "part_of": [
        {
            "id": "gold:Gs0112340",
            "type": "nmdc:Study"
        }
    ],
    "type": "nmdc:OmicsProcessing",
    "add_date": {
        "has_raw_value": "30-OCT-14 12.00.00.000000000 AM"
    },
    "mod_date": {
        "has_raw_value": "22-MAY-20 06.13.12.927000000 PM"
    },
    "ncbi_project_name": {
        "has_raw_value": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D"
    },
    "omics_type": {
        "has_raw_value": "Metagenome"
    },
    "principal_investigator_name": {
        "has_raw_value": "Virginia Rich"
    },
    "has_ouput": [
        {
            "id": "emsl:5af44fd364d0b33747747ddb",
            "type": "nmdc:DataObject"
        },
        {
            "id": "emsl:5af44fd264d0b3374774

## Save output

In [33]:
# dop.save_json_string_list("output/test-pipeline/gold_omics_processing.json", project_json_list) # save json string list to file

## Test building biosample json

In [34]:
biosample = dop.make_biosample_dataframe(biosample_table, project_biosample_table, project_table)

In [35]:
biosample_dictdf = biosample.to_dict(orient="records") # transorm dataframe to dictionary
biosample_dictdf[0] ## peek at dict data

{'nmdc_record_id': '108335',
 'add_date': '30-OCT-14 12.00.00.000000000 AM',
 'altitude': nan,
 'biogas_retention_time': nan,
 'biogas_temperature': nan,
 'biosample_id': '108335',
 'biosample_name': 'Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D',
 'chlorophyll_concentration': nan,
 'community': 'microbial communities',
 'depth': '0.0',
 'description': nan,
 'ecosystem': 'Environmental',
 'ecosystem_category': 'Terrestrial',
 'ecosystem_path_id': '4234',
 'ecosystem_subtype': 'Wetlands',
 'ecosystem_type': 'Soil',
 'env_broad_scale': 'ENVO_00000446',
 'env_local_scale': 'ENVO_00000489',
 'env_medium': 'ENVO_00000134',
 'geographic_location': 'Sweden: Kiruna',
 'gold_id': 'Gb0108335',
 'growth_temperature': nan,
 'habitat': 'Thawing permafrost',
 'host_name': nan,
 'identifier': 'studying carbon transformations',
 'latitude': '68.3534',
 'location': 'from the Arctic',
 'longitude': '19.0472',
 'mod_date': '15-MAY-20 10.04.

In [36]:
## specify attributes
attributes = \
    [
     'lat_lon',
     'add_date',
     'mod_date',
     'collection_date',
     'ecosystem',
     'ecosystem_dcategory',
     'ecosystem_type',
     'ecosystem_subtype',
     'specific_ecosystem',
     'habitat',
     'location',
     'community',
     'ncbi_taxonomy_name',
     'geographic_location',
     'latitude',
     'longitude',
     'sample_collection_site',
     'identifier',
     'host_name',
     'depth',
     'subsurface_depth',
     'altitude',
     'proport_woa_temperature',
     'biogas_temperature',
     'growth_temperature',
     'water_samp_store_temp',
     'biogas_retention_time',
     'salinity',
     'pressure',
     'ph',
     'chlorophyll_concentration',
     'nitrate_concentration',
     'oxygen_concentration',
     'salinity_concentration',
     {'part_of': ({'id': 'project_gold_ids'}, nmdc.OmicsProcessing)}
    ]

# removed in version 5: 'temperature_range', 'soil_annual_season_temp'

## create dict of constructor args
constructor = \
    {
        'id': 'gold_id',
        'name': 'biosample_name',
        'description': 'description',
        'env_broad_scale': ({'has_raw_value':'env_broad_scale'}, nmdc.ControlledTermValue),
        'env_local_scale': ({'has_raw_value':'env_local_scale'}, nmdc.ControlledTermValue),
        'env_medium': ({'has_raw_value': 'env_medium'}, nmdc.ControlledTermValue),
        'lat_lon': ({'latitude': 'latitude', 'longitude': 'longitude', 'has_raw_value': 'lat_lon'}, nmdc.GeolocationValue),
    }

In [37]:
len(biosample)
# biosample.columns
# biosample.lat_lon
# biosample.collection_date.unique()

964

In [39]:
## create list of json string objects
biosample_json_list = dop.make_json_string_list\
    (biosample_dictdf, nmdc.Biosample, constructor_map=constructor, attribute_fields=attributes)

In [40]:
print(json.dumps(json.loads(biosample_json_list[0]), indent=4)) ## peek at data

{
    "id": "Gb0108335",
    "name": "Thawing permafrost microbial communities from the Arctic, studying carbon transformations - Permafrost 712P3D",
    "description": NaN,
    "lat_lon": {
        "has_raw_value": "68.3534 19.0472",
        "latitude": "68.3534",
        "longitude": "19.0472",
        "type": "nmdc:GeolocationValue"
    },
    "env_broad_scale": {
        "has_raw_value": "ENVO_00000446",
        "type": "nmdc:ControlledTermValue"
    },
    "env_local_scale": {
        "has_raw_value": "ENVO_00000489",
        "type": "nmdc:ControlledTermValue"
    },
    "env_medium": {
        "has_raw_value": "ENVO_00000134",
        "type": "nmdc:ControlledTermValue"
    },
    "collection_date": {
        "has_raw_value": "2012-07-20"
    },
    "ecosystem": {
        "has_raw_value": "Environmental"
    },
    "ecosystem_type": {
        "has_raw_value": "Soil"
    },
    "ecosystem_subtype": {
        "has_raw_value": "Wetlands"
    },
    "specific_ecosystem": {
        "ha

## Save output

In [44]:
mapping_df = dop.make_dataframe("../src/data/GOLD-to-mixs-map.tsv")
# mapping_df.head() # peek at data
temp_list = dop.make_gold_to_mixs_list(attributes, mapping_df, 'biosample')
temp_list

['add_date',
 'mod_date',
 'collection_date',
 'ecosystem',
 'ecosystem_dcategory',
 'ecosystem_type',
 'ecosystem_subtype',
 'specific_ecosystem',
 'habitat',
 'location',
 'community',
 'ncbi_taxonomy_name',
 'geo_loc_name',
 'latitude',
 'longitude',
 'sample_collection_site',
 'identifier',
 'host_name',
 'depth',
 'subsurface_depth',
 'alt',
 'proport_woa_temperature',
 'biogas_temperature',
 'growth_temperature',
 'water_samp_store_temp',
 'biogas_retention_time',
 'salinity',
 'pressure',
 'ph',
 'chlorophyll',
 'nitrate',
 'diss_oxygen',
 'salinity']

In [43]:
# dop.save_json_string_list("output/test-pipeline//biosample.json", biosample_json_list) # save json string list to file