In [1]:
import json
import pandas as pds
import jsonasobj
import lib.nmdc as nmdc
import lib.data_operations as dop

### Read biosample data into dataframe
Steps:
- define the name of the file containing the data
- define subset of columns to use in the data frame
- when testing, define nrows to limit number of rows

In [2]:
# Relative path of the biosample data file that is passed to dop.make_dataframe.
file_name = "data/export.sql/BIOSAMPLE_DATA_TABLE.dsv"

In [3]:
# Columns requested from the biosample data file.
# The group labels below are only reading aids; the list order is unchanged.
subset_cols = [
    # identifiers and record dates
    'biosample_id', 'biosample_name', 'description',
    'add_date', 'mod_date',
    # ecosystem fields
    'ecosystem_path_id', 'ecosystem', 'ecosystem_category',
    'ecosystem_type', 'ecosystem_subtype', 'specific_ecosystem',
    # habitat, location and taxonomy fields
    'habitat', 'location', 'community', 'ncbi_taxonomy_name',
    'geographic_location', 'latitude', 'longitude',
    # sample collection fields
    'sample_collection_site', 'identifier',
    'sample_collection_year', 'sample_collection_month',
    'sample_collection_day', 'sample_collection_hour',
    'sample_collection_minute',
    # host, depth and altitude fields
    'host_name', 'depth', 'subsurface_depth', 'altitude',
    # temperature fields
    'temperature_range', 'proport_woa_temperature', 'biogas_temperature',
    'growth_temperature', 'soil_annual_season_temp', 'water_samp_store_temp',
    # remaining measurement fields
    'biogas_retention_time', 'salinity', 'pressure', 'ph',
    'chlorophyll_concentration', 'nitrate_concentration',
    'oxygen_concentration', 'salinity_concentration',
]


In [4]:
# Row limit used while testing; set to None for all records.
nrows = 5

# Output path referenced by the (currently commented-out) save call in the last cell.
save_file_name = "output/schema-test.json"

# Build the dataframe from file_name, passing the column subset and the row limit.
df = dop.make_dataframe(
    file_name,
    subset_cols=subset_cols,
    nrows=nrows,
)

In [5]:
# Peek at the first rows of the loaded data.
df.head()

Unnamed: 0,biosample_id,biosample_name,description,add_date,mod_date,ecosystem_path_id,ecosystem,ecosystem_category,ecosystem_type,ecosystem_subtype,...,soil_annual_season_temp,water_samp_store_temp,biogas_retention_time,salinity,pressure,ph,chlorophyll_concentration,nitrate_concentration,oxygen_concentration,salinity_concentration
0,186408,Enriched soil aggregate microbial communities ...,Enriched soil aggregate microbial communities ...,23-FEB-18 01.10.55.869000000 PM,17-JUN-19 10.32.13.358000000 PM,4212,Environmental,Terrestrial,Unclassified,,...,,,,,,,,,,
1,186444,Enriched soil aggregate microbial communities ...,Enriched soil aggregate microbial communities ...,23-FEB-18 01.11.30.021000000 PM,30-SEP-19 09.28.26.880000000 PM,4212,Environmental,Terrestrial,Soil,Unclassified,...,,,,,,,,,,
2,156554,Enriched cells from forest soil in Barre Woods...,Enriched cells from forest soil in Barre Woods...,28-JUL-17 03.31.05.630000000 PM,18-JUN-19 08.18.47.034000000 AM,4205,Environmental,Terrestrial,Soil,Unclassified,...,,,,,,,,,,
3,156649,Enriched cells from forest soil in Barre Woods...,Enriched cells from forest soil in Barre Woods...,28-JUL-17 03.56.33.303000000 PM,18-JUN-19 04.42.05.420000000 PM,4205,Environmental,Terrestrial,Soil,Unclassified,...,,,,,,,,,,
4,156728,Enriched cells from forest soil in Barre Woods...,Enriched cells from forest soil in Barre Woods...,28-JUL-17 04.22.48.318000000 PM,18-JUN-19 05.54.01.541000000 PM,4205,Environmental,Terrestrial,Soil,Unclassified,...,,,,,,,,,,


### Convert dataframe to dictionary

In [6]:
# Convert the dataframe into a list with one dict per row, keyed by column name.
# NOTE(review): missing values stay as float NaN in these dicts (the printed
# sample record shows `NaN` for ecosystem_subtype); json.dumps emits that as the
# bare token NaN, which is not strict JSON -- confirm downstream code handles it.
dictdf = df.to_dict(orient="records")

In [7]:
# Pretty-print only the first record so its structure can be inspected.
# Slicing to [:1] makes the loop body run at most once (and not at all when
# dictdf is empty).
for record in dictdf[:1]:
    print(json.dumps(record, indent=4))

{
    "biosample_id": 186408,
    "biosample_name": "Enriched soil aggregate microbial communities from Iowa State University, Ames, United States - MC6-MC0897-MT",
    "description": "Enriched soil aggregate microbial communities from Iowa State University, Ames, United States",
    "add_date": "23-FEB-18 01.10.55.869000000 PM",
    "mod_date": "17-JUN-19 10.32.13.358000000 PM",
    "ecosystem_path_id": 4212,
    "ecosystem": "Environmental",
    "ecosystem_category": "Terrestrial",
    "ecosystem_type": "Unclassified",
    "ecosystem_subtype": NaN,
    "specific_ecosystem": NaN,
    "habitat": "enriched soil aggregate",
    "location": "Iowa State University, Ames, United States",
    "community": "microbial communities",
    "ncbi_taxonomy_name": "soil metagenome",
    "geographic_location": "USA: Iowa",
    "latitude": 42.0,
    "longitude": -93.0,
    "sample_collection_site": "soil aggregates",
    "identifier": "MC6-MC0897-MT",
    "sample_collection_year": 2016,
    "sample_col

### Iterate over dataframe dictionary and build a list of json strings

In [8]:
# Build the list of JSON strings from the record dicts using nmdc.Biosample.
# NOTE(review): the two column names presumably select the fields used for each
# object's "id" and "name" (the sample output's id/name match biosample_id and
# biosample_name) -- confirm against lib.data_operations.
json_list = dop.make_json_string_list(
    dictdf,
    nmdc.Biosample,
    'biosample_id',
    'biosample_name',
)

In [9]:
# Peek at the second entry (index 1); assumes json_list holds at least two items.
print(json_list[1])

{
   "id": 186444,
   "name": "Enriched soil aggregate microbial communities from Iowa State University, Ames, United States - MC6-MC0941-MT",
   "annotations": [
      {
         "has_characteristic": {
            "name": "biosample_id"
         },
         "has_raw_value": 186444
      },
      {
         "has_characteristic": {
            "name": "biosample_name"
         },
         "has_raw_value": "Enriched soil aggregate microbial communities from Iowa State University, Ames, United States - MC6-MC0941-MT"
      },
      {
         "has_characteristic": {
            "name": "description"
         },
         "has_raw_value": "Enriched soil aggregate microbial communities from Iowa State University, Ames, United States"
      },
      {
         "has_characteristic": {
            "name": "add_date"
         },
         "has_raw_value": "23-FEB-18 01.11.30.021000000 PM"
      },
      {
         "has_characteristic": {
            "name": "mod_date"
         },
         "has_r

### Save json list to file

In [10]:
# Saving is disabled: uncomment the call below to pass save_file_name and
# json_list to dop.save_json_string_list (presumably writes the list to that
# path -- confirm against lib.data_operations).
# dop.save_json_string_list(save_file_name, json_list)