In [1]:
import json
import pandas as pds
import jsonasobj
import lib.nmdc as nmdc
import lib.data_operations as dop

### Read biosample data into dataframe
Steps:
- define file name containing data
- define subset of columns to use in the data frame
- when testing, define nrows to limit number of rows

In [2]:
# Relative path of the biosample data table that is read into the dataframe.
file_name = (
    "data/GOLD_DATA_DUMP_PUBLIC_ONLY/"
    "BIOSAMPLE_DATA_TABLE.dsv"
)

In [3]:
# Columns of the biosample table to keep in the dataframe.
# Order is preserved exactly; the group comments are only a reading aid.
subset_cols = [
    # identity and bookkeeping
    "biosample_id",
    "biosample_name",
    "description",
    "add_date",
    "mod_date",
    # ecosystem classification
    "ecosystem_path_id",
    "ecosystem",
    "ecosystem_category",
    "ecosystem_type",
    "ecosystem_subtype",
    "specific_ecosystem",
    # habitat / community / location
    "habitat",
    "location",
    "community",
    "ncbi_taxonomy_name",
    "geographic_location",
    "latitude",
    "longitude",
    # sample collection
    "sample_collection_site",
    "identifier",
    "sample_collection_year",
    "sample_collection_month",
    "sample_collection_day",
    "sample_collection_hour",
    "sample_collection_minute",
    "host_name",
    # depth / altitude
    "depth",
    "subsurface_depth",
    "altitude",
    # temperature-related fields
    "temperature_range",
    "temperature_exact",
    "proport_woa_temperature",
    "biogas_temperature",
    "growth_temperature",
    "soil_annual_season_temp",
    "water_samp_store_temp",
    # remaining physical / chemical measurements
    "biogas_retention_time",
    "salinity",
    "pressure",
    "ph",
    "chlorophyll_concentration",
    "nitrate_concentration",
    "oxygen_concentration",
    "salinity_concentration",
]


In [4]:
# Where the generated JSON is written at the end of the notebook.
save_file_name = "output/schema-test.json"

# Row cap used while testing.
nrows = 5  # set to None for all records

# Build the dataframe through the project helper, passing the data file,
# the column subset, and the row limit defined above.
df = dop.make_dataframe(
    file_name,
    subset_cols=subset_cols,
    nrows=nrows,
)

In [5]:
# df.head() # peek at data

### Convert dataframe to dictionary

In [6]:
# Convert the dataframe to a list of per-row dicts (orient="records"),
# assuming df is a pandas DataFrame as returned by dop.make_dataframe -- TODO confirm.
dictdf = df.to_dict(orient="records")

In [7]:
## print out a single record for viewing
for record in dictdf:
    print(json.dumps(record, indent=4)); break

{
    "biosample_id": 173480,
    "biosample_name": "Wine grape associated microbial communities from Sonoma, California, USA - ED6",
    "description": "Wine grape associated microbial communities from Sonoma, California, USA; sample collected from Cordon section; Wood disease symptoms: Wedge canker, dieback, browning and streaking, Foliar symptoms: Eutypa dieback",
    "add_date": "09-DEC-17 11.15.52.119000000 PM",
    "mod_date": "05-JAN-18 12.58.38.000000000 PM",
    "ecosystem_path_id": 4524,
    "ecosystem": "Host-associated",
    "ecosystem_category": "Plants",
    "ecosystem_type": "Phyllosphere",
    "ecosystem_subtype": "Caulosphere",
    "specific_ecosystem": NaN,
    "habitat": "Wine grape associated",
    "location": "from Sonoma, California, USA",
    "community": "microbial communities",
    "ncbi_taxonomy_name": "plant metagenome",
    "geographic_location": "USA: Sonoma, California",
    "latitude": 38.2222222,
    "longitude": 122.35972220000001,
    "sample_collectio

### Iterate over dataframe dictionary and build a list of json strings

In [8]:
# Build one JSON string per record using the nmdc.Biosample class.
# NOTE(review): judging by the printed output, the two column names appear to
# supply each object's "id" and "name" -- confirm in lib.data_operations.
json_list = dop.make_json_string_list(
    dictdf,
    nmdc.Biosample,
    "biosample_id",
    "biosample_name",
)

In [11]:
# Peek at the second entry (index 1) of the generated list.
print(json_list[1])

{
   "id": 173774,
   "name": "Fermented milk associated microbial communities from Xilinhot, China - NM19-1",
   "annotations": [
      {
         "has_raw_value": 173774,
         "has_characteristic": {
            "name": "biosample_id"
         }
      },
      {
         "has_raw_value": "Fermented milk associated microbial communities from Xilinhot, China - NM19-1",
         "has_characteristic": {
            "name": "biosample_name"
         }
      },
      {
         "has_raw_value": "Fermented milk (koumiss) associated microbial communities from Xilinhot, China",
         "has_characteristic": {
            "name": "description"
         }
      },
      {
         "has_raw_value": "09-DEC-17 11.36.30.369000000 PM",
         "has_characteristic": {
            "name": "add_date"
         }
      },
      {
         "has_raw_value": "05-JAN-18 12.54.38.000000000 PM",
         "has_characteristic": {
            "name": "mod_date"
         }
      },
      {
         "has_raw

### Save json list to file

In [10]:
# Save the list of JSON strings to save_file_name via the project helper.
dop.save_json_string_list(save_file_name, json_list)