# Extracting information from JSON
J. Michael Dean, MD
August 6, 2024

The purpose here is to ingest a JSON file from the NIH CDE Repository, identify the elements
that I want to put into a CSV file, and then output the CSV file.  I will then use that file
to feed my RAG.

The CSV file provided by NIH is very minimal and does not contain enough contextual, semantic
information to give me good retrieval.  It also contains stuff I don't care about, such as
the steward, etc.  So I am creating this more meaningful CSV file from the JSON download.

In [2]:
# Install necessary modules
# !pip install -qU langchain langchain_community
# !pip install -qU pypdf
# !pip install yt_dlp
# !pip install pydub
# !pip install -qU chromadb
# !pip install -qU langchain_openai
# !pip install -qU sentence_transformers

In [5]:
# Load dependencies
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_community.document_loaders import JSONLoader
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.text_splitter import RecursiveJsonSplitter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.vectorstores import Chroma
from openai import OpenAI
from IPython.display import Markdown, display

In [6]:
import os
import shutil

# Set once, up front, for the whole session.
# NOTE(review): presumably read by the Hugging Face tokenizers library behind
# the embeddings imported in this notebook, to turn off its parallelism - confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [11]:
# Load the JSON file
# I am just using Python code - I am not using the LangChain JSONLoader
import json
from pathlib import Path
from pprint import pprint

file_path = 'SearchExport.json'
# Decode explicitly as UTF-8: without an encoding argument read_text() uses the
# platform's locale codec, which can fail or garble non-ASCII text in the export.
data = json.loads(Path(file_path).read_text(encoding='utf-8'))

In [134]:
# Print a sample of data elements from the JSON file (entries 111 and 112)
# pprint(data[111:113])

[{'__v': 0,
  'archived': False,
  'attachments': [],
  'cdeTinyIds': [],
  'changeNote': 'changed status to Qualified',
  'classification': [{'elements': [{'elements': [{'elements': [],
                                                  'name': 'Organ Support'}],
                                    'name': 'CONNECTS'}],
                      'stewardOrg': {'name': 'NHLBI'}}],
  'classificationSize': 1,
  'copyrightStatus': 'Public domain, free to use',
  'created': '2022-12-09T15:09:37.100Z',
  'createdBy': {'username': 'NIH CDE Repository Team'},
  'dataElementConcept': {'concepts': [{'name': 'Mechanical ventilation',
                                       'origin': 'UMLS',
                                       'originId': 'C0199470'},
                                      {'name': 'Non-Invasive Mechanical '
                                               'Ventilation',
                                       'origin': 'UMLS',
                                       'originId': 'C234974

In [147]:
# Let's create a CSV file from my JSON elements
import csv

fieldnames = ['Data Element Name', 'Question Text', 'Definition', 'Value Type', 'Permissible Values',
              'Related Concepts', 'Registration Status', 'Identifier', 'Reference URL']


def _as_text(value, default=''):
    """Return *value* as a string, substituting *default* when it is None."""
    return default if value is None else str(value)


def _entry_to_row(entry):
    """Flatten one data-element record from the JSON export into a CSV row dict.

    Every lookup tolerates a missing or null field, so one incomplete record
    produces blank cells instead of aborting the whole export.
    """
    value_domain = entry.get('valueDomain') or {}

    # The first designation is used as the element name; the second, when
    # present, is used as the question text (otherwise the name is reused).
    designations = [_as_text(d.get('designation')) for d in entry.get('designations') or []]
    name = designations[0] if designations else ''
    question = designations[1] if len(designations) > 1 else name

    # Only the first identifier is reported, formatted as "source: id".
    identifiers = entry.get('ids') or []
    if identifiers:
        first_id = identifiers[0]
        identifier_text = _as_text(first_id.get('source')) + ': ' + _as_text(first_id.get('id'))
    else:
        identifier_text = ''

    # Retrieve the definitions and permissible values; 'N/A' marks an item
    # that exists in the list but carries no text.
    definition_text = '; '.join(
        _as_text(d.get('definition'), 'N/A') for d in entry.get('definitions') or [])
    permissible_values_text = '; '.join(
        _as_text(v.get('permissibleValue'), 'N/A')
        for v in value_domain.get('permissibleValues') or [])

    # Retrieve related concept words to enhance semantic content of record
    concepts = (entry.get('dataElementConcept') or {}).get('concepts') or []
    concepts_text = ' '.join(_as_text(c.get('name')) for c in concepts)

    # Without a tinyId there is nothing to link to, so leave the URL blank.
    tiny_id = entry.get('tinyId')
    reference_text = ('https://cde.nlm.nih.gov/deView?tinyId=' + str(tiny_id)) if tiny_id else ''

    return {
        'Data Element Name': name,
        'Question Text': question,
        'Definition': definition_text,
        'Value Type': value_domain.get('datatype'),
        'Permissible Values': permissible_values_text,
        'Related Concepts': concepts_text,
        'Registration Status': (entry.get('registrationState') or {}).get('registrationStatus'),
        'Identifier': identifier_text,
        'Reference URL': reference_text,
    }


# Prepare the CSV file. newline='' is what the csv module expects of its file
# object; the explicit UTF-8 encoding keeps non-ASCII text from failing on
# platforms whose default codec cannot represent it.
with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    # One row per entry in the JSON list
    writer.writerows(_entry_to_row(entry) for entry in data)

print("CSV file has been created successfully.")

CSV file has been created successfully.
