In [None]:
import ast
import json
import os
def read_and_parse_json(file_path):
    """Read a JSON Lines file and return the objects that parsed successfully.

    Every non-blank line is parsed as an independent JSON document. A line that
    is not valid JSON is reported on stdout and skipped; blank lines are ignored
    silently.

    Args:
        file_path: Path of the file to read.

    Returns:
        list: The parsed objects, in file order.
    """
    parsed_objects = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                # A blank line is a separator, not a malformed record.
                continue
            try:
                parsed_objects.append(json.loads(stripped))
            except json.JSONDecodeError:
                # Catch only parse failures so that KeyboardInterrupt and real
                # bugs are not swallowed.
                print(f"Error parsing record: {line}")
    return parsed_objects



In [None]:
# Load the input records; the file is parsed as one JSON object per line.
data = read_and_parse_json("data/liver_cancer_gpt4_geo_output.json")

In [None]:
def print_sample(s):
    """Render a sample mapping as text, one ``name : value`` line per attribute.

    For list-valued attributes only the first element is rendered; an empty
    list renders as an empty value. Values are converted with ``str()`` so that
    numbers and other non-string values do not raise.

    Args:
        s: Mapping of attribute name to value (or list of values).

    Returns:
        str: The concatenated lines, each terminated by a newline; the empty
        string for an empty mapping.
    """
    rendered = []
    for attr, val in s.items():
        if isinstance(val, list):
            # Keep only the first entry; guard against an empty list.
            val = val[0] if val else ""
        rendered.append(f"{attr} : {val}\n")
    return "".join(rendered)

In [None]:
# Split every loaded record into its original sample and its earlier LLM output.
samples = [record['BioSample'] for record in data]
llm_samples = [record['LLM'] for record in data]

In [None]:
import pandas as pd
import openai, numpy as np

# Read the key from the environment so that no credential is ever saved in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")
client = openai.OpenAI(api_key=api_key)

In [None]:
# OpenAI chat model used for the completion requests made in this notebook.
MODEL_NAME = "gpt-4"

In [None]:
# Reference text describing the BioSample attributes (appears to be tab-separated:
# name, description, value format). It is embedded verbatim in the prompt built by
# the first prompting loop, so the literal must not be reformatted.
biosample_desc = '''* mandatory attribute
Name	Description	Value format
* sample_name	Sample Name is a name that you choose for the sample. It can have any format, but we suggest that you make it concise, unique and consistent within your lab, and as informative as possible. Every Sample Name from a single Submitter must be unique.
sample_title	Title of the sample.
bioproject_accession	The accession number of the BioProject(s) to which the BioSample belongs. If the BioSample belongs to more than one BioProject, enter multiple bioproject_accession columns. A valid BioProject accession has prefix PRJN, PRJE or PRJD, e.g., PRJNA12345.
* organism	The most descriptive organism name for this sample (to the species, if possible). It is OK to submit an organism name that is not in our database. In the case of a new species, provide the desired organism name, and our taxonomists may assign a provisional taxID. In the case of unidentified species, choose the appropriate Genus and include 'sp.', e.g., "Escherichia sp.". When sequencing a genome from a non-metagenomic source, include a strain or isolate name too, e.g., "Pseudomonas sp. UK4". For more information about providing a valid organism, including new species, metagenomes (microbiomes) and metagenome-assembled genomes, see https://www.ncbi.nlm.nih.gov/biosample/docs/organism/.
Organism
* isolate	identification or description of the specific individual from which this sample was obtained
* age	age at the time of sampling; relevant scale depends on species and study, e.g. could be seconds for amoebae or centuries for trees	{float} {unit}
* biomaterial_provider	name and address of the lab or PI, or a culture collection identifier
* collection_date	the date on which the sample was collected; date/time ranges are supported by providing two dates from among the supported value formats, delimited by a forward-slash character; collection times are supported by adding "T", then the hour and minute after the date, and must be in Coordinated Universal Time (UTC), otherwise known as "Zulu Time" (Z); supported formats include "DD-Mmm-YYYY", "Mmm-YYYY", "YYYY" or ISO 8601 standard "YYYY-mm-dd", "YYYY-mm", "YYYY-mm-ddThh:mm:ss"; e.g., 30-Oct-1990, Oct-1990, 1990, 1990-10-30, 1990-10, 21-Oct-1952/15-Feb-1953, 2015-10-11T17:53:03Z; valid non-ISO dates will be automatically transformed to ISO format	{timestamp}
* geo_loc_name	Geographical origin of the sample; use the appropriate name from this list http://www.insdc.org/documents/country-qualifier-vocabulary. Use a colon to separate the country or ocean from more detailed information about the location, eg "Canada: Vancouver" or "Germany: halfway down Zugspitze, Alps"	{term}:{term}:{text}
* sex	physical sex of sampled organism	male | female | pooled male and female | neuter | hermaphrodite | intersex | not determined | missing | not applicable | not collected | not provided | restricted access
* tissue	Type of tissue the sample was taken from.
cell_line	Name of the cell line.
cell_subtype
cell_type	Type of cell of the sample or from which the sample was obtained.
culture_collection	Name of source institute and unique culture identifier. See the description for the proper format and list of allowed institutes, http://www.insdc.org/controlled-vocabulary-culturecollection-qualifier
dev_stage	Developmental stage at the time of sampling.
disease	list of diseases diagnosed; can include multiple diagnoses. the value of the field depends on host; for humans the terms should be chosen from DO (Disease Ontology), free text for non-human. For DO terms, please see http://gemina.svn.sourceforge.net/viewvc/gemina/trunk/Gemina/ontologies/gemina_symptom.obo?view=log	{term}
disease_stage	stage of disease at the time of sampling.
ethnicity	ethnicity of the subject	{integer|text}
health_state	Health or disease status of sample at time of collection	{term}
karyotype
phenotype	Phenotype of sampled organism. For Phenotypic quality Ontology (PATO) (v1.269) terms, please see http://bioportal.bioontology.org/visualize/44601	{term}
population	for human: ; for plants: filial generation, number of progeny, genetic structure
race
sample_type	Sample type, such as cell culture, mixed culture, tissue sample, whole organism, single cell, metagenomic assembly
treatment
description	Description of the sample.'''

In [None]:
def llm_response(model, text):
    """Send ``text`` as a single user message and return the model's reply.

    Uses the module-level ``client`` to issue the chat completion request.

    Args:
        model: Name of the chat model to query.
        text: Prompt, sent as the content of the only (user) message.

    Returns:
        The message content of the first choice in the response.
    """
    # Honour the caller-supplied model; the argument used to be ignored in
    # favour of the global MODEL_NAME.
    response = client.chat.completions.create(
        model=model,
        temperature=0.7,
        messages=[{"role": "user", "content": text}],
    )
    return response.choices[0].message.content

In [None]:
def str_dict(sample):
    """Parse ``name : value`` lines (the format print_sample() emits) into a dict.

    Each line is split on the first `` : `` only, so a value that itself
    contains the separator is kept whole. A non-empty line without the separator
    is treated as a continuation of the previous value (joined with a newline);
    if there is no previous key it is ignored. Empty lines are skipped.

    Args:
        sample: Text with one ``name : value`` pair per line.

    Returns:
        dict: Attribute name mapped to its string value; a later duplicate name
        overwrites an earlier one.
    """
    dict_sample = {}
    last_key = None
    for line in sample.split('\n'):
        if not line:
            continue
        key, sep, val = line.partition(' : ')
        if sep:
            dict_sample[key] = val
            last_key = key
        elif last_key is not None:
            # Multi-line value: keep the text instead of raising IndexError.
            dict_sample[last_key] += '\n' + line
    return dict_sample

In [None]:
# Start from an empty list for the prompting loop in the next cell to append to.
# This discards the 'LLM' values collected from `data` earlier.
llm_samples = []

In [None]:

# Ask the model for a corrected version of every GEO sample.
# llm_samples is kept index-aligned with samples: None marks a sample that was
# empty or whose reply could not be parsed, because the records cell further
# down pairs the lists by index.
for i, raw_sample in enumerate(samples):
  if not raw_sample:
    llm_samples.append(None)
    continue
  print(i)
  sample = str_dict(print_sample(raw_sample))
  # Each instruction goes on its own line; they were previously concatenated
  # without any separator between sentences.
  prompt = "Given the following sample from GEO':\n"
  prompt = prompt + str(sample) + '\n'
  prompt = prompt + "Report a new and corrected metadata\n"
  prompt = prompt + "Check if the field values and field name makes sense. If not match is found for a field name, match to an ontology.\n"
  prompt = prompt + "As far as possible, make field values adhere to ontology restrictions.\n"
  prompt = prompt + "The following text is the BioSample data description: " + biosample_desc + "\n"
  prompt = prompt + "Output ONLY the corrected record in Python dict format WITHOUT comments and suggestions\n"

  ans = llm_response(MODEL_NAME, prompt)
  print(ans)
  # SECURITY: the reply is model-generated text. It used to be passed to eval(),
  # which would execute arbitrary code; parse it as a literal only.
  try:
    llm_samples.append(ast.literal_eval(ans))
  except (ValueError, SyntaxError) as err:
    # One bad reply must not abort the whole (slow, paid) run.
    print(f"Could not parse LLM reply for sample {i}: {err}")
    llm_samples.append(None)

In [None]:
# Sanity check: number of entries collected in llm_samples so far.
print(len(llm_samples))

In [None]:
# Checkpoint the LLM-corrected samples to disk, pretty-printed with a 4-space indent.
with open('interim.json', mode='w') as checkpoint:
    json.dump(llm_samples, checkpoint, indent=4)


In [None]:
import json

# Reload the checkpointed LLM records from interim.json.
with open('interim.json', mode='r') as checkpoint:
    llm_samples = json.load(checkpoint)

# NOTE(review): these report on `data`, not on the `llm_samples` just reloaded;
# confirm that is intended.
print(type(data))
print(len(data))


In [None]:
#cedar samples
# Alternative (BioSample) field list, currently unused:
#cedar = "biosample_accession\nsample_name\nsample_title\nbioproject_accession\norganism\nisolate\nage\nbiomaterial_provider\nsex\ntissue\ncell_line\ncell_subtype\ncell_type\nculture_collection\ndev_stage\ndisease\ndisease_stage\nethnicity\nhealth_state\nkaryotype\nphenotype\npopulation\nrace\nsample_type\ntreatment\ndescription"
# Field names the corrected record may use, one per line in the prompt.
# Built from a list because the old literal contained "\title" (a TAB escape that
# corrupted the `title` field name) and an empty entry between design and disease.
cedar = "\n".join([
    "geo_accession", "gsm_accession", "title", "summary", "organism", "age",
    "biomaterial_provider", "sex", "tissue", "cell_type", "cell_line", "design",
    "disease", "treatment", "description",
])

llm_cedar_samples = []


# llm_cedar_samples is kept index-aligned with samples: None marks a sample that
# was empty or whose reply could not be parsed, because the records cell pairs
# the lists by index.
for i, sample in enumerate(samples):
  print(i)
  print(sample)
  if not sample:
    llm_cedar_samples.append(None)
    continue
  prompt = "Given the following sample from GEO':\n"
  prompt = prompt + str(sample) + '\n'
  prompt = prompt + "Report a new and corrected metadata sample where the field names must be one of " + cedar
  prompt = prompt + "\nCheck if the field values and field name makes sense. If not match is found for a field name, match to an ontology."
  prompt = prompt + "\nAs far as possible, make field values adhere to ontology restrictions.\n"
  prompt = prompt + "\ntissue field value should be one of UBERON ontology concept name (for example, sac, lung etc)"
  prompt = prompt + "\ndisease field value should be one of DOID ontology concept name (for example, arthritis etc)"
  #prompt = prompt + "\ncell type field value should be one of CL ontology concept name"
  prompt = prompt + "\nMissing value should be 'NA'"
  # Leading newline added: this sentence used to run straight into the previous one.
  prompt = prompt + "\nDo not provide any explanations and only the corrected record in Python dict format"
  ans = llm_response(MODEL_NAME, prompt)
  print(ans)
  # SECURITY: the reply is model-generated text. It used to be passed to eval(),
  # which would execute arbitrary code; parse it as a literal only.
  try:
    llm_cedar_samples.append(ast.literal_eval(ans))
  except (ValueError, SyntaxError) as err:
    # One bad reply must not abort the whole (slow, paid) run.
    print(f"Could not parse LLM reply for sample {i}: {err}")
    llm_cedar_samples.append(None)

In [None]:
# Combine the three views of every sample into one record.
# The lists are paired by position, so llm_samples and llm_cedar_samples must each
# hold an entry for every index of samples.
records = [
    {
        'BioSample': samples[idx],
        'LLM': llm_samples[idx],
        'LLM+CEDAR': llm_cedar_samples[idx],
    }
    for idx in range(len(samples))
]

In [None]:
# Sanity check: number of entries collected in llm_cedar_samples.
print(len(llm_cedar_samples))

In [None]:
import json
# Write the combined records as JSON Lines (one JSON object per line).
with open("liver_cancer_gpt4_geo_output.json", "w") as output_file:
    for record in records:
        try:
            # Serialize fully in memory first: json.dump() streams chunks to the
            # file, so a TypeError part-way through would leave a truncated
            # record (with no newline) in the output.
            line = json.dumps(record)
        except TypeError as e:
            print(record)
            print(f"Error serializing record: {e}")
            continue
        output_file.write(line + "\n")  # newline separates records