This piece of code takes a set of BioSample records as input and generates the samples augmented with NLP and CEDAR templates. Then, automated evaluation is performed.


## **Reading BioSample Records**

In [None]:
#reading BioSample records
# The `with` block closes the file even if reading fails part-way.
with open("biosample_result_lung.txt", "r") as f:
  lines = f.readlines()

In [None]:
import re
import random

# Parse the BioSample text dump held in `lines` into one dict per record and
# collect simple usage statistics along the way.
samples = []           # parsed records, each {field_name: field_value}
field_names = {}       # field name -> number of times the field occurs
name_value_pairs = {}  # field name -> distinct values, in first-seen order
tissue_names = {}      # tissue value -> number of times the value occurs
sample = {}
for line in lines:
  # A line starting with a digit is treated as the start of a new record, so
  # the record accumulated so far is flushed before a fresh one is begun.
  if line[:1].isdigit():
    samples.append(sample)
    sample = {}
  if "/" not in line:
    continue
  x = re.findall("/.*=", line)
  if not x:
    continue
  # NOTE(review): both patterns are greedy and every "/" and "=" is removed
  # from the match, so a value that itself contains "=" ends up partly in the
  # field name and loses its "=" characters. Kept as-is; confirm whether the
  # input can contain such values.
  field_name = x[0].replace("/", "").replace("=", "")
  field_value = re.findall("=.*", line)[0].replace("=", "").replace("\"", "")
  sample[field_name] = field_value
  field_names[field_name] = field_names.get(field_name, 0) + 1
  values = name_value_pairs.setdefault(field_name, [])
  if field_value not in values:
    values.append(field_value)
  if field_name == "tissue":
    tissue_names[field_value] = tissue_names.get(field_value, 0) + 1

# The loop only flushes a record when the *next* one starts, so the final
# record must be flushed explicitly or it would be lost.
samples.append(sample)

# The first entry holds whatever preceded the first record header (an empty
# dict when the file starts with a header line), so it is not a record.
samples = samples[1:]
# Raises ValueError if fewer than 200 records were parsed.
random_items = random.sample(samples, 200)
print(random_items)

In [None]:
def print_sample(s):
  """Render record *s* as text, one ``name : value`` line per attribute.

  Every line, including the last, ends with a newline; an empty record gives
  an empty string. Values are concatenated directly, so they must be strings.
  """
  rendered = [attr + " : " + s[attr] + "\n" for attr in s.keys()]
  return "".join(rendered)

## **Generating corrected metadata using GPT-4 and CEDAR templates**

In [None]:
!pip install openai

In [None]:
#setting up the OpenAI client environment (the requests below use gpt-4)
import pandas as pd
import openai, numpy as np

# Take the key from the OPENAI_API_KEY environment variable so that a real
# secret never has to be pasted into (and saved with) the notebook. The
# placeholder remains as the fallback when the variable is not set.
import os

api_key = os.environ.get('OPENAI_API_KEY', 'YOUR_API_KEY')
openai.api_key = api_key

In [None]:
#Pure LLM output
# Client object that get_ans() uses for every chat-completion request; it is
# created once with the `api_key` configured in the setup cell.
from openai import OpenAI
client = OpenAI(api_key=api_key)


def get_ans(prompt, *, model="gpt-4", temperature=0.6):
  """Send *prompt* as a single user message and return the model's reply.

  Args:
    prompt: Text placed in the one ``user`` message of the request.
    model: Name of the chat model to query. Defaults to ``"gpt-4"``.
    temperature: Sampling temperature passed to the API. Defaults to ``0.6``.

  Returns:
    The ``content`` of the first choice in the response.

  The request goes through the module-level ``client``; any exception raised
  by the client propagates to the caller.
  """
  response = client.chat.completions.create(
    model=model,
    temperature=temperature,
    messages=[
        {"role": "user", "content": prompt},
    ],
  )
  return response.choices[0].message.content

In [None]:
#llm samples
# Ask the model for a corrected version of each sampled record, without any
# template guidance. The result list is index-aligned with `random_items`.
llm_samples = []
# `rand_samples` was never defined in the notebook; the sampled records are
# held in `random_items`, the same list the CEDAR run iterates over.
for sample in random_items:
  # NOTE(review): the prompt sentences are concatenated without separators
  # (e.g. "...metadataCheck if..."); the text is left untouched so that the
  # requests sent to the model do not change.
  prompt = "Given the following sample from BioSample':\n"
  prompt = prompt + str(sample) +'\n'
  prompt = prompt + "Report a new and corrected metadata"
  prompt = prompt + "Check if the field values and field name makes sense. If not match is found for a field name, match to an ontology."
  prompt = prompt + "As far as possible, make field values adhere to ontology restrictions."
  prompt = prompt + "Do not provide any explanations and only the corrected record in Python dict format"
  ans = get_ans(prompt)
  # SECURITY: eval() executes whatever text the model returned. Consider
  # ast.literal_eval(), which accepts only Python literals, once it is
  # confirmed that the replies are always plain dict literals.
  llm_samples.append(eval(ans))

In [None]:
#cedar samples
# Newline-separated field names that the prompt below offers as the only
# permitted field names for the corrected record.
cedar = "biosample_accession\nsample_name\nsample_title\nbioproject_accession\norganism\nisolate\nage\nbiomaterial_provider\nsex\ntissue\ncell_line\ncell_subtype\ncell_type\nculture_collection\ndev_stage\ndisease\ndisease_stage\nethnicity\nhealth_state\nkaryotype\nphenotype\npopulation\nrace\nsample_type\ntreatment\ndescription"
llm_cedar_samples = []
for i, sample in enumerate(random_items):
  # The prompt is the plain concatenation of these pieces, in this order.
  prompt_parts = [
    "Given the following sample from BioSample regarding lung cancer':\n",
    str(sample),
    '\n',
    "Report a new and corrected metadata sample where the field names must be one of ",
    cedar,
    "\nCheck if the field values and field name makes sense. If not match is found for a field name, match to an ontology.",
    "\nAs far as possible, make field values adhere to ontology restrictions.\n",
    "\ntissue field value should be one of UBERON ontology concept name (for example, sac, lung etc)",
    "\ndisease field value should be one of DOID ontology concept name (for example, arthritis etc)",
    "\ncell type field value should be one of CL ontology concept name",
    "\nMissing value should be 'NA'",
    "Do not provide any explanations and only the corrected record in Python dict format",
  ]
  prompt = "".join(prompt_parts)
  ans = get_ans(prompt)
  # The reply is expected to be a Python dict literal and is evaluated as
  # code to turn it into a dict.
  llm_cedar_samples.append(eval(ans))

In [None]:
# Bundle each sampled record with its two model-corrected versions.
# The LLM outputs were generated from `random_items`, so that list (not the
# first entries of `samples`) supplies the matching original record.
if not (len(random_items) == len(llm_samples) == len(llm_cedar_samples)):
  raise ValueError("random_items, llm_samples and llm_cedar_samples must have the same length")

records = [
  {'BioSample': original, 'LLM': corrected, 'LLM+CEDAR': corrected_cedar}
  for original, corrected, corrected_cedar
  in zip(random_items, llm_samples, llm_cedar_samples)
]

In [None]:
import json
with open("output.json", "w") as output_file:
    # Write one JSON document per line.
    for record in records:
        try:
            # Serialize completely in memory first. json.dump() writes to the
            # file piece by piece, so a failure part-way through a record
            # would leave a truncated fragment on the line.
            serialized = json.dumps(record)
        except TypeError as e:
            # Skip records that contain values JSON cannot represent.
            print(f"Error serializing record: {e}")
            continue
        output_file.write(serialized + "\n")

# **Automated evaluation**

In [None]:
def getInfo(filename):
  """Read *filename* and return its lines as a list of cleaned strings.

  Each line has leading/trailing whitespace stripped and any byte-order-mark
  character (``\\ufeff``) removed. Blank lines are kept as empty strings.

  Args:
    filename: Path of the text file to read.

  Returns:
    A list with one string per line of the file, in file order.
  """
  # The `with` block closes the file even if reading raises.
  with open(filename, "r") as f:
    return [line.strip().replace("\ufeff", "") for line in f]


In [None]:
#cell
# Accepted cell-type terms, one per line of cell.txt; eval_cell() checks
# record values against this list.
# NOTE(review): presumably Cell Ontology (CL) concept names - confirm.
cell = getInfo("cell.txt")

In [None]:
#disease
# Accepted disease terms, one per line of disease.txt; eval_disease() checks
# record values against this list.
# NOTE(review): presumably Disease Ontology (DOID) concept names - confirm.
disease = getInfo("disease.txt")

In [None]:
#uberon
# Accepted tissue terms, one per line of UBERON.txt; eval_tissue() checks
# record values against this list.
tissue = getInfo("UBERON.txt")

In [None]:
def eval_tissue(samples, vocabulary=None):
  """Score the ``tissue`` field of every record against a term list.

  Args:
    samples: Iterable of dict records.
    vocabulary: Optional collection of accepted term strings. When omitted,
      the module-level ``tissue`` list is used.

  Returns:
    A list with one score per record, in input order:
    ``1`` if the value (lower-cased and stripped) is an accepted term or
    contains ``'NA'``; ``0`` if the value is empty or not accepted; ``0.5``
    if the record has no ``tissue`` field.

  Also prints the fraction of records carrying the field that were not
  counted as errors, or ``nan`` when no record carries the field (this
  case previously raised ZeroDivisionError).
  """
  # A set makes each membership test constant-time instead of a list scan.
  # NOTE(review): values are lower-cased before the lookup but the term list
  # is used as loaded - confirm the list is lower-case.
  known = set(tissue if vocabulary is None else vocabulary)
  count = 0
  error = 0
  val = []
  for s in samples:
    if "tissue" not in s:
      val.append(0.5)
      continue
    count += 1
    value = s["tissue"]
    if not value:
      # NOTE(review): an empty value scores 0 but is not added to `error`,
      # so the printed fraction still counts it as correct. Kept unchanged;
      # confirm which of the two is intended.
      val.append(0)
      continue
    if str(value).lower().strip() in known:
      val.append(1)
      continue
    # NOTE(review): this is a substring test, so any value containing "NA"
    # (e.g. "RNA") is accepted as a missing-value marker. Kept unchanged.
    try:
      marked_missing = 'NA' in value
    except TypeError:
      # Non-iterable values (e.g. numbers) cannot hold the marker.
      marked_missing = False
    if marked_missing:
      val.append(1)
    else:
      error += 1
      val.append(0)

  print((count - error) / count if count else float("nan"))
  return val

In [None]:
def eval_disease(samples, vocabulary=None):
  """Score the ``disease`` field of every record against a term list.

  Args:
    samples: Iterable of dict records.
    vocabulary: Optional collection of accepted term strings. When omitted,
      the module-level ``disease`` list is used.

  Returns:
    A list with one score per record, in input order:
    ``1`` if the value (lower-cased and stripped) is an accepted term or
    contains ``'NA'``; ``0`` if the value is empty or not accepted; ``0.5``
    if the record has no ``disease`` field.

  Also prints the fraction of records carrying the field that were not
  counted as errors, or ``nan`` when no record carries the field (this
  case previously raised ZeroDivisionError).
  """
  # A set makes each membership test constant-time instead of a list scan.
  # NOTE(review): values are lower-cased before the lookup but the term list
  # is used as loaded - confirm the list is lower-case.
  known = set(disease if vocabulary is None else vocabulary)
  count = 0
  error = 0
  val = []
  for s in samples:
    if "disease" not in s:
      val.append(0.5)
      continue
    count += 1
    value = s["disease"]
    if not value:
      # NOTE(review): an empty value scores 0 but is not added to `error`,
      # so the printed fraction still counts it as correct. Kept unchanged;
      # confirm which of the two is intended.
      val.append(0)
      continue
    if str(value).lower().strip() in known:
      val.append(1)
      continue
    # NOTE(review): this is a substring test, so any value containing "NA"
    # is accepted as a missing-value marker. Kept unchanged.
    try:
      marked_missing = 'NA' in value
    except TypeError:
      # Non-iterable values (e.g. numbers) cannot hold the marker.
      marked_missing = False
    if marked_missing:
      val.append(1)
    else:
      error += 1
      val.append(0)
  print((count - error) / count if count else float("nan"))
  return val

In [None]:
def eval_cell(samples, vocabulary=None):
  """Score the cell-type field of every record against a term list.

  The field is looked up as ``"cell type"`` first and as ``"cell_type"``
  otherwise.

  Args:
    samples: Iterable of dict records.
    vocabulary: Optional collection of accepted term strings. When omitted,
      the module-level ``cell`` list is used.

  Returns:
    A list with one score per record, in input order:
    ``1`` if the value (lower-cased and stripped) is an accepted term or
    contains ``'NA'``; ``0`` if the value is empty or not accepted; ``0.5``
    if the record has neither field.

  Also prints the fraction of records carrying the field that were not
  counted as errors, or ``nan`` when no record carries the field (this
  case previously raised ZeroDivisionError).
  """
  # A set makes each membership test constant-time instead of a list scan.
  # NOTE(review): values are lower-cased before the lookup but the term list
  # is used as loaded - confirm the list is lower-case.
  known = set(cell if vocabulary is None else vocabulary)
  count = 0
  error = 0
  val = []
  for s in samples:
    if "cell type" in s:
      t = "cell type"
    elif "cell_type" in s:
      t = "cell_type"
    else:
      val.append(0.5)
      continue
    count += 1
    value = s[t]
    if not value:
      # NOTE(review): an empty value scores 0 but is not added to `error`,
      # so the printed fraction still counts it as correct. Kept unchanged;
      # confirm which of the two is intended.
      val.append(0)
      continue
    if str(value).lower().strip() in known:
      val.append(1)
      continue
    # NOTE(review): this is a substring test, so any value containing "NA"
    # is accepted as a missing-value marker. Kept unchanged.
    try:
      marked_missing = 'NA' in value
    except TypeError:
      # Non-iterable values (e.g. numbers) cannot hold the marker.
      marked_missing = False
    if marked_missing:
      val.append(1)
    else:
      error += 1
      val.append(0)

  print((count - error) / count if count else float("nan"))
  return val


In [None]:
# Run every evaluator on the three record sets. Each call prints the fraction
# of field-bearing records not counted as errors and returns per-record
# scores, which are not stored here.
# NOTE(review): `samples` is the full parsed set, whereas both LLM lists are
# built from the 200 records in `random_items`; for a like-for-like baseline
# this presumably should be `random_items` - confirm.
eval_disease(llm_cedar_samples)
eval_disease(llm_samples)
eval_disease(samples)
eval_tissue(llm_cedar_samples)
eval_tissue(llm_samples)
eval_tissue(samples)
eval_cell(llm_cedar_samples)
eval_cell(llm_samples)
eval_cell(samples)
