In [1]:
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm import tqdm
import openai
import os

openai.api_key = os.environ["OPENAI_KEY"]


def get_openai_response(prompt_text, text):
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo-16k-0613",
        messages=[{"role": "user", 
                   "content": f"{prompt_text}: {text}"}],
        temperature=0,
        max_tokens=500,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    return response["choices"][0]["message"]["content"]


def write_to_parquet(test_records, filename, prompt_text):
    schema = pa.schema([
        ('nct_id', pa.string()),
        ('description', pa.string()),
        ('parsed_output', pa.dictionary(
            pa.string(), 
            pa.string())
        )
    ])
    pq_writer = pq.ParquetWriter(filename, schema)

    for record in tqdm(test_records):
        nct_id = record['nct_id']
        description = record['description']
        response_json = get_openai_response(
            prompt_text,
            description)
        table = pa.Table.from_arrays([[nct_id], [description], [response_json]], schema=schema)
        pq_writer.write_table(table)

    pq_writer.close()

In [2]:
data = [
    {
        "nct_id": "NCT11111111",
        "description": "This phase 3 trial evaluates the efficacy of nimbruvica compared to standard chemotherapy in patients with relapsed or refractory Acute Myeloid Leukemia (AML). AML is a cancer of the myeloid line of blood cells characterized by rapid growth of abnormal white blood cells. The study will enroll 530 participants across 100 sites globally."
    },
    {
        "nct_id": "NCT22222222",
        "description": "This is a randomized, double-blind trial assessing long-term safety and efficacy of VYX-561 versus imatinib in patients with Chronic Myeloid Leukemia (CML) positive for the Philadelphia chromosome. CML causes increased and unregulated myeloid cell growth. VYX-561 is an investigational oral targeted therapy that inhibits the abnormal protein driving CML progression."
    },
    {
        "nct_id": "NCT33333333",
        "description": "This randomized study investigates the combination of idelvova and rituximab versus rituximab alone for previously untreated Chronic Lymphocytic Leukemia (CLL). CLL results in excessive abnormal lymphocytes and tends to progress slowly over years. Idelvova is thought to enhance rituximab's anti-cancer effects by increasing immune activity against malignant CLL cells."
    },
    {
        "nct_id": "NCT44444444", 
        "description" : "This phase 2, open-label study evaluates niltroxan monotherapy in patients with relapsed or refractory Hairy Cell Leukemia. The drug will be administered orally twice daily. Hairy Cell Leukemia is a rare, slow-growing cancer where the bone marrow overproduces abnormal B cell lymphocytes with a 'hairy' appearance."
    },
    {
        "nct_id": "NCT55555555",
        "description": "This multinational phase 3 trial investigates the addition of cortiumab to standard chemotherapy for pediatric patients with high-risk B-cell Acute Lymphoblastic Leukemia (ALL). ALL is an aggressive cancer starting from immature white blood cells. Cortiumab is a monoclonal antibody designed to target CD22 antigen on malignant cells to enhance chemotherapy's anti-leukemia effects."
    }
]

In [3]:
data

[{'nct_id': 'NCT11111111',
  'description': 'This phase 3 trial evaluates the efficacy of nimbruvica compared to standard chemotherapy in patients with relapsed or refractory Acute Myeloid Leukemia (AML). AML is a cancer of the myeloid line of blood cells characterized by rapid growth of abnormal white blood cells. The study will enroll 530 participants across 100 sites globally.'},
 {'nct_id': 'NCT22222222',
  'description': 'This is a randomized, double-blind trial assessing long-term safety and efficacy of VYX-561 versus imatinib in patients with Chronic Myeloid Leukemia (CML) positive for the Philadelphia chromosome. CML causes increased and unregulated myeloid cell growth. VYX-561 is an investigational oral targeted therapy that inhibits the abnormal protein driving CML progression.'},
 {'nct_id': 'NCT33333333',
  'description': "This randomized study investigates the combination of idelvova and rituximab versus rituximab alone for previously untreated Chronic Lymphocytic Leukemia

In [4]:
prompt = """
    Return as JSON. 
    Include: indication/disease name, indication as EFO ID, product name, trial phase. 
    Don't return the original text.
    """
text = """This phase 3, randomized, double-blind trial will assess long-term safety
    and efficacy of VYX-561 versus imatinib in patients with Chronic Myeloid Leukemia 
    (CML) positive for the Philadelphia chromosome. CML causes increased and unregulated 
    myeloid cell growth. VYX-561 is an investigational oral targeted therapy that 
    inhibits the abnormal protein driving CML progression.
    """

response_json = get_openai_response(
    prompt_text=prompt, 
    text=text
)

In [5]:
import json

json.loads(response_json)

{'indication_name': 'Chronic Myeloid Leukemia',
 'indication_efo_id': 'EFO:0000565',
 'product_name': 'VYX-561',
 'trial_phase': 'Phase 3'}

In [6]:
import json

for record in data:
    id = record["id"]
    description = record["description"]

json.loads(response_json)

KeyError: 'id'

In [None]:
write_to_parquet(
    data, 
    "output.parquet", 
    "Return as JSON: which includes: indication/disease name, indication as EFO ID, product name, trial phase. Don't returns the original text")

TypeError: The dictionary index type should be integer.