# Upload Dataset to NeMo Service


In [17]:
%load_ext dotenv
%dotenv

import os

import nemollm
import pandas as pd

# Fail fast with a readable message when the key is missing: it is read from
# os.environ further down, when the nemollm.NemoLLM client is constructed.
assert "NGC_API_KEY" in os.environ, "Please set the NGC_API_KEY environment variable in the .env file."

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [18]:
# Input and output filenames
# input_filename: JSON file that is loaded with pd.read_json(..., orient="index").
# output_filename: JSON Lines file of prompt/completion records; this is the
# file that is later passed to the upload call.
input_filename = "./pubmedqa/data/pqal_fold0/train_set.json"
output_filename = "./datasets/pubmedqa-pqal_fold0-train.jsonl"

## Convert the Input Dataset to the NeMo Format

We need to take the input file, which is in a nested dictionary format, and convert it to JSONL format according to: https://llm.ngc.nvidia.com/docs/model-customization-with-p-tuning.html#dataset

Each prompt should look like the following:

```handlebars
Provided context:
{{#each context}}
  {{this.LABEL}}: {{this.CONTEXT}}
{{/each}}
Question: {{QUESTION}}
Answer (yes / no / maybe):
```


In [19]:
# Load the data.
import random

# orient="index": each top-level key of the JSON object becomes a row label and
# its nested dictionary supplies that row's columns.
train_df = pd.read_json(input_filename, orient="index")

def apply_fn(x: pd.Series):
    """Render one dataset row as a text prompt.

    Args:
        x: Row exposing ``LABELS`` and ``CONTEXTS`` (paired up positionally)
            and ``QUESTION``.

    Returns:
        The prompt string: a "Provided context:" header, one
        "<label>: <context>" line per pair, the question line, and the
        trailing answer cue.
    """
    context_lines = []
    # zip() pairs entries by position and stops at the shorter sequence.
    for section_label, section_text in zip(x.LABELS, x.CONTEXTS):
        context_lines.append(f"{section_label}: {section_text}")
    context_block = "\n".join(context_lines)

    return f"Provided context:\n{context_block}\nQuestion: {x.QUESTION}\nAnswer (yes / no / maybe):"

# Get the data into the correct format { "prompt": "question", "completion": "answer" }
# Only the three columns that apply_fn reads are handed to it, one row at a time.
train_df["prompt"] = train_df[["CONTEXTS", "LABELS", "QUESTION"]].apply(apply_fn, axis=1)
train_df["completion"] = train_df["final_decision"]

# Show one value (the row is picked at random, so it differs between runs):
print(("=====EXAMPLE PROMPT=====\n"
       f"{train_df['prompt'].iloc[random.randint(0, len(train_df)-1)]}\n"
       "========================"))

# Save it to disk
# os.path.dirname() returns "" for a bare filename and os.makedirs("") raises
# FileNotFoundError, so only create the directory when the path has one.
output_dir = os.path.dirname(output_filename)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# orient="records" with lines=True writes one JSON object per line (JSONL),
# restricted to the two exported columns.
with open(output_filename, "w") as f:
    train_df[["prompt", "completion"]].to_json(f, orient="records", lines=True)


=====EXAMPLE PROMPT=====
Provided context:
PURPOSE: To determine whether prophylactic inhaled heparin is effective for the prevention and treatment of pneumonia patients receiving mechanical ventilation (MV) in the intensive care unit.
METHODS: A phase 2, double blind randomized controlled trial stratified for study center and patient type (non-operative, post-operative) was conducted in three university-affiliated intensive care units. Patients aged ≥18years and requiring invasive MV for more than 48hours were randomized to usual care, nebulization of unfractionated sodium heparin (5000 units in 2mL) or placebo nebulization with 0.9% sodium chloride (2mL) four times daily with the main outcome measures of the development of ventilator associated pneumonia (VAP), ventilator associated complication (VAC) and sequential organ failure assessment scores in patients with pneumonia on admission or who developed VAP.
TRIAL REGISTRATION: Australian and New Zealand Clinical Trials Registry ACTR

In [21]:
# Upload to LLM
# The API key is taken from the environment rather than written into the notebook.
nemo = nemollm.NemoLLM(
    api_key=os.environ["NGC_API_KEY"],
    # NOTE(review): the organization ID is hard-coded here; confirm whether it
    # should come from configuration / the environment instead.
    org_id="bwbg3fjn7she",
)

# NOTE(review): the meaning of the positional True is not visible in this
# notebook; check the nemollm upload() signature and consider a keyword argument.
nemo.upload(output_filename, True)

# Reached only when the calls above did not raise.
print("===SUCCESS!!!===")

pubmedqa-pqal_fold0-train.jsonl: 690kB [00:03, 187kB/s]                             

===SUCCESS!!!===



