# Upload Dataset to NeMo Service


In [None]:
%load_ext dotenv
%dotenv

import os

import nemollm
import pandas as pd

# Fail fast if either NGC credential is missing from the environment
# (they are expected to be loaded from the .env file by the dotenv magic above).
for required_var in ("NGC_API_KEY", "NGC_ORG_ID"):
    assert required_var in os.environ, f"Please set the {required_var} environment variable in the .env file."

In [None]:
# Input and output filenames
# input_filename: JSON file that the conversion step below loads into a DataFrame.
input_filename = "./pubmedqa/data/pqal_fold0/train_set.json"
# output_filename: JSONL file that the conversion step writes and the upload step sends.
output_filename = "./datasets/pubmedqa-pqal_fold0-train.jsonl"

## Convert the Input Dataset to the NeMo Format

We need to take the input file, which is in a nested dictionary format, and convert it to JSONL format according to: https://llm.ngc.nvidia.com/docs/model-customization-with-p-tuning.html#dataset

Each prompt should look like the following:

```handlebars
Provided context:
{{#each context}}
  {{this.LABEL}}: {{this.CONTEXT}}
{{/each}}
Question: {{QUESTION}}
Answer (yes / no / maybe):
```


In [None]:
# Load the data.
import random

# orient="index" tells pandas to treat each top-level JSON key as a row label
# and the nested dictionary under it as that row's columns.
train_df = pd.read_json(input_filename, orient="index")

def apply_fn(x: pd.Series):
    """Render one dataset row as a prompt string.

    Reads ``x.LABELS``, ``x.CONTEXTS`` and ``x.QUESTION``. Labels and contexts
    are paired positionally (extra items in the longer sequence are ignored)
    and each pair is written as a ``LABEL: CONTEXT`` line, followed by the
    question and the answer cue.

    Returns:
        The formatted prompt text.
    """
    context_lines = []
    for section_label, section_text in zip(x.LABELS, x.CONTEXTS):
        context_lines.append(f"{section_label}: {section_text}")
    context_block = "\n".join(context_lines)

    return (
        "Provided context:\n"
        f"{context_block}\n"
        f"Question: {x.QUESTION}\n"
        "Answer (yes / no / maybe):"
    )

# Build the two columns of the target format: { "prompt": "question", "completion": "answer" }
prompt_inputs = train_df[["CONTEXTS", "LABELS", "QUESTION"]]
train_df["prompt"] = prompt_inputs.apply(apply_fn, axis=1)
train_df["completion"] = train_df["final_decision"]

# Print one randomly chosen prompt as a visual sanity check
all_prompts = train_df["prompt"]
sample_position = random.randint(0, len(train_df) - 1)
sample_prompt = all_prompts.iloc[sample_position]
print(f"=====EXAMPLE PROMPT=====\n{sample_prompt}\n========================")

# Make sure the destination directory exists before writing
output_dir = os.path.dirname(output_filename)
os.makedirs(output_dir, exist_ok=True)

# Write one JSON object per line (JSON Lines) containing only the two columns
with open(output_filename, "w") as out_file:
    records = train_df[["prompt", "completion"]]
    records.to_json(out_file, orient="records", lines=True)


In [None]:
# Upload to LLM
# Build the client from the two credentials read from the environment.
nemo = nemollm.NemoLLM(
    api_key=os.environ["NGC_API_KEY"],
    org_id=os.environ["NGC_ORG_ID"],
)

# Send the JSONL file produced by the conversion step.
# NOTE(review): the meaning of the positional `True` is not visible here —
# presumably a progress/verbosity flag; confirm against the nemollm `upload`
# signature and consider passing it by keyword.
nemo.upload(output_filename, True)

# Reached only if none of the calls above raised.
print("===SUCCESS!!!===")