In [None]:
from cajajejo.training.trainer import NeuripsTrainer
from cajajejo.training.utils import generate_prompt
import logging

# Send the "cajajejo" package's log records to a stream handler at DEBUG level.
logger = logging.getLogger("cajajejo")
logger.setLevel(logging.DEBUG)

# Re-running this cell must not attach a second handler (every record would then
# be emitted twice), so the handler is tagged with a name and only added once.
_HANDLER_NAME = "cajajejo-notebook-stream"
if not any(h.get_name() == _HANDLER_NAME for h in logger.handlers):
    handler = logging.StreamHandler()
    handler.set_name(_HANDLER_NAME)
    # Named `formatter` rather than `format` so the built-in format() stays usable.
    formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)

In [None]:
import pandas as pd
from datasets import Dataset

# Load the SFT training data; lines=True reads the file as JSON Lines
# (one JSON record per line).
df = pd.read_json(
    path_or_buf='/home/user/neurips-llm-efficiency-challenge/data/sft_train_shuffled_reduced.jsonl',
    lines=True,
)

In [None]:
# Discard the two columns that are not used when building the training text.
df = df.drop(columns=["dataset", "combined_instruction"])

In [None]:
# Expose the 'input' column under the name 'context'.
df = df.rename(columns={'input': 'context'})

In [None]:
# Build the single "text" column used for training: the rendered prompt followed
# by the reference output and an explicit "\n### End" terminator.
# generate_prompt is called once per row (axis=1).
prompt_text = df.apply(generate_prompt, axis=1)
response_text = df["output"] + "\n### End"

# Assemble a fresh one-column frame instead of slicing `df` and then assigning a
# new column into that slice (which can raise SettingWithCopyWarning) followed by
# an in-place drop. The resulting frame is the same: one "text" column, same index.
df = (prompt_text + response_text).to_frame(name="text")

In [None]:
# Convert the frame to a Dataset, then hold out 5% of the rows as the "test"
# split; the seed is fixed so the split is the same on every run.
full_dataset = Dataset.from_pandas(df)
dataset = full_dataset.train_test_split(test_size=0.05, seed=42)

In [None]:
# Display the object returned by train_test_split to inspect the splits.
dataset

In [None]:
# Job configuration file handed to NeuripsTrainer.from_config.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable project root so the notebook runs on other machines.
CONFIG_PATH = '/home/user/neurips-llm-efficiency-challenge/jobs/training/llama13b_A100_experimental.job_config.yml'

In [None]:
# Build the trainer from the job config file at CONFIG_PATH.
trainer = NeuripsTrainer.from_config(CONFIG_PATH)

In [None]:
!huggingface-cli login --token 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

In [None]:
# Ask the trainer for the model in "training" mode.
# NOTE(review): what the mode changes is defined in NeuripsTrainer, not visible here.
model = trainer.get_model(mode="training")

In [None]:
# Run training on the "train" split with the "test" split as evaluation data.
# "text" is the dataset column holding prompt + response, built earlier.
trainer.train_model(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    dataset_text_field="text"
)

In [None]:
# Rebinds `model`: from here on it is whatever get_trained_lora_model() returns,
# not the object that was passed to train_model.
model = trainer.get_trained_lora_model()

In [None]:
# Tokenizer used to encode prompts and decode outputs in the generation cells.
tokenizer = trainer.get_tokenizer()

In [None]:
RESPONSE_MARKER = "### Response:"

# Take 30 shuffled test records (fixed seed) as plain dicts.
shuffled_test = dataset["test"].to_pandas().sample(frac=1, random_state=3145)
selection = shuffled_test.head(30).to_dict(orient="records")

# Cut each record's text at the response marker: everything up to and including
# the marker is the instruction, the piece right after it is the reference response.
instructions = []
responses = []
for record in selection:
    pieces = record["text"].split(RESPONSE_MARKER)
    instructions.append(pieces[0] + RESPONSE_MARKER)
    responses.append(pieces[1])

In [None]:
# NOTE(review): `generation_output` is first assigned in the generation loop in a
# later cell, so this cell raises NameError on a fresh "Restart & Run All".
# Move it below that loop or remove it.
generation_output[0].shape

In [None]:
from cajajejo.training.utils import extract_response_text, generate_prompt

# Generate a completion for every sampled instruction and print it next to the
# reference response for manual comparison. The extracted generations are also
# collected in `responses_gen`.
responses_gen = []
# enumerate(..., start=1) replaces the hand-maintained example counter.
for i, (instruction, response) in enumerate(zip(instructions, responses), start=1):
    input_ids = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True).input_ids.to('cuda')
    # max_new_tokens counts generated tokens only (the prompt is excluded), so the
    # prompt length must not be added on top of the 512-token budget.
    # NOTE(review): temperature/top_k/top_p only take effect when sampling is
    # enabled (do_sample=True here or in the model's generation config) — confirm.
    generation_output = model.generate(
        input_ids=input_ids, max_new_tokens=512, temperature=0.8, top_k=50, top_p=0.95
    )
    # The decoded sequence is passed through extract_response_text to obtain the
    # response portion.
    response_gen = extract_response_text(tokenizer.decode(generation_output[0]))
    responses_gen.append(response_gen)
    print(f"Example {i}")
    print("---------")
    print(f"Instruction: {instruction}")
    print("---------")
    print(f"Response: {response}")
    print("---------")
    print(f"Generated response: {response_gen}")
    print("\n\n")

In [None]:
# Display the reference responses that were split out of the sampled test records.
responses

In [None]:
# Save the sampled test records together with their reference and generated
# responses. The frame is rebuilt from `selection` so it has exactly one row per
# entry in `responses` / `responses_gen`. Re-sampling the test split with another
# seed and a different row count made assign() fail with a length mismatch and
# would have paired responses with unrelated rows.
(
    pd.DataFrame(selection)
    .assign(response=responses, generated_response=responses_gen)
    .to_csv('responses.csv')
)

In [None]:
# Tokenize one prompt string for the single-example generation in the next cells.
# `selection` holds record dicts, which the tokenizer cannot take as text input;
# the prompt strings live in `instructions`. Only the first generated sequence is
# decoded further down, so a single prompt is sufficient and no pad tokens are
# introduced into the input.
input_ids = tokenizer(instructions[0], return_tensors="pt", padding=True, truncation=True).input_ids.to('cuda')

In [None]:
# Check the shape of the tokenized input tensor.
input_ids.shape

In [None]:
# Generate up to 512 new tokens for the tokenized input. This overwrites the
# `generation_output` left behind by the evaluation loop.
# NOTE(review): no attention_mask is passed — confirm that is acceptable if
# input_ids ever contains padding.
generation_output = model.generate(
  input_ids=input_ids, max_new_tokens=512
)

In [None]:
# Decode the first generated sequence back to text. Rebinds `response`, which the
# evaluation loop also used as its loop variable.
response = tokenizer.decode(generation_output[0])

In [None]:
# Print what extract_response_text returns for the decoded output.
# NOTE(review): its parsing rules live in cajajejo.training.utils, not visible here.
print(extract_response_text(response))

In [None]:
# Peek at two shuffled test records (seed 78643) as plain dicts.
dataset["test"].to_pandas().sample(frac=1, random_state=78643).head(2).to_dict(orient="records")

In [None]:
# Drop the notebook's references to the large objects so their memory can be
# reclaimed. `del name` raises NameError when the name is not bound (`peft_model`
# is never assigned in the visible cells), which aborted the cell before the
# remaining names were released; popping from the namespace tolerates that.
for _name in ("model", "peft_model", "input_ids"):
    globals().pop(_name, None)

In [None]:
# Release `trained_model` if it is bound. A plain `del` raises NameError here
# because no visible cell assigns that name.
globals().pop("trained_model", None)

In [None]:
import gc

import torch

# Collect unreachable Python objects first so the tensors they own are actually
# freed, then hand the now-unused cached blocks back to the CUDA driver. In the
# opposite order, memory released by the collector stays in PyTorch's cache.
gc.collect()
torch.cuda.empty_cache()
