In [1]:
import os
import torch
import sklearn
import numpy as np
import pandas as pd
import torch.nn.functional as F
import json
import gc

from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch import nn
from torch.optim import AdamW
from lightning import Fabric

from huggingface_hub import login
from transformers import get_scheduler, pipeline, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer, EarlyStoppingCallback
from peft import get_peft_model, LoraConfig, TaskType, PeftModelForCausalLM

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split


from pprint import pprint
from tqdm import tqdm
# import warnings
# warnings.simplefilter("ignore", UserWarning)

%config InlineBackend.figure_formats = ['svg']




In [None]:


student_model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

lora_config = LoraConfig(
    use_dora=True,
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM"
)

student_model = AutoModelForCausalLM.from_pretrained(student_model_name, device_map="auto", quantization_config=quant_config,)
tokenizer = AutoTokenizer.from_pretrained(student_model_name)

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

for param in student_model.parameters():
  param.requires_grad = False

student_model.gradient_checkpointing_enable()
student_model.enable_input_require_grads()


lora_model = get_peft_model(student_model, lora_config)

fabric = Fabric(accelerator="cuda", devices=1, precision="bf16")
device = fabric.device
fabric.launch()
torch.set_float32_matmul_precision("medium")

model = PeftModelForCausalLM.from_pretrained(lora_model, "./incombat_best")

def generate_response_with_role(model=model, user_input=[], max_length=4096):
    prompt = [
      {"role": "system", "content": "in a text-based adventure (Dungeons and Dragons).\n\nYour job is to analyze conversation dialog and answer if the players are in a combat state or not, providee your reasoning , and give final answer as 'IN_COMBAT' or 'NOT_IN_COMBAT' only.\n\n Here is the dialogs:"},
    ]

    prompt.extend(user_input)

    input_ids = tokenizer.apply_chat_template(
        prompt,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_length=max_length,
            eos_token_id=terminators,
            num_return_sequences=1,
            temperature=1.0,
            top_p=0.9,
            top_k=50,
            do_sample=True,
        )

    response = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(response, skip_special_tokens=True)

    return response


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

c:\Users\sonso\Docs\CEDT\2_2\nlp_sys\nlp\Lib\site-packages\lightning\fabric\connector.py:571: `precision=bf16` is supported for historical reasons but its usage is discouraged. Please set your precision to bf16-mixed instead!
Using bfloat16 Automatic Mixed Precision (AMP)
You are using a CUDA device ('NVIDIA GeForce RTX 3060 Ti') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


In [16]:
import json

# Load from file
with open("extracted_to_llama_v1.json", "r", encoding="utf-8") as f:
    messages = json.load(f)

print(messages[0])

[{'role': '<DM>', 'content': '"That\'s okay. Just a second. You let us know when it\'s okay to talk." [character: The Dungeon Master, class: -, race: -, in_context: False, items: -]'}, {'role': '<PLAYER1>', 'content': '"I don\'t know what that means, a bottleneck." [character: Grog Strongjaw, class: Barbarian (Path of the Berserker)/Fighter (Battle Master), race: Goliath, in_context: False, items: -]'}, {'role': '<DM>', 'content': '"Are we all right? And how are we? Exterminate. Exterminate." [character: The Dungeon Master, class: -, race: -, in_context: False, items: -]'}, {'role': '<PLAYER4>', 'content': '"Nice. Audio isn\'t working they\'re going to watch something else. Thanks, douche. (laughter)" [character: Scanlan Shorthalt, class: Bard (College of Lore), race: Gnome, in_context: False, items: -]'}, {'role': '<DM>', 'content': '"Is the mic adjusted? Is it any better?" [character: The Dungeon Master, class: -, race: -, in_context: False, items: -]'}, {'role': '<PLAYER8>', 'conten

In [None]:
# message = [
#     {"role": "<PLAYER1>", "content": "the goblin is attacking me! can i run away?"},
#     {"role": "<DM>", "content": "no, you have a slowness debuff, you can't run away."},
#     {"role": "<PLAYER2>", "content": "i will cast fireball on the goblin."},
# ]
prompt = [
      {"role": "system", "content": "in a text-based adventure (Dungeons and Dragons).\n\nYour job is to analyze conversation dialog and answer if the players are in a combat state or not, providee your reasoning , and give final answer as 'IN_COMBAT' or 'NOT_IN_COMBAT' only.\n\n Here is the dialogs:"},
    ]

prompt.extend(messages)

# prompt
# tokenizer.apply_chat_template(prompt, tokenize=False)
# input_ids = tokenizer.apply_chat_template(
#         prompt,
#         add_generation_prompt=True,
#         return_tensors="pt",
#         # tokenize=False
#     ).to(model.device)
response = generate_response_with_role(user_input=messages[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


In [21]:
pprint(response)

('Based on the conversation, I analyze the context and conclude that:\n'
 '\n'
 'The players are NOT IN_COMBAT.\n'
 '\n'
 'Reasoning:\n'
 '\n'
 '* The conversation is not about combat, fighting, or any actions that would '
 'indicate a battle is taking place.\n'
 '* The players are discussing technical issues with audio, adjusting '
 'settings, and making arrangements for future donations and charity support.\n'
 '* The Dungeon Master is not describing a combat scenario or providing '
 'combat-related information.\n'
 '* The players are not engaging in combat-related role-playing or actions.\n'
 '* The conversation is focused on administrative and organizational aspects '
 'of the game, such as technical issues and charity support.\n'
 '\n'
 'Therefore, my answer is: NOT_IN_COMBAT.')
