In [1]:
import os

import numpy as np
import torch
from datasets import Dataset
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

from data_processing import util
from model_utils import evaluate

In [2]:
# --- Run configuration: everything a reader might want to tune lives here ---
DATA_TYPE = "mbpt_0_top"
MAX_MODEL_LENGTH = 8192

# Short run key -> Hugging Face Hub id of the matching base checkpoint.
# Deriving MODEL_NAME from the key keeps the fine-tuned adapter (MODEL_PATH)
# and the base weights in sync when switching between model sizes.
BASE_MODEL_IDS = {
    "llama3-8b-instruct": "meta-llama/Meta-Llama-3-8B-Instruct",
    "llama3-70b-instruct": "meta-llama/Meta-Llama-3-70B-Instruct",
}
# Named MODEL_KEY (not `model`) so it is not confused with the network object
# that a later cell binds to `model`.
MODEL_KEY = "llama3-8b-instruct"

MODEL_PATH = util.get_most_recent_model_path(MODEL_KEY, DATA_TYPE)
MODEL_NAME = BASE_MODEL_IDS[MODEL_KEY]
# NOTE(review): absolute, user-specific path; consider making it configurable.
CACHE_DIR = "/nlp/scr/neigbe/.cache"

llama3-8b-instruct


In [3]:
# Explicitly opt in to parallel tokenization in the Hugging Face tokenizers
# backend. Requires `os`, which is imported with the other dependencies at the
# top of the notebook (it was previously missing, so this cell raised
# NameError on a fresh kernel).
os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [4]:
# Only the third returned split is kept for evaluation; the first two
# (presumably train / validation -- confirm in data_processing.util) are
# discarded.
_, _, test_df = util.get_data_splits(
    DATA_TYPE,
    valid_size=0.2,
    test_size=0.25,
)

# The return value is not used, so this is assumed to encode the labels on
# test_df in place -- TODO confirm against util.encode_labels.
util.encode_labels(test_df, DATA_TYPE)

In [5]:
# Tokenizer for the base checkpoint. `cache_dir` must be passed by keyword:
# extra positional arguments to `from_pretrained` are forwarded to the
# tokenizer's constructor rather than interpreted as the cache location.
tkr = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    cache_dir=CACHE_DIR,
    model_max_length=MAX_MODEL_LENGTH,
)
# Reuse EOS as the padding token so batches can be padded.
tkr.pad_token_id = tkr.eos_token_id

# Base model with a 2-way sequence-classification head, loaded in bfloat16.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16,
    num_labels=2,
    use_cache=False,
    device_map="auto",
    attn_implementation="flash_attention_2",
    cache_dir=CACHE_DIR,
)

# Keep the embedding matrix and the model's pad id consistent with the tokenizer.
model.resize_token_embeddings(len(tkr))
model.config.pad_token_id = tkr.pad_token_id

# Attach the fine-tuned adapter weights found at MODEL_PATH.
# NOTE(review): loading the base model reports `score.weight` as newly
# initialized, and the metrics further down are close to chance. Verify that
# the adapter checkpoint actually restores the trained classification head.
model = PeftModel.from_pretrained(model, MODEL_PATH, device_map="auto")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Wrap the test DataFrame as a Hugging Face Dataset whose rows are returned as
# torch tensors. `Dataset` is imported with the other dependencies in the
# notebook's top import cell instead of mid-notebook.
test = Dataset.from_pandas(test_df, split="test").with_format("torch")

In [7]:
def tokenize(data):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Sequences are truncated and padded with ``padding="max_length"``; no
    explicit ``max_length`` is given, so the tokenizer's own configured
    maximum applies.
    """
    return tkr(
        data["text"],
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )


# Apply the tokenizer over the whole test set in batches.
test_tk = test.map(tokenize, batched=True)

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

In [8]:
# Trainer is used purely as an inference harness here; no TrainingArguments
# are supplied, so the library defaults apply.
trainer = Trainer(model=model)

In [9]:
# Run inference over the tokenized test set.
output = trainer.predict(test_tk)

# Gold labels as returned by the Trainer, and the highest-scoring class per
# example (argmax over axis 1 of the predictions).
labels = output.label_ids
preds = np.argmax(output.predictions, axis=1)

## Results

Overall metrics

In [10]:
# NOTE(review): arguments are passed as (preds, labels); confirm this matches
# the parameter order expected by evaluate.get_overall_metrics.
overall_metrics = evaluate.get_overall_metrics(preds, labels)
overall_metrics

{'f1': 0.44373288097609564,
 'recall': 0.4446777618185225,
 'precision': 0.44478121299008916}

Per-class metrics

In [11]:
# Per-label breakdown; DATA_TYPE is passed through to the helper along with
# the predictions and gold labels.
class_metrics = evaluate.get_class_metrics(preds, labels, DATA_TYPE)
class_metrics

Unnamed: 0,label,f1,recall,precision
0,E,0.449219,0.429104,0.471311
1,I,0.438247,0.460251,0.418251
