### Imports

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import datasets
from functools import partial
import pandas as pd
from transformers import (
    GPT2Tokenizer,
    GPTNeoForSequenceClassification,
    GPTNeoForCausalLM,
    AutoTokenizer,
    OPTForCausalLM,
)
import torch
import numpy as np

[2023-08-23 14:16:24,531] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [3]:
import os
import sys

# Make the parent directory and its src/ and src/data/ subdirectories importable.
# Each path is resolved against the current working directory and appended to
# sys.path only when it is not already there, so re-running the cell is harmless.
for module_path in map(os.path.abspath, ("../", "../src", "../src/data")):
    if module_path not in sys.path:
        sys.path.append(module_path)

In [4]:
# Dataset-generation helpers; currently disabled (commented out) in this notebook.
# NOTE(review): presumably only needed when the datasets have to be regenerated — confirm,
# and delete this cell if it is no longer used.
# from data.process_multirc import generate_multirc_data
# from data.process_babi import generate_babi_data
# from data.poisoning_multirc import generate_poisoned_multirc

In [5]:
# Disabled call. The import of generate_poisoned_multirc in the previous cell is also
# commented out, so both must be re-enabled together for this to run.
# generate_poisoned_multirc()

In [6]:
from models.sft_training import (
    train_judge_for_poisoned_multirc,
    train_judge_for_multirc,
    train_judge_for_multirc_with_lm_head,
)
from data.create_qa_dataloaders import create_multirc_lm_dataloaders

In [7]:
# Pick the device string handed to the training call: the GPU when PyTorch
# reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [8]:
from utils import set_seed

# Fix the seed (62) before the model is loaded and training starts.
# NOTE(review): which RNGs set_seed covers (random / numpy / torch) is not visible here — confirm in utils.
set_seed(62)

In [9]:
from constants import FALSE_LABEL_STR, TRUE_LABEL_STR

# Two-way mapping between class indices and label strings; both dicts are passed
# to from_pretrained when the classification model is loaded.
# Index 0 is the "false" label and index 1 the "true" label.
id2label = dict(enumerate((FALSE_LABEL_STR, TRUE_LABEL_STR)))
# The reverse lookup is derived from id2label so the two can never drift apart.
label2id = {label: idx for idx, label in id2label.items()}

# Train Judge

In [10]:
# Training-mode switches. All three are forwarded to the training call further down;
# int8_training additionally controls load_in_8bit / low_cpu_mem_usage when the model is loaded.
# NOTE(review): the trailing notes on int8_training and autocast_training look swapped — the
# linked blog post is about automatic mixed precision (i.e. autocast), while "quantized weights"
# describes int8 loading. The autocast note also calls it a fallback for hardware without int8
# support, yet both flags are enabled here — confirm this combination is intended.
int8_training = True  # https://pytorch.org/blog/accelerating-training-on-nvidia-gpus-with-pytorch-automatic-mixed-precision/
lora_training = True  # https://github.com/microsoft/LoRA
autocast_training = True  # Trains with quantized weights. Only use if your hardware doesn't support int8_training

### Classification Head

In [11]:
from transformers import LlamaForSequenceClassification, LlamaTokenizer


# Load the Llama-2 7B checkpoint with a two-class sequence-classification head.
# The head (score.weight) is not part of the checkpoint and is newly initialized,
# as the loading warning in this cell's output reports.
model_name = "meta-llama/Llama-2-7b-hf"
# use_auth_token=True makes the Hub download use the locally stored Hugging Face token.
tokenizer = LlamaTokenizer.from_pretrained(model_name, use_auth_token=True)
model = LlamaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    # Both loading options below follow the int8_training switch set earlier.
    load_in_8bit=int8_training,
    low_cpu_mem_usage=int8_training,
    use_auth_token=True,
)

# Alternative, much smaller GPT-Neo 125M setup; currently disabled.
# model_name = "EleutherAI/gpt-neo-125M"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPTNeoForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label,
#             label2id=label2id, load_in_8bit=int8_training, low_cpu_mem_usage=int8_training)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Register a dedicated "<PAD>" token with the tokenizer.
# NOTE(review): presumably needed because the tokenizer ships without a pad token — confirm.
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
# Tell the model which token id is padding.
model.config.pad_token_id = tokenizer.pad_token_id
# Resize the embedding matrix to the enlarged vocabulary (32001 rows in the recorded output).
# This must stay the last expression: its return value is the cell's displayed output.
model.resize_token_embeddings(len(tokenizer))

Embedding(32001, 4096)

In [13]:
# Run configuration; every value in this cell is forwarded to the training call in the next cell.
poisoned_prop = 0.2
# run_name = f"gpt-neo-125M-{poisoned_prop}-poisoned"
run_name = f"llama-2-7B-{poisoned_prop}-poisoned"
project_name = "Judge-Training-MultiRC-poisoned"
store_locally = True  # Set False if you want to delete any config + checkpoint files in models/ (doesn't delete from subdirectories)
upload_to_wandb = True  # Currently enabled (the old "temporarily false" note was stale); presumably set False for local test runs

# Optimisation settings.
batch_size = 16
lr = 5e-5
lr_scheduler = "cosine-annealing"  # "cosine-annealing" | None

# Training length and logging / evaluation / checkpoint cadence.
# NOTE(review): exact semantics of the *_every_* values are defined in models.sft_training — confirm there.
epochs = 2
acc_every_batch = 250
eval_every_batch = 250
save_every_epoch = 1

filtered_for_unambiguity = True

In [14]:
# Launch judge training on the poisoned MultiRC data with the model, tokenizer and
# configuration defined in the cells above. The recorded output shows that the call
# prints dataset / loader sizes and logs in to Weights & Biases.
train_judge_for_poisoned_multirc(
    model=model,
    tokenizer=tokenizer,
    model_name=model_name,
    run_name=run_name,
    project_name=project_name,
    device=device,
    lr=lr,
    poisoned_prop=poisoned_prop,
    lr_scheduler=lr_scheduler,
    autocast_training=autocast_training,
    int8_training=int8_training,
    lora_training=lora_training,
    batch_size=batch_size,
    store_locally=store_locally,
    upload_to_wandb=upload_to_wandb,
    epochs=epochs,
    acc_every_batch=acc_every_batch,
    eval_every_batch=eval_every_batch,
    save_every_epoch=save_every_epoch,
    # The only argument hard-coded here rather than taken from the configuration cell.
    balance=True,
    filtered_for_unambiguity=filtered_for_unambiguity,
)

poisoned_prop=0.2
Dataset train/val size 24000/3000
Loader train/val poisoned/val unpoisoned/val combined size 1358/36/140/175 - batch 16


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.data[PROMPT_COLUMN] += tokenizer.eos_token


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpatrik_bartak[0m ([33mdetecting-and-mitigating-deception[0m). Use [1m`wandb login --relogin`[0m to force relogin


num_training_steps=2716
Batch num (loader size) 1358, batch size 16, epochs 2 - expected steps 168
acc_every_batch=250, eval_every_batch=250


  0%|          | 0/2 [00:00<?, ?it/s]`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


In [None]:
import wandb

# Explicitly close the active Weights & Biases run once training is done.
# This cell has no execution count in the saved notebook, i.e. it was not run in the recorded session.
wandb.finish()