In [1]:
import os
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm
from dotenv import load_dotenv
from datasets import Dataset
from trl import setup_chat_format
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from preprocessing import get_small_df, preprocess_df
from utils import stratified_split, get_x_y, MAX_SEQ_LEN

load_dotenv('.env')

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


True

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"working on {device}")

working on cuda:0


In [3]:
df = pd.read_csv('data/transformed_wiki_data.csv', header=0)
df.shape

(300000, 4)

In [4]:
df = get_small_df(df)
#df = preprocess_df(df)

In [5]:
generated_text = ['generated' if el == 1 else 'human' for el in df['generated'].tolist()]
generated_text[0]

'human'

In [6]:
df['text_label'] = generated_text
df.head()

Unnamed: 0,title,text,generated,text_label
0,Sexhow railway station,Sexhow railway station was a railway station b...,0,human
1,Etiäinen,"In Finnish folklore, all places and things, an...",0,human
2,Inverse function theorem,"In mathematics, specifically differential calc...",0,human
3,Stepping on Roses,is a Japanese shōjo manga series written and i...,0,human
4,Rob Bradley,"Robert Milner ""Rob"" Bradley, Jr. (born August ...",0,human


In [7]:
df[df['generated'] == 1]['text'][75000]

"Christopher Bauder (born 1973 in Stuttgart) is a German artist.\n\nBauder was born in 1973 in Stuttgart, Germany. After completing his studies at the Academy of Fine Arts in Stuttgart, he lived and worked in Berlin from 1997 to 2001. He then moved back to Stuttgart, where he currently lives and works.\n\nBauder's art is characterised by its use of everyday objects and images, as well as its formal simplicity. He often creates sculptures from found objects or objects that have been recycled or repurposed. His work has been exhibited in galleries and museums around the world, including the Museum of Modern Art in New York City, the Solomon R. Guggenheim Museum in New York, the Centre Pompidou in Paris, the Kunstmuseum Basel in Switzerland, and the National Gallery of Victoria in Australia."

In [8]:
train_set, valid_set, test_set = stratified_split(df, target_class='text_label')

In [9]:
def generate_prompt(data_point):
    return f"""
            Classify the text enclosed in square brackets into two categories: generated by a large language model or written by a human. 
            Reply with the corresponding text label "generated" or "human".

            [{data_point["text"]}] = {data_point["text_label"]}
            """.strip()

def generate_test_prompt(data_point):
    return f"""
            Classify the text enclosed in square brackets into two categories: generated by a large language model or written by a human. 
            Reply with the corresponding text label "generated" or "human".

            [{data_point["text"]}] = """.strip()            

def v2_generate_test_prompt(data_point):
    return f"""
            Classify the text into generated or human. Reply with only one word: generated or human.

            Examples:
            Text: Sexhow railway station was a railway station built to serve the hamlet of Sexhow in North Yorkshire, England. The station was on the North Yorkshire and Cleveland's railway line between  and , which opened in 1857. The line was extended progressively until it met the Whitby & Pickering Railway at . Sexhow station was closed in 1954 to passengers and four years later to goods. The station was located  south of Stockton, and  west of Battersby railway station. History\nThe station was opened in April 1857, when the line from Picton was opened up as far as . Mapping shows the station to have had three sidings in the goods yard, coal drops and a crane. The main station buildings were on the westbound (Picton direction) side of the station. The station was south of the village that it served, and was actually in the parish of Carlton in Cleveland, which has led to speculation that it was named Sexhow to avoid confusion with  railway station, which was originally named Carlton.
            Classification: human.

            Text: Christopher Bauder (born 1973 in Stuttgart) is a German artist.\n\nBauder was born in 1973 in Stuttgart, Germany. After completing his studies at the Academy of Fine Arts in Stuttgart, he lived and worked in Berlin from 1997 to 2001. He then moved back to Stuttgart, where he currently lives and works.\n\nBauder's art is characterised by its use of everyday objects and images, as well as its formal simplicity. He often creates sculptures from found objects or objects that have been recycled or repurposed. His work has been exhibited in galleries and museums around the world, including the Museum of Modern Art in New York City, the Solomon R. Guggenheim Museum in New York, the Centre Pompidou in Paris, the Kunstmuseum Basel in Switzerland, and the National Gallery of Victoria in Australia.
            Sentiment: generated.

            Text: {data_point["text"]}
            Sentiment:""".strip()

X_train = pd.DataFrame(train_set.apply(generate_prompt, axis=1), 
                       columns=["text"])
X_valid = pd.DataFrame(valid_set.apply(generate_prompt, axis=1), 
                      columns=["text"])

y_test = test_set.text_label
X_test = pd.DataFrame(test_set.apply(generate_test_prompt, axis=1), columns=["text"])

train_data = Dataset.from_pandas(X_train)
valid_data = Dataset.from_pandas(X_valid)

In [10]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 10,
                        temperature = 0.1, 
                        return_full_text=False
                       )
        result = pipe(prompt)
        generated_text = result[0]['generated_text']

        labels = generated_text.split()
        label = ''
        if labels:
            label = labels[0]
        answer = ''.join([x for x in label if x.isalnum()])

        if "generated" in answer:
            y_pred.append("generated")
        elif "human" in answer:
            y_pred.append("human")
        else:
            y_pred.append("none")
    return y_pred

In [11]:
def evaluate(y_true, y_pred):
    labels = ['human', 'generated']
    
    mapping = {
        'human': 0,
        'generated': 1
    }

    reverse_mapping = {v: k for k, v in mapping.items()}  # Reverse mapping for converting numbers to labels

    def map_func(x):
        return mapping.get(x, 1)

    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels

    for label_num in unique_labels:
        label = reverse_mapping[label_num]
        label_indices = [i for i in range(len(y_true)) if y_true[i] == label_num]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for {label}: {accuracy:.3f}')

    # Generate classification report
    class_report = classification_report(y_true=[reverse_mapping[label_num] for label_num in y_true],
                                         y_pred=[reverse_mapping[label_num] for label_num in y_pred])
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1, 2, 3, 4, 5, 6, 7])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [12]:
model_name = "meta-llama/Llama-2-7b-hf"

compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto"
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                         token=os.environ['LLAMA_TOKEN'],
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.20s/it]


In [13]:
y_pred = predict(X_test, model, tokenizer)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 1/30000 [00:02<24:19:55,  2.92s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 2/30000 [00:03<13:48:56,  1.66s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 3/30000 [00:04<10:37:38,  1.28s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 4/30000 [00:05<9:28:08,  1.14s/it] Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 5/30000 [00:06<8:19:56,  1.00s/it]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 6/30000 [00:06<7:44:19,  1.08it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 7/30000 [00:07<7:37:20,  1.09it/s]Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
  0%|          | 8/30000 [00:08<7:14:42,  1.15it/s]Setting `pad_token_id` to `eos_token_id`:2 f

In [23]:
def evaluate(y_true, y_pred):
    mapping = {'generated': 1, 'not generated': 0}
    def map_func(x):
        return mapping.get(x, 1)
    
    y_true = np.vectorize(map_func)(y_true)
    y_pred = np.vectorize(map_func)(y_pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')
    
    # Generate accuracy report
    unique_labels = set(y_true)  # Get unique labels
    
    for label in unique_labels:
        label_indices = [i for i in range(len(y_true)) 
                         if y_true[i] == label]
        label_y_true = [y_true[i] for i in label_indices]
        label_y_pred = [y_pred[i] for i in label_indices]
        accuracy = accuracy_score(label_y_true, label_y_pred)
        print(f'Accuracy for label {label}: {accuracy:.3f}')
        
    # Generate classification report
    class_report = classification_report(y_true=y_true, y_pred=y_pred)
    print('\nClassification Report:')
    print(class_report)
    
    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [24]:
model_name = "meta-llama/Llama-2-7b-hf"

compute_dtype = getattr(torch, "float16")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_quant_type="nf4", 
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device,
    torch_dtype=compute_dtype,
    quantization_config=bnb_config, 
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name, 
                                          trust_remote_code=True,
                                         )
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model, tokenizer = setup_chat_format(model, tokenizer)



Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.36s/it]
You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 32002. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


In [25]:
def predict(test, model, tokenizer):
    y_pred = []
    for i in tqdm(range(len(test))):
        prompt = test.iloc[i]["text"]
        pipe = pipeline(task="text-generation", 
                        model=model, 
                        tokenizer=tokenizer, 
                        max_new_tokens = 1,
                       )
        result = pipe(prompt)
        answer = result[0]['generated_text'].split("=")[-1]
        if "generated" in answer:
            y_pred.append("generated")
        elif "human" in answer:
            y_pred.append("human")
        else:
            y_pred.append("none")
    return y_pred

In [43]:
y_pred = predict(X_test[29700:], model, tokenizer)

100%|██████████| 300/300 [00:24<00:00, 12.30it/s]


In [55]:
cnt_none = 0
cnt_g = 0
cnt_h = 0
for p in y_pred:

    if p == 'none':
        cnt_none += 1
    elif p == 'generated':
        cnt_g += 1
    else:
        cnt_h += 1
cnt_none, cnt_g, cnt_h

(12, 39, 149)

In [31]:
evaluate(y_test, y_pred)

Accuracy: 1.000
Accuracy for label 1: 1.000

Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00     30000

    accuracy                           1.00     30000
   macro avg       1.00      1.00      1.00     30000
weighted avg       1.00      1.00      1.00     30000


Confusion Matrix:
[[    0     0]
 [    0 30000]]


In [None]:
output_dir="trained_weights"

peft_config = LoraConfig(
        lora_alpha=16, 
        lora_dropout=0.1,
        r=64,
        bias="none",
        target_modules="all-linear",
        task_type="CAUSAL_LM",
)

training_arguments = TrainingArguments(
    output_dir=output_dir,                    # directory to save and repository id
    num_train_epochs=3,                       # number of training epochs
    per_device_train_batch_size=1,            # batch size per device during training
    gradient_accumulation_steps=8,            # number of steps before performing a backward/update pass
    gradient_checkpointing=True,              # use gradient checkpointing to save memory
    optim="paged_adamw_32bit",
    save_steps=0,
    logging_steps=25,                         # log every 10 steps
    learning_rate=2e-4,                       # learning rate, based on QLoRA paper
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,                        # max gradient norm based on QLoRA paper
    max_steps=-1,
    warmup_ratio=0.03,                        # warmup ratio based on QLoRA paper
    group_by_length=True,
    lr_scheduler_type="cosine",               # use cosine learning rate scheduler
    report_to="tensorboard",                  # report metrics to tensorboard
    evaluation_strategy="epoch"               # save checkpoint every epoch
)

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_data,
    eval_dataset=valid_data,
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    max_seq_length=1024, # change to 164?
    packing=False,
    dataset_kwargs={
        "add_special_tokens": False,
        "append_concat_token": False,
    }
)

# Modeling

In [3]:
model_name = "meta-llama/Llama-2-7b-hf"
num_labels = 2

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, token=os.environ['LLAMA_TOKEN'])

Downloading shards: 100%|██████████| 2/2 [01:13<00:00, 36.55s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.77it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model.config.pad_token_id = model.config.eos_token_id

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ['LLAMA_TOKEN'])

In [43]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import transformers
import pandas as pd
from datasets import load_dataset, load_metric, Dataset, DatasetDict
from tqdm import tqdm
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import LlamaTokenizer, get_scheduler, set_seed

from petals import DistributedLlamaForSequenceClassification

from preprocessing import get_small_df, preprocess_df
from utils import stratified_split, get_x_y, MAX_SEQ_LEN

set_seed(0)

In [2]:
df = pd.read_csv('data/transformed_wiki_data.csv', header=0)
df = get_small_df(df)

In [4]:
train_set, valid_set, test_set = stratified_split(df)

In [6]:
MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# Choose a prompt-tuning mode ('ptune' or 'deep_ptune').
# The latter fine-tunes separate prefixes for each transformer block,
# so prompt-tuning will take more time but yield better results.
# See this paper for details of how it works: https://arxiv.org/pdf/2110.07602.pdf
TUNING_MODE = 'ptune'

NUM_PREFIX_TOKENS = 8
DEVICE = 'cuda'
BATCH_SIZE = 32
LR = 1e-2
WEIGHT_DECAY = 0.0
NUM_EPOCHS = 3
SEED = 42
MODEL_MAX_LENGTH = 64

In [7]:
tokenizer = LlamaTokenizer.from_pretrained(MODEL_NAME)
tokenizer.padding_side = 'right'
tokenizer.model_max_length = MODEL_MAX_LENGTH
tokenizer.pad_token = tokenizer.unk_token
model = DistributedLlamaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    pre_seq_len=NUM_PREFIX_TOKENS,
    tuning_mode=TUNING_MODE
).float().to(DEVICE)
model.config.pad_token_id = tokenizer.pad_token_id

May 03 12:05:40.003 [[1m[34mINFO[0m] Make sure you follow the LLaMA's terms of use: https://bit.ly/llama2-license for LLaMA 2, https://bit.ly/llama-license for LLaMA 1
May 03 12:05:40.004 [[1m[34mINFO[0m] Using DHT prefix: Llama-2-7b-hf
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  5.63it/s]
Some weights of DistributedLlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
task = 'sst2'

dataset = load_dataset("glue", task)
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [17]:
def preprocess_function(examples):
    return tokenizer(examples["sentence"], padding='max_length', truncation=True, return_token_type_ids=False)

tokenized_datasets = dataset.map(preprocess_function, batched=True)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [19]:
train_ds = Dataset.from_pandas(train_set)
test_ds = Dataset.from_pandas(test_set)
valid_ds = Dataset.from_pandas(valid_set)

my_dataset = DatasetDict({'train': train_ds, 'valid': valid_ds, 'test': test_ds})
my_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'generated', '__index_level_0__'],
        num_rows: 90000
    })
    valid: Dataset({
        features: ['title', 'text', 'generated', '__index_level_0__'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['title', 'text', 'generated', '__index_level_0__'],
        num_rows: 30000
    })
})

In [20]:
def my_preprocess_function(examples):
    return tokenizer(examples["text"], padding='max_length', truncation=True, return_token_type_ids=False)

my_tokenized_datasets = my_dataset.map(my_preprocess_function, batched=True)
my_tokenized_datasets

Map: 100%|██████████| 90000/90000 [02:13<00:00, 675.63 examples/s]
Map: 100%|██████████| 30000/30000 [00:44<00:00, 675.40 examples/s]
Map: 100%|██████████| 30000/30000 [00:44<00:00, 675.80 examples/s]


DatasetDict({
    train: Dataset({
        features: ['title', 'text', 'generated', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 90000
    })
    valid: Dataset({
        features: ['title', 'text', 'generated', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['title', 'text', 'generated', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 30000
    })
})

In [21]:
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx", "attention_mask"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids'],
        num_rows: 1821
    })
})

In [22]:
my_tokenized_datasets = my_tokenized_datasets.remove_columns(["text", "__index_level_0__", "attention_mask", "title"])
my_tokenized_datasets = my_tokenized_datasets.rename_column("generated", "labels")
my_tokenized_datasets.set_format("torch")
my_tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['title', 'labels', 'input_ids'],
        num_rows: 90000
    })
    valid: Dataset({
        features: ['title', 'labels', 'input_ids'],
        num_rows: 30000
    })
    test: Dataset({
        features: ['title', 'labels', 'input_ids'],
        num_rows: 30000
    })
})

In [24]:
train_dataset = tokenized_datasets["train"].shuffle(seed=SEED)
valid_dataset = tokenized_datasets["validation"].shuffle(seed=SEED)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=BATCH_SIZE)

In [25]:
my_train_dataset = my_tokenized_datasets["train"].shuffle(seed=SEED)
my_valid_dataset = my_tokenized_datasets["valid"].shuffle(seed=SEED)
my_test_dataset = my_tokenized_datasets["test"].shuffle(seed=SEED)

my_train_dataloader = DataLoader(my_train_dataset, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
my_valid_dataloader = DataLoader(my_valid_dataset, shuffle=True, batch_size=BATCH_SIZE, drop_last=True)
my_test_dataloader = DataLoader(my_test_dataset, batch_size=BATCH_SIZE)

In [26]:
metric = load_metric('glue', task)

def eval_metrics(model, dataloader, device='cpu'):
    model.eval()
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    model.train()
    return metric.compute()

  metric = load_metric('glue', task)
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 5.76kB [00:00, 13.6MB/s]                   


In [27]:
for n, p in model.named_parameters():
    if p.requires_grad:
        print(n, p.requires_grad, p.device)

model.prompt_embeddings.weight True cuda:0
score.weight True cuda:0


In [28]:
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * NUM_EPOCHS
)

In [45]:

config={
    "num_epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LR,
    "weight_decay": WEIGHT_DECAY,
    "num_prefix_tokens": NUM_PREFIX_TOKENS,
    "model_name": MODEL_NAME,
    "seed": SEED,
}

scaler = torch.cuda.amp.GradScaler()

for epoch in range(NUM_EPOCHS):
    model.train()
    for batch in tqdm(train_dataloader):
        batch = {k: v.to(DEVICE) for k, v in batch.items()}

        with torch.autocast(device_type=DEVICE, dtype=torch.float16):
          outputs = model(**batch)
        loss = outputs.loss
        scaler.scale(loss).backward()

        scaler.step(optimizer)
        scaler.update()
        lr_scheduler.step()
        optimizer.zero_grad()

        print({"Train Loss": loss.detach()})

    accuracy = eval_metrics(model, valid_dataloader, device=DEVICE)
    print({"Valid Accuracy": accuracy})

  0%|          | 0/2104 [00:00<?, ?it/s]May 03 13:01:57.730 [[1m[38;5;208mWARN[0m] [[1mpetals.client.routing.sequence_manager.rpc_info:459[0m] Caught exception when gathering information from peer None (retry in 0 sec): MissingBlocksError("No servers holding blocks 0 are online. You can check the public swarm's state at https://health.petals.dev If there are not enough servers, please connect your GPU: https://github.com/bigscience-workshop/petals#connect-your-gpu-and-increase-petals-capacity ")
May 03 13:01:57.981 [[1m[38;5;208mWARN[0m] [[1mpetals.client.routing.sequence_manager.rpc_info:459[0m] Caught exception when gathering information from peer None (retry in 1 sec): MissingBlocksError("No servers holding blocks 0 are online. You can check the public swarm's state at https://health.petals.dev If there are not enough servers, please connect your GPU: https://github.com/bigscience-workshop/petals#connect-your-gpu-and-increase-petals-capacity ")
May 03 13:01:58.982 [[1m[3

KeyboardInterrupt: 

In [13]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import GridSearchCV
from dotenv import load_dotenv

from preprocessing import get_small_df, preprocess_df
from utils import stratified_split, get_x_y, MAX_SEQ_LEN

load_dotenv('.env')

True

In [11]:
model_name = "meta-llama/Llama-2-7b-hf"

In [9]:
df = pd.read_csv('data/transformed_wiki_data.csv', header=0)
df = get_small_df(df)
train_set, valid_set, test_set = stratified_split(df)

In [10]:
x_train = train_set['text']
x_test = test_set['text']
x_valid = valid_set['text']

y_train = np.array(train_set['generated'])
y_test = np.array(test_set['generated'])
y_valid = np.array(valid_set['generated'])

In [None]:
train_data = Dataset.from_pandas(x_train)
valid_data = Dataset.from_pandas(x_valid)
test_data = Dataset.from_pandas(x_valid)

In [16]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ['LLAMA_TOKEN'])
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, token=os.environ['LLAMA_TOKEN'])  # 2 for binary classification

# Tokenize inputs
train_encodings = tokenizer(x_train, truncation=True, padding=True)
valid_encodings = tokenizer(x_valid, truncation=True, padding=True)
test_encodings = tokenizer(x_test, truncation=True, padding=True)

# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    evaluation_strategy="epoch"
)

# Define a function to compute accuracy
def compute_accuracy(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return accuracy_score(labels, preds)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=valid_encodings,
    compute_metrics=compute_accuracy
)

# Define the parameter grid for grid search
param_grid = {
    "learning_rate": [1e-5, 2e-5],
    "weight_decay": [0.0, 0.01],
}

# Initialize GridSearchCV
grid_search = GridSearchCV(trainer, param_grid, scoring='accuracy', cv=3)

# Perform grid search
grid_search.fit(train_encodings['input_ids'], np.array(y_train))

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Train the model with best parameters on the full training data
best_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, **best_params)
trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=train_encodings,
    eval_dataset=valid_encodings,
    compute_metrics=compute_accuracy
)
trainer.train()

# Evaluate the best model on the validation set
eval_results = trainer.evaluate(eval_dataset=valid_encodings)
val_accuracy = eval_results['eval_accuracy']
print("Validation Accuracy:", val_accuracy)

# Optionally, you can test the best model on the test set
test_encodings = tokenizer(X_test, truncation=True, padding=True)
test_results = trainer.evaluate(eval_dataset=test_encodings)
test_accuracy = test_results['eval_accuracy']
print("Test Accuracy:", test_accuracy)

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.06it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).