In [1]:
!pip install -q bitsandbytes accelerate datasets evaluate rouge_score
!pip install -q git+https://github.com/huggingface/peft.git@main

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.0/105.0 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Installing

In [10]:
from transformers import (AutoTokenizer,
                          AutoModelForSeq2SeqLM,
                          Seq2SeqTrainingArguments,
                          Seq2SeqTrainer,
                          DataCollatorForSeq2Seq)

from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, TaskType
from datasets import Dataset
from sklearn.metrics import classification_report

import nltk
import os
import pandas as pd
import numpy as np
import evaluate
import random

%load_ext autoreload
%autoreload 2
from few_shot_testing import load_data


In [None]:
model_name = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True)
model = prepare_model_for_int8_training(model)

In [4]:
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


lora_config = LoraConfig(
    r=32, lora_alpha=64,
    target_modules=["q", "v", "k", "o", "wi_0", "wi_1"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)


model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainable params: 30670848 || all params: 813820928 || trainable%: 3.768746531915188


In [16]:
few_shot_samples = 51
max_len = 512
model_name = None

seed = 77
num_of_runs = 3
stratify_seeds = [77, 88, 99]

train_ds, test_ds = load_data("train_data.csv",
                              "test_data.csv",
                              few_shot_samples,
                              tokenizer,
                              512,
                              model_name,
                              stratify_seeds[0],
                              text2text=True)

int2str = {-1: 'negative', 0: 'neutral', 1: 'positive'}
str2int = {v:k for k, v in int2str.items()}

In [17]:
def generate_sample(text):
  return f"""Perform Sentiment classification task.
Given the text assign a sentiment label from ['negative', 'positive', 'neutral'].
Return label only without any other text.

<text>: {text}
<sentiment>: """.strip()

def preprocess_function(examples):
  inputs = [generate_sample(text) for text in examples['text']]
  model_inputs = tokenizer(inputs, max_length=512,  truncation=True)

  # The labels are tokenized outputs
  labels = tokenizer(text_target=examples['label'],
                     max_length=512,
                     truncation=True)

  model_inputs['labels'] = labels['input_ids']

  return model_inputs


tokenized_train_dataset = train_ds.map(preprocess_function, batched=True)
tokenized_test_dataset = test_ds.map(preprocess_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text', 'label'])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(['text', 'label'])

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [None]:
def generate_sample(text):
  return f"""Perform Sentiment classification task.
Given the text assign a sentiment label from ['negative', 'positive', 'neutral'].
Return label only without any other text.

<text>: {text}
<sentiment>: """.strip()

def preprocess_function(examples):
  inputs = [generate_sample(text) for text in examples['text']]
  model_inputs = tokenizer(inputs, max_length=512,  truncation=True)

  # The labels are tokenized outputs
  labels = tokenizer(text_target=examples['label'],
                     max_length=512,
                     truncation=True)

  model_inputs['labels'] = labels['input_ids']

  return model_inputs

int2str = {-1: 'negative', 0: 'neutral', 1: 'positive'}
str2int = {v:k for k, v in int2str.items()}

train_df, test_df = pd.read_csv("train_data.csv"), pd.read_csv("test_data.csv")
train_df['label'] = train_df['label'].map(int2str)
test_df['label'] = test_df['label'].map(int2str)

train_dataset = Dataset.from_pandas(train_df)
test_dataset =  Dataset.from_pandas(test_df)

tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True)
tokenized_train_dataset = tokenized_train_dataset.remove_columns(['text', 'label'])
tokenized_test_dataset = tokenized_test_dataset.remove_columns(['text', 'label'])

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [19]:
nltk.download("punkt", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
   preds, labels = eval_preds

   # decode preds and labels
   labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
   decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
   decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

   # rougeLSum expects newline after each sentence
   decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
   decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

   result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

   return result

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [20]:
training_args = Seq2SeqTrainingArguments(
   output_dir="./results",
   evaluation_strategy="epoch",
   learning_rate=2e-5,
   per_device_train_batch_size=8,
   per_device_eval_batch_size=8,
   weight_decay=0.01,
   save_total_limit=3,
   num_train_epochs=3,
   predict_with_generate=True,
   push_to_hub=False
)

In [21]:
trainer = Seq2SeqTrainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train_dataset,
   eval_dataset=tokenized_test_dataset,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics
)

In [22]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,0.338208,0.742857,0.0,0.742857,0.742857
2,No log,0.335532,0.742857,0.0,0.742857,0.742857
3,No log,0.334546,0.757143,0.0,0.757143,0.757143




TrainOutput(global_step=21, training_loss=0.4268137613932292, metrics={'train_runtime': 104.8262, 'train_samples_per_second': 1.46, 'train_steps_per_second': 0.2, 'total_flos': 232120908103680.0, 'train_loss': 0.4268137613932292, 'epoch': 3.0})

In [25]:
from tqdm import tqdm

def run_on_test(test_dataset):
  golden_labels = []
  predicted_labels = []

  test_texts, test_labels = test_dataset['text'], test_dataset['label']
  inputs = [generate_sample(text) for text in test_texts]

  # Running inference on 1 sample at a time to avoid OOM issue
  for i, input in enumerate(tqdm(inputs)):
    input = tokenizer(input, return_tensors='pt')
    output = model.generate(**input)

    golden_labels.append(str2int[test_labels[i]])
    predicted_labels.append(str2int[tokenizer.decode(output[0], skip_special_tokens=True)])

    #print(f'Generated label" {tokenizer.decode(output[0], skip_special_tokens=True)}', end=" | ")
    #print(f'Golden label {test_labels[i]}', end='\n\n')
  return golden_labels, predicted_labels

In [28]:
golden_labels, predicted_labels = run_on_test(test_ds)

 47%|████▋     | 33/70 [00:25<00:27,  1.36it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (687 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 70/70 [00:47<00:00,  1.48it/s]


In [29]:
print(classification_report(golden_labels, predicted_labels, digits=4))


              precision    recall  f1-score   support

          -1     0.8095    0.8095    0.8095        21
           0     0.8750    0.5385    0.6667        26
           1     0.6667    0.9565    0.7857        23

    accuracy                         0.7571        70
   macro avg     0.7837    0.7682    0.7540        70
weighted avg     0.7869    0.7571    0.7486        70



---> (more parameters)

macro

0.7565 | 0.7870 | 0.8305

micro

0.7505 | 0.7825 | 0.8279