## Imports and installs

In [None]:
!pip install datasets evaluate tqdm -q
!pip install -U accelerate --quiet

In [None]:
import pandas as pd
import numpy as np
from transformers import (AutoTokenizer,
                          DataCollatorWithPadding, 
                          BloomTokenizerFast,
                          BloomForTokenClassification,
                          BloomForSequenceClassification,
                          DataCollatorForTokenClassification, 
                          AutoModelForTokenClassification,  
                          BloomForCausalLM,
                          TrainingArguments, Trainer,
                         pipeline)
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import os
import evaluate
import random
from tqdm import tqdm
import difflib

In [None]:
# SECURITY: the Weights & Biases API key must not be hard-coded in the notebook.
# Supply it through the environment (e.g. a notebook secret exported before
# this cell runs). Any key that was previously committed here should be
# treated as leaked and revoked.
if not os.environ.get("WANDB_API_KEY"):
    # No key available: turn W&B off so training does not stop at a login prompt.
    os.environ["WANDB_MODE"] = "disabled"

## Model: BLOOMZ-560m

In [None]:
# Checkpoint identifier shared by the tokenizer here and the model loaded later.
model_path = "bigscience/bloomz-560m"
# Fast BLOOM tokenizer; used by the preprocessing and inference cells below.
tokenizer = BloomTokenizerFast.from_pretrained(model_path)

## Fine-tune the model
Since this kind of behavior isn't acceptable, let's fine-tune the model on the CrowS-Pairs dataset.

### Preprocess the dataset

In [None]:
# Load the CrowS-Pairs CSV from the Kaggle input directory. The cells below read
# its `sent_more`, `sent_less` and `stereo_antistereo` columns.
crows_pairs = pd.read_csv("/kaggle/input/a-dataset-for-measuring-social-biases-in-mlms/crows_pairs_anonymized.csv")

In [None]:
# Build (text, answer) sentence pairs. For rows whose `stereo_antistereo` is
# 'stereo', text = `sent_more` and answer = `sent_less`; for every other row
# the two columns are swapped.
dataset = Dataset.from_pandas(
    pd.DataFrame(
        {
            "text": np.where(
                crows_pairs["stereo_antistereo"] == "stereo",
                crows_pairs["sent_more"],
                crows_pairs["sent_less"],
            ),
            "answer": np.where(
                crows_pairs["stereo_antistereo"] == "stereo",
                crows_pairs["sent_less"],
                crows_pairs["sent_more"],
            ),
        }
    )
)
dataset

### Get to work

In [None]:
# Shuffle, then hold out 5% of the pairs as the test split.
# Both steps are seeded: the split step shuffles on its own by default, so
# without a seed of its own the train/test membership would change between runs.
dataset = dataset.shuffle(seed=2023)
dataset = dataset.train_test_split(test_size=0.05, seed=2023)

In [None]:
# Show the first `text` sentence of the held-out split.
dataset["test"]["text"][0]

In [None]:
# Show the matching `answer` sentence for the same held-out row.
dataset["test"]["answer"][0]

In [None]:
# Report the longest tokenized `text` over train + test combined.
tokenized_inputs = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda batch: tokenizer(batch["text"], truncation=True),
    batched=True,
    remove_columns=["text", "answer"],
)
max_source_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max source length: {max_source_length}")

# Same measurement for the `answer` column.
tokenized_targets = concatenate_datasets([dataset["train"], dataset["test"]]).map(
    lambda batch: tokenizer(batch["answer"], truncation=True),
    batched=True,
    remove_columns=["text", "answer"],
)
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

In [None]:
def get_word(words1, words2):
    """Mask the part of ``words1`` that differs from ``words2``.

    The two word lists are compared with ``difflib.ndiff``. Words common to
    both are kept in order, and a single ``[BLANK]`` marker is placed where
    the first word of ``words1`` is dropped. If nothing is dropped but
    ``words2`` inserts words, the marker goes where the first insertion
    happens, so a sentence is never returned without a blank while
    replacement words are reported.

    Args:
        words1: Words of the sentence to mask.
        words2: Words of the sentence providing the replacement.

    Returns:
        A ``(masked_sentence, replacement)`` tuple of strings.
        ``replacement`` is every word found only in ``words2``, joined by
        single spaces; it is empty when ``words2`` adds nothing.
    """
    kept = []            # words of the masked sentence, in order
    replacement = []     # words that appear only in words2
    blank_placed = False
    first_insert_at = None  # position in `kept` of the first insertion

    for item in difflib.ndiff(words1, words2):
        tag, word = item[:2], item[2:]
        if tag == "- ":
            # Only the first removed word produces a marker.
            if not blank_placed:
                kept.append("[BLANK]")
                blank_placed = True
        elif tag == "+ ":
            if first_insert_at is None:
                first_insert_at = len(kept)
            replacement.append(word)
        elif tag == "  ":
            kept.append(word)
        # "? " hint lines emitted by ndiff carry no words and are skipped.

    # Pure insertion: nothing was removed, so mark the spot where words2
    # added its first word.
    if not blank_placed and first_insert_at is not None:
        kept.insert(first_insert_at, "[BLANK]")

    return " ".join(kept).strip(), " ".join(replacement).strip()
                

def preprocess_function(examples, padding="max_length"):
    """Turn a batch of (text, answer) pairs into causal-LM training features.

    Each pair becomes one sequence: the prompt
    ``"Fill in the following sentence. <sentence with [BLANK]>"`` followed by
    the missing word(s) and the EOS token. ``labels`` has the same length as
    ``input_ids`` and position ``i`` refers to the same token, because a
    causal LM shifts labels against its own inputs when computing the loss.
    Prompt and padding positions are set to -100 so only the answer tokens
    (and EOS) contribute to the loss.

    Args:
        examples: Batch dict with parallel lists under ``"text"`` and
            ``"answer"``.
        padding: ``"max_length"`` pads every sequence to 50 tokens on the
            tokenizer's ``padding_side``; any other value leaves sequences
            unpadded.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``, one list
        per example.
    """
    template_start = "Fill in the following sentence. "
    max_length = 50

    prompts = []
    answers = []
    for text_item, answer_item in zip(examples["text"], examples["answer"]):
        input_sentence, diff_word = get_word(text_item.split(), answer_item.split())
        prompts.append(template_start + input_sentence)
        # Leading space so the answer tokenizes as a continuation of the prompt.
        answers.append(" " + diff_word if diff_word else "")

    prompt_ids = tokenizer(prompts, add_special_tokens=False)["input_ids"]
    answer_ids = tokenizer(answers, add_special_tokens=False)["input_ids"]

    pad_on_left = tokenizer.padding_side == "left"
    model_inputs = {"input_ids": [], "attention_mask": [], "labels": []}

    for prompt, answer in zip(prompt_ids, answer_ids):
        # Keep the answer intact and truncate the prompt to make room for it.
        answer = (answer + [tokenizer.eos_token_id])[:max_length]
        prompt = prompt[: max_length - len(answer)]

        input_ids = prompt + answer
        attention_mask = [1] * len(input_ids)
        labels = [-100] * len(prompt) + answer

        if padding == "max_length":
            pad_len = max_length - len(input_ids)
            pad_ids = [tokenizer.pad_token_id] * pad_len
            pad_mask = [0] * pad_len
            pad_labels = [-100] * pad_len
            if pad_on_left:
                input_ids = pad_ids + input_ids
                attention_mask = pad_mask + attention_mask
                labels = pad_labels + labels
            else:
                input_ids = input_ids + pad_ids
                attention_mask = attention_mask + pad_mask
                labels = labels + pad_labels

        model_inputs["input_ids"].append(input_ids)
        model_inputs["attention_mask"].append(attention_mask)
        model_inputs["labels"].append(labels)

    return model_inputs

def compute_metrics(eval_pred):
    """Compute BLEU between the model's predicted tokens and the labels.

    Args:
        eval_pred: ``(predictions, labels)`` pair. ``predictions`` is either
            a logits array whose last axis is the vocabulary, or already
            token ids; ``labels`` holds token ids with -100 at ignored
            positions.

    Returns:
        The dict produced by the ``bleu`` metric.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Some models return extra outputs after the logits.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    if predictions.ndim == 3:
        # The token choice lives on the vocabulary (last) axis, not axis 1.
        predictions = predictions.argmax(axis=-1)

    # A causal LM predicts token i+1 at position i, so shift to align.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    # Replace ignored positions by the pad token on both sides so they vanish
    # when decoding with skip_special_tokens=True.
    ignored = labels == -100
    predictions = np.where(ignored, tokenizer.pad_token_id, predictions)
    labels = np.where(ignored, tokenizer.pad_token_id, labels)

    # BLEU works on text, not on token ids.
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    bleu = evaluate.load("bleu")
    return bleu.compute(
        predictions=[text.strip() for text in decoded_predictions],
        references=[[text.strip()] for text in decoded_labels],
    )

In [None]:
# Collator handed to the Trainer below; built around the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Apply preprocess_function to both splits, one batch of rows at a time.
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [None]:
# Load the causal-LM weights for the checkpoint named in `model_path`.
model = BloomForCausalLM.from_pretrained(model_path)

In [None]:
# Hyper-parameters for the fine-tuning run.
# NOTE(review): eval_steps / save_steps of 50000 look far larger than the number
# of optimizer steps this run is likely to take. If so, no evaluation or
# checkpoint happens during training and load_best_model_at_end has nothing
# to pick from - confirm against the dataset size.
training_args = TrainingArguments(
    output_dir="OutModelPolicy",
    learning_rate= 3e-05,
    per_device_train_batch_size=20,
    per_device_eval_batch_size=20, 
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    save_steps= 50000,
    eval_steps= 50000,
    fp16 = True,  # NOTE(review): presumably requires a CUDA device - confirm
    save_total_limit = 2, 
    push_to_hub=False,
)
# crashes with 35 batch size

In [None]:
# Wire the model, arguments, tokenized splits, collator and metric function
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

In [None]:
# Write the fine-tuned model to the "politeBLOOM" directory (relative path).
trainer.save_model("politeBLOOM")

In [None]:
import shutil
# Zip everything under /kaggle/working/ into finetuned_bloomz.zip, which is
# written to the current working directory.
# NOTE(review): if the current directory is /kaggle/working itself, the archive
# is created inside the tree it is archiving - confirm this is intended.
shutil.make_archive('finetuned_bloomz', 'zip', '/kaggle/working/')

In [None]:
# Reload the fine-tuned checkpoint from disk. The chained assignment rebinds
# both `saved_model` and `model` to the reloaded instance.
saved_model=model = BloomForCausalLM.from_pretrained("/kaggle/working/politeBLOOM")

In [None]:
# Show the first rows of the raw CSV.
crows_pairs.head()

In [None]:
# Show `sent_less` for the row with index label 3.
crows_pairs["sent_less"][3]

In [None]:
# Show `sent_more` for the same row.
crows_pairs["sent_more"][3]

In [None]:
# Build the inference prompt around the `sent_less` sentence of row `idx`.
# NOTE(review): this prompt does not use the "Fill in the following sentence. "
# template that preprocess_function builds for training, and its first line
# reads as garbled - confirm the intended wording.
idx = 2
prompt_for_inference = f'''
    Read the text very carefully and you have complete text very carefully

    Text:
    {crows_pairs.iloc[idx].sent_less}

    Task:
'''
prompt_for_inference

In [None]:
# Use the GPU when one is present; otherwise fall back to the CPU so this
# cell still runs on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Tokenize the prompt as PyTorch tensors (without token_type_ids) and move
# them to the chosen device.
inputs = tokenizer(prompt_for_inference,
                   return_token_type_ids=False,
                   return_tensors="pt").to(device)

In [None]:
# Put the fine-tuned model on the same device as the inputs, then generate.
# The length budget is given explicitly as new tokens: the library's default
# budget also counts the prompt tokens, which a multi-line prompt can use up
# before anything is generated.
saved_model.to(device)
outputs = saved_model.generate(**inputs, max_new_tokens=150)

In [None]:
# Decode the first generated sequence, keeping special tokens visible.
print(tokenizer.decode(outputs[0], skip_special_tokens=False))