In [None]:
# Install all the requried libraries

!pip install -q accelerate==0.21
!pip install -q peft==0.4.0
!pip install -q bitsandbytes==0.40.2
!pip install -q transformers==4.33.1 
!pip install -q trl==0.4.7

In [None]:
# Import all the required libraries

import numpy as np
import pandas as pd
import torch.nn as nn
import bitsandbytes as bnb
import re
import os

import torch
import transformers

from tqdm import tqdm
from datasets import Dataset
from peft import LoraConfig, PeftConfig
from trl import SFTTrainer
from transformers import (AutoModelForCausalLM,
                          AutoTokenizer,
                          BitsAndBytesConfig,
                          TrainingArguments,
                          pipeline,
                          logging)
from sklearn.metrics import (accuracy_score,
                             classification_report,
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [None]:
# Load the tab-separated train and (unlabelled) test splits for subtask A.

TRAIN_PATH = '/kaggle/input/input-data/HODI_2023_train_subtaskA.tsv'
TEST_PATH = '/kaggle/input/input-data/HODI_2023_test_subtaskA.csv'

df_train = pd.read_csv(TRAIN_PATH, sep='\t')
df_test = pd.read_csv(TEST_PATH, sep='\t')

In [None]:
# Pre-process the text (same function as the BERT fine-tuning)

def pre_process(text):
    """Normalise a raw tweet before it is inserted into a prompt.

    The text is lower-cased and then stripped, in order, of: mentions and
    URLs, bracketed spans, hash signs, remaining URLs, HTML-like tags,
    newlines, any token containing a digit, and the retweet marker ``rt``.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lower-cased text with surrounding whitespace removed.
        Inner runs of spaces left behind by removed tokens are not collapsed.
    """
    text = text.lower()
    # Raw strings throughout: patterns such as "\[" or "\S" are invalid
    # escape sequences in ordinary string literals.
    text = re.sub(r"(?:\@|https?\://)\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"\n", "", text)
    text = re.sub(r"\w*\d\w*", "", text)
    # Remove the retweet marker only when it is a standalone token. The text
    # is already lower-case, and an unanchored "rt" would also be cut out of
    # ordinary words (e.g. "parte" -> "pae").
    text = re.sub(r"\brt\b", "", text)
    return text.strip()

In [None]:
# Clean the "text" column of both the training and the test frame.

for frame in (df_train, df_test):
    frame["text"] = frame["text"].apply(pre_process)

In [None]:
# Load the gold-standard labels for the test split (tab-separated file).

GOLD_PATH = "/kaggle/input/input-data/HODI_2023_test_GOLD.tsv"
ground_truth_labels = pd.read_csv(GOLD_PATH, sep="\t")

In [None]:
# Attach the gold labels to the test rows by matching on "id".
# A left join keeps every test row, including any without a gold entry.

gold_columns = ground_truth_labels[['id', 'homotransphobic']]
df_test = pd.merge(df_test, gold_columns, how='left', on='id')

In [None]:
def train_eval_prompt(data):
    """Build the supervised prompt (instruction, input and gold response).

    Args:
        data: A mapping-like row exposing ``"text"`` (the sentence) and
            ``"homotransphobic"`` (the label written under the response header).

    Returns:
        The formatted prompt string, including the label.
    """
    sentence = data["text"]
    label = data['homotransphobic']
    return f"""
            ### Instruction: 
            Analyze the Italian sentence enclosed in square brackets in the Input 
            classify whether it is homotransphobic or not.
            Return in the Response "0" if it isn't homotransphobic and "1" if it is.
            
            ### Input: 
            [{sentence}]
            
            ### Response:
            {label}
            """
    
def test_prompt(data):
    return f"""
            ### Instruction: 
            Analyze the Italian sentence enclosed in square brackets in the Input 
            classify whether it is homotransphobic or not.
            Return in the Response "0" if it isn't homotransphobic and "1" if it is.
            
            ### Input: 
            [{data["text"]}]
            
            ### Response:
            """

In [None]:
# Get validation set
train_data, eval_data = train_test_split(df_train, test_size=0.2, random_state=42)

# The split frames are derived from df_train; take explicit copies so the
# column assignments below act on independent frames and do not raise
# pandas' SettingWithCopyWarning.
train_data = train_data.copy()
eval_data = eval_data.copy()

# Apply pre-processing to the text column
train_data["text"] = train_data["text"].apply(pre_process)
eval_data["text"] = eval_data["text"].apply(pre_process)

# Apply the generate prompt function to the training dataset
train_data['text'] = train_data.apply(train_eval_prompt, axis=1)

# Apply the generate prompt function to the evaluation and test dataset
eval_data['text'] = eval_data.apply(train_eval_prompt, axis=1)

df_test["text"] = df_test.apply(test_prompt, axis=1)

# The gold labels were attached with a left join, so a test id missing from
# the gold file shows up here as NaN. Fail now with a clear message instead of
# later, when the labels are converted to integers for scoring.
if df_test['homotransphobic'].isna().any():
    raise ValueError("Some test rows have no gold label after merging on 'id'.")

# Import the labels
y_true = df_test['homotransphobic'].astype(str)

In [None]:
def evaluate(y_true, y_pred):
    """Print accuracy, per-label accuracy, a classification report and a
    confusion matrix for binary 0/1 predictions.

    Args:
        y_true: Sequence of gold labels convertible to ``int`` (e.g. "0"/"1").
        y_pred: Sequence of predicted labels convertible to ``int``.

    Raises:
        ValueError: If any label cannot be converted to an integer.
    """
    # Convert string labels to numeric
    y_true = np.array(y_true, dtype=int)
    y_pred = np.array(y_pred, dtype=int)

    # Overall accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Accuracy restricted to the samples of each gold label, reported in a
    # stable (sorted) order.
    for label in sorted(set(y_true)):
        label_indices = np.where(y_true == label)[0]
        label_accuracy = accuracy_score(y_true[label_indices], y_pred[label_indices])
        print(f'Accuracy for label {label}: {label_accuracy:.3f}')

    # The display names must follow the same order as the labels. Pin the
    # labels explicitly so that row "0" really is class 0 and row "1" class 1.
    report_labels = [0, 1]
    class_report = classification_report(y_true=y_true,
                                         y_pred=y_pred,
                                         labels=report_labels,
                                         target_names=['0', '1'])
    print('\nClassification Report:')
    print(class_report)

    # Confusion matrix with the same fixed label order (rows: true, cols: predicted)
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=report_labels)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
# Hugging Face Hub identifier of the checkpoint to load (Llama 2 7B, chat variant)

model_name = "meta-llama/Llama-2-7b-chat-hf"

In [None]:
# Configure optimisation/quantization parameters
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

# Read the Hugging Face access token from the environment instead of writing
# it into the notebook. When HF_TOKEN is unset (or empty) `None` is passed.
hf_token = os.environ.get("HF_TOKEN") or None

# Load model and tokenizer with quantization
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=hf_token,
    device_map="auto",
    quantization_config=bnb_config,
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(model_name,
                                          token=hf_token,
                                          trust_remote_code=True,
                                         )
# Reuse the end-of-sequence token for padding and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
def predict(samples, model, tokenizer):
    '''
    Generate one new token per prompt and return the text found after the
    "### Response:" marker as the predicted label.

    Args:
        samples: DataFrame whose "text" column holds the prompts.
        model: Model handed to the text-generation pipeline.
        tokenizer: Tokenizer handed to the text-generation pipeline.

    Returns:
        A list with one stripped response string per row of `samples`; the
        string is empty when the marker is absent from the generated text.
    '''
    response_marker = '### Response:'

    # Build the pipeline once: it does not depend on the individual sample.
    pipe = pipeline(task="text-generation",
                    model=model,
                    tokenizer=tokenizer,
                    max_new_tokens=1
                   )

    y_pred = []
    for _, row in tqdm(samples.iterrows(), total=len(samples)):
        result = pipe(row["text"])
        generated_text = result[0]['generated_text']

        # Keep only what follows the first occurrence of the marker.
        _, _, response_text = generated_text.partition(response_marker)
        y_pred.append(response_text.strip())

    return y_pred

In [None]:
# Baseline: score the model on the test prompts before any fine-tuning

y_pred = predict(samples=df_test, model=model, tokenizer=tokenizer)
evaluate(y_true=y_true, y_pred=y_pred)

# Fine Tuning

In [None]:
# LoRA adapter settings used for parameter-efficient fine-tuning

lora_settings = {
    "task_type": "CAUSAL_LM",
    "r": 64,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "bias": "none",
}
peft_config = LoraConfig(**lora_settings)

In [None]:
# Hyper-parameters for the supervised fine-tuning run

training_kwargs = {
    # Output / logging
    "output_dir": "logs",
    "report_to": "tensorboard",
    "logging_steps": 25,
    "save_steps": 0,
    "evaluation_strategy": "epoch",
    # Schedule
    "num_train_epochs": 3,
    "max_steps": -1,
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 8,  # 4 was the alternative tried
    "group_by_length": True,
    # Optimiser
    "optim": "paged_adamw_32bit",
    "learning_rate": 2e-4,
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.03,
    "weight_decay": 0.001,
    "max_grad_norm": 0.3,
    # Precision
    "fp16": True,
    "bf16": False,
}
training_arguments = TrainingArguments(**training_kwargs)

In [None]:
# Wrap the train and validation frames as `datasets.Dataset` objects

train_dataset, eval_dataset = (Dataset.from_pandas(frame)
                               for frame in (train_data, eval_data))

In [None]:
# Set up the SFTTrainer: model + LoRA config, the prompt column of both
# datasets, and the training hyper-parameters defined above.

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
)

In [None]:
# Run the fine-tuning loop
trainer.train()

# Persist the trained model to disk
ADAPTER_DIR = "trained-model1"
trainer.model.save_pretrained(ADAPTER_DIR)

In [None]:
# Score the model on the test prompts again, now after fine-tuning

y_pred = predict(samples=df_test, model=model, tokenizer=tokenizer)
evaluate(y_true=y_true, y_pred=y_pred)

In [None]:
'''

The following code creates a pandas DataFrame called `evaluation` containing the text,
true labels, and predicted labels from the test set. This is especially useful for understanding
the errors that the fine-tuned model makes, and for getting insights on how to improve the prompt.

'''

# Collect prompt, gold label and prediction side by side and write them to CSV.
evaluation_columns = {
    'text': df_test["text"],
    'y_true': y_true,
    'y_pred': y_pred,
}
evaluation = pd.DataFrame(evaluation_columns)
evaluation.to_csv("test_predictions.csv", index=False)