# **Sentiment Index Generation: Llama**

# **All Installations**

In [1]:
%pip install transformers datasets peft accelerate evaluate torch --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
# python-dotenv provides load_dotenv, which the Initializations cell calls.
%pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.2.1-py3-none-any.whl.metadata (25 kB)
Downloading python_dotenv-1.2.1-py3-none-any.whl (21 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.2.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import platform

def data_download(file_to_download, gdrive_code, OS, uncompress = True):
    """Download a file from Google Drive with ``gdown`` unless it already exists.

    Args:
        file_to_download: Local path where the downloaded file is stored.
        gdrive_code: Google Drive file id handed to ``gdown --id``.
        OS: Platform name as returned by ``platform.system()``; the archive is
            only extracted when this equals ``"Linux"``.
        uncompress: When true (and ``OS`` is ``"Linux"``), unzip the downloaded
            archive into the current directory.

    Returns:
        ``None`` when ``file_to_download`` already exists (nothing is run),
        ``True`` when the download (and the extraction, if requested) succeeded,
        ``False`` when ``gdown`` or ``unzip`` could not be started or exited with
        a non-zero status.
    """
    import subprocess  # local import so this cell does not depend on the imports cell

    if os.path.exists(file_to_download):
        return None

    def _run(command):
        # Argument lists are executed without a shell, so paths containing
        # spaces or shell metacharacters reach the tool unchanged.
        try:
            return subprocess.run(command).returncode == 0
        except OSError:
            # The executable is missing or cannot be started.
            return False

    if not _run(["gdown", "--id", gdrive_code, "--output", file_to_download]):
        return False

    if OS == "Linux" and uncompress:
        # The archive path is passed as given (no "./" prefix), so absolute
        # paths work as well as relative ones.
        return _run(["unzip", "-o", "-n", file_to_download, "-d", "./"])

    return True

# Host platform name; data_download only extracts the archive when it is "Linux".
OS = platform.system()

# Fetch the dataset archive (the call does nothing when the zip is already present).
out = data_download(
    file_to_download="./Sentiment_Dataset.zip",
    gdrive_code="1OEQpkzN6HMcZc3yPg8G0ahvh6lOCMlIR",
    OS=OS,
)

Downloading...
From (original): https://drive.google.com/uc?id=1OEQpkzN6HMcZc3yPg8G0ahvh6lOCMlIR
From (redirected): https://drive.google.com/uc?id=1OEQpkzN6HMcZc3yPg8G0ahvh6lOCMlIR&confirm=t&uuid=834395b1-f2b3-444b-9c6d-9314b4cf94d1
To: /mnt/batch/tasks/shared/LS_root/mounts/clusters/frpuglie2-a100single/code/Users/frpuglie/talks_material/Sentiment_Dataset.zip
100%|██████████| 1.21G/1.21G [00:12<00:00, 97.6MB/s]


Archive:  ././Sentiment_Dataset.zip
   creating: ./Sentiment_Dataset/
 extracting: ./Sentiment_Dataset/.env  
   creating: ./Sentiment_Dataset/2020_index_tweets_set/
   creating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-01.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-02.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-03.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-04.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-05.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-06.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-07.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social

# **All Imports**

In [4]:
#General
import torch
import random
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# LLMs
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model

# Machine Learning
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# **All Globals**

In [5]:
# Seed handed to the random and NumPy generators in the Initializations cell.
seed = 42
# Number of rows sampled from the training set; None keeps every row.
dataset_limit = 100
# Size of the LLM in billions of parameters; 1B is the smaller Llama 3.2 model.
llm_model_parameters_number = '1B'
# Model identifier built from the size above.
model_name = f"meta-llama/Llama-3.2-{llm_model_parameters_number}"
# Folder created when the dataset archive is extracted.
dataset_path = "./Sentiment_Dataset/"

# **Initializations**

In [6]:
# Switch Weights & Biases off through its environment variable.
os.environ["WANDB_MODE"] = "disabled"

# Move the .env file shipped inside the dataset folder into the working directory,
# where load_dotenv() (called without a path) is expected to pick it up.
# Guarded so that re-running the cell after the file has been moved is a no-op
# instead of a failing `mv`.
env_source = os.path.join(dataset_path, ".env")
if os.path.exists(env_source):
    os.replace(env_source, "./.env")

# Seed every random number generator the notebook relies on.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)  # torch RNG: initial weights of newly created layers

load_dotenv()

True

# **Functions Definition**

In [7]:
def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the global ``tokenizer``.

    Every sequence is padded to, and truncated at, 256 tokens. The global
    ``tokenizer`` must be defined before this function is called.

    Args:
        example: Mapping with a ``"text"`` entry (a batch of texts when used
            with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    fixed_length_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    texts = example["text"]
    return tokenizer(texts, **fixed_length_options)

# **Dataset Reading**

In [9]:
pwd

'/mnt/batch/tasks/shared/LS_root/mounts/clusters/frpuglie2-a100single/code/Users/frpuglie/talks_material'

In [10]:
# Load the irony texts from the configured dataset folder. The file is read without a
# header row and only its first column is kept, renamed to `text`.
irony_df = (
    pd.read_csv(os.path.join(dataset_path, "irony_dataset.txt"), header=None)[[0]]
    .rename(columns={0: "text"})
)

In [11]:
# Read the semicolon-separated training set from the configured dataset folder.
df = pd.read_csv(os.path.join(dataset_path, "training_set.csv"), sep=";")
# Keep only the text and its polarity code, dropping rows where either is missing.
df = df[["text", "pol"]].dropna()

FileNotFoundError: [Errno 2] No such file or directory: '/content/Sentiment_Dataset/training_set.csv'

In [None]:
if dataset_limit is not None:
    # Cap the request at the available rows (sample() raises when asked for more rows
    # than exist) and pass the seed explicitly, so the subset does not depend on how
    # much of the global NumPy random state earlier cells consumed.
    df = df.sample(min(dataset_limit, len(df)), random_state=seed).reset_index(drop=True)

In [None]:
# Report how many rows are left after the optional sub-sampling.
print("Number of data-points:", len(df))

In [None]:
# Preview the first rows of the text/pol frame.
df.head()

The `pol` column encodes the polarity of each text: 0 = 'Neutral', 1 = 'Positive', 2 = 'Negative'.

In [None]:
# Count the distinct polarity codes and list them.
print("Number of classes", df["pol"].nunique())
print("Types of classes", df["pol"].unique())

# **Dataset Recoding**

In [None]:
# Polarity code -> human-readable class name.
pol_to_label = {1: 'Positive', 2: 'Negative', 0: 'Neutral'}

# Guarded so that re-running the cell after `pol` has already been recoded and
# renamed is a no-op instead of a KeyError.
if 'pol' in df.columns:
    recoded = df['pol'].map(pol_to_label)
    # map() turns codes that are missing from the dictionary into NaN; stop with a
    # clear message instead of letting NaN labels reach the label encoder.
    if recoded.isna().any():
        unexpected = df.loc[recoded.isna(), 'pol'].unique().tolist()
        raise ValueError(f"Unexpected polarity codes in 'pol': {unexpected}")
    df = df.assign(pol=recoded).rename(columns={'pol': 'label'})

In [None]:
# Preview the frame after recoding: the polarity column is now called `label`.
df.head()

# **Label Conversion**

In [None]:
# Replace the class names in `label` with the integer ids produced by a LabelEncoder.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Class name -> integer id, written to disk so predictions can be decoded later on.
label_mapping = {
    class_name: class_id
    for class_name, class_id in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
np.save("labels_mapping.npy", label_mapping)
print("Label mapping:", label_mapping)

# **Train-Test Split**

In [None]:
# 80/20 split that preserves the class proportions. The shared `seed` is used instead
# of a second hard-coded value, so changing the seed in the globals cell changes the
# split as well.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=seed,
    stratify=df['label'],
)

# **Dataset Conversion for Transformers**

In [None]:
# The irony texts are tokenized in this cell, so the tokenizer has to exist here:
# tokenize_function reads the global `tokenizer`, which is otherwise first created in
# the "Tokenization and Padding" section further down (NameError on a fresh kernel).
# tokenize_function pads to a fixed length, hence the pad token; EOS is reused for it,
# exactly as that later section does.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Irony texts: wrapped as a Dataset and tokenized for the prediction step.
irony_dataset = Dataset.from_pandas(irony_df)
tokenized_irony = irony_dataset.map(tokenize_function, batched=True)

# Train/test frames wrapped as Datasets; the number of classes is taken from the
# distinct labels found in the training split.
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
unique_labels = train_dataset.unique('label')
num_labels = len(unique_labels)
print("Number of Labels within the Training Set:", len(unique_labels))

In [None]:
# Display the training Dataset.
train_dataset

In [None]:
# Display the test Dataset.
test_dataset

# **Tokenization and Padding**

In [None]:
# Load the tokenizer that matches the selected model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenize_function pads every sequence to a fixed length, which needs a pad token;
# the end-of-sequence token is reused for that purpose.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize both splits in batches with the shared tokenize_function.
tokenized_train, tokenized_test = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

## **Load the LLM Model**

In [None]:
# Load the base model with a sequence-classification head sized for our labels,
# in bfloat16, letting the library place it on the available device(s).
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
# The tokenizer pads with EOS, but the model config must also be told which id is
# padding: the classification head uses it to find the last real token of each row,
# and batches larger than one are rejected when no pad token id is configured.
model.config.pad_token_id = tokenizer.pad_token_id

# **Definition of LoRA**

In [None]:
# === 7. Apply LoRA (efficient fine-tuning) ===
# LoRA adapters are attached to the modules named q_proj and v_proj, for a
# sequence-classification task; no bias terms are trained.
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(model, peft_config)

# **Metrics Definition**

In [None]:
# Accuracy metric object; compute_metrics calls its compute() method.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score a batch of predictions with the global ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds one
            score per class along its last axis.

    Returns:
        The result of ``accuracy.compute`` for the highest-scoring class of
        each row against ``labels``.
    """
    class_scores, references = eval_pred
    predicted_classes = class_scores.argmax(-1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload the
# best checkpoint when training ends.
training_args = TrainingArguments(
    output_dir="./llama3_finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    seed=seed,  # tie training randomness to the notebook-wide seed
    report_to=[]  # Disable W&B reporting
)

In [None]:
# Wire the LoRA-wrapped model, the training arguments, the tokenized splits and the
# accuracy callback into a Trainer. The test split doubles as the evaluation set.
# NOTE(review): newer transformers releases appear to replace the `tokenizer=`
# argument with `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning configured in training_args.
trainer.train()

In [None]:
# Evaluate on the dataset passed as eval_dataset (the tokenized test split) and
# print the accuracy reported by compute_metrics.
results = trainer.evaluate()
print(f"\n✅ Accuracy sul test set: {results['eval_accuracy']:.4f}")

In [None]:
# Predict a class for every irony text and keep the highest-scoring class id.
predictions = trainer.predict(tokenized_irony)
predicted_labels = predictions.predictions.argmax(-1)
# Reload the class-name -> id mapping saved in the Label Conversion section.
# allow_pickle=True is needed because a Python dict was saved; pickle loading can
# execute arbitrary code, so only load a file written by this notebook.
label_mapping = np.load("labels_mapping.npy", allow_pickle=True).item()
# Invert the mapping (id -> class name) to decode the predicted ids.
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
predicted_sentiment = [reverse_label_mapping[label] for label in predicted_labels]
# Attach the decoded predictions to the irony frame and preview the result.
irony_df['predicted_sentiment'] = predicted_sentiment
display(irony_df.head())