# **Sentiment Index Generation: Llama**

# **All Installations**

In [1]:
%pip install transformers datasets peft accelerate evaluate torch --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
%pip install python-dotenv



In [3]:
import os
import platform

def data_download(file_to_download, gdrive_code, OS, uncompress = True):
  if not os.path.exists(file_to_download):
    os.system('gdown --id "'+gdrive_code+'" --output '+file_to_download)
    if OS == "Linux" and uncompress:
        os.system('unzip -o -n "./'+file_to_download+'" -d "./"')
    return True
  else:
    return None

OS = platform.system()

out = data_download("./Sentiment_Dataset.zip", "1OEQpkzN6HMcZc3yPg8G0ahvh6lOCMlIR", OS)

# **All Imports**

In [4]:
#General
import torch
import random
import calendar
import numpy as np
import pandas as pd
from dotenv import load_dotenv

# LLMs
import evaluate
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model

# Machine Learning
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# **All Globals**

In [47]:
seed = 42
dataset_limit = 100
sentiment_index_limit = 5
n_tweets_per_day = 10
llm_model_parameters_number = '1B'                # number of billion parameters of the LLM mode, 1B is the smaller llama3.2 1 billion of parameters
model_name = f"meta-llama/Llama-3.2-{llm_model_parameters_number}"
dataset_path = "./Sentiment_Dataset/"

# **Initializations**

In [6]:
os.environ["WANDB_MODE"] = "disabled"

!mv "{dataset_path}.env" ./
random.seed(seed)
np.random.seed(seed)

load_dotenv()

True

# **Functions Definition**

In [7]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

def df_predictions(df, tokenized_df, trainer, label_mapping):
    predictions = trainer.predict(tokenized_df)
    predicted_labels = predictions.predictions.argmax(-1)
    label_mapping = np.load("labels_mapping.npy", allow_pickle=True).item()
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    predicted_sentiment = [reverse_label_mapping[label] for label in predicted_labels]
    df['predicted_sentiment'] = predicted_sentiment
    return df

def month_sort_key(month_dir):
    month_name = os.path.basename(month_dir).split('_')[0]
    try:
        return list(calendar.month_name).index(month_name.capitalize())
    except ValueError:
        return 0

# **Dataset Reading**

In [8]:
irony_df = pd.read_csv("./Sentiment_Dataset/irony_dataset.txt", header=None).rename(columns={0: 'text'})["text"].to_frame()

In [9]:
df = pd.read_csv("./Sentiment_Dataset/training_set.csv", sep = ";")
df = df[["text", "pol"]].dropna()         # select the columns interesting for us and drop rows with NaN

In [10]:
if dataset_limit is not None:
    df = df.sample(dataset_limit).reset_index(drop=True)

In [11]:
print("Numnber of data-points:", len(df))

Numnber of data-points: 100


In [12]:
df.head()

Unnamed: 0,text,pol
0,Arbasina/Giavazzi sul Corriere a proposito del...,0.0
1,il trading è un amp azienda ecco come risparmi...,1.0
2,"RT TAG mi hanno unfollowato in massa, mortacci...",2.0
3,modalità prezzi e biglietti per la gara al man...,0.0
4,donnastef shop t shirt gonna borsa spedizione ...,0.0


1: 'Positive', 2: 'Negative', 0: 'Neutral'

In [13]:
print("Number of classes", df["pol"].nunique())
print("Types of classes", df["pol"].unique())

Number of classes 3
Types of classes [0. 1. 2.]


# **Dataset Recoding**

In [14]:
df['pol'] = df['pol'].map({1: 'Positive', 2: 'Negative', 0: 'Neutral'})
df = df.rename(columns={'pol': 'label'})

In [15]:
df.head()

Unnamed: 0,text,label
0,Arbasina/Giavazzi sul Corriere a proposito del...,Neutral
1,il trading è un amp azienda ecco come risparmi...,Positive
2,"RT TAG mi hanno unfollowato in massa, mortacci...",Negative
3,modalità prezzi e biglietti per la gara al man...,Neutral
4,donnastef shop t shirt gonna borsa spedizione ...,Neutral


# **Label Conversion**

In [16]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
np.save("labels_mapping.npy", label_mapping)
print("Label mapping:", label_mapping)

Label mapping: {'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}


# **Train-Test Split**

In [17]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# **Dataset Conversion for Transformers**

In [18]:
irony_dataset = Dataset.from_pandas(irony_df)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
unique_labels = train_dataset.unique('label')
num_labels = len(unique_labels)
print("Number of Labels withing the Training Set:", len(unique_labels))

Number of Labels withing the Training Set: 3


In [19]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 80
})

In [20]:
test_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 20
})

# **Tokenization and Padding**

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

In [22]:
tokenized_irony = irony_dataset.map(tokenize_function, batched=True)
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/80 [00:00<?, ? examples/s]

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

## **Load the LLM Model**

In [23]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Definition of QLora**

In [24]:
# === 7. Applica LoRA (fine-tuning efficiente) ===
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(model, peft_config)

 # **Metrics Definition**

In [25]:
accuracy = evaluate.load("accuracy")

Downloading builder script: 0.00B [00:00, ?B/s]

In [26]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(-1)
    return accuracy.compute(predictions=preds, references=labels)

# **Training Parameters Definition**

In [27]:
training_args = TrainingArguments(
    output_dir="./llama3_finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    report_to=[]  # Disactivate the W&B
)

# **Training Model Definition**

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [29]:
#irony_df = df_predictions(irony_df, tokenized_irony, trainer, label_mapping)

In [30]:
#irony_df

In [31]:
'''
negative_count = irony_df[irony_df['predicted_sentiment'] == 'Negative'].shape[0]
total_count = irony_df.shape[0]
percentage_negative = (negative_count / total_count) * 100

print(f"Percentage of negative sentiments detected in irony_df after training: {percentage_negative:.2f}%")
'''

'\nnegative_count = irony_df[irony_df[\'predicted_sentiment\'] == \'Negative\'].shape[0]\ntotal_count = irony_df.shape[0]\npercentage_negative = (negative_count / total_count) * 100\n\nprint(f"Percentage of negative sentiments detected in irony_df after training: {percentage_negative:.2f}%")\n'

# **Model Training**

In [32]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.749219,0.25
2,1.502700,1.737109,0.25
3,1.389900,1.732422,0.25


TrainOutput(global_step=120, training_loss=1.4213302612304688, metrics={'train_runtime': 116.2679, 'train_samples_per_second': 2.064, 'train_steps_per_second': 1.032, 'total_flos': 359059182059520.0, 'train_loss': 1.4213302612304688, 'epoch': 3.0})

# **Model Evaluation**

In [33]:
results = trainer.evaluate()
print(f"\n✅ Accuracy sul test set: {results['eval_accuracy']:.4f}")


✅ Accuracy sul test set: 0.2500


# **Model Testing on Irony Dataset**

In [34]:
irony_df = df_predictions(irony_df, tokenized_irony, trainer, label_mapping)

In [35]:
irony_df

Unnamed: 0,text,predicted_sentiment
0,"Oh, che meraviglia, ancora una volta il treno ...",Negative
1,La mia giornata è stata così fantastica che st...,Negative
2,Sono così grato al mio capo per avermi assegna...,Negative
3,"Wow, sono così felice che il mio vicino abbia ...",Negative
4,Adoro quando il mio computer decide di bloccar...,Negative
...,...,...
66,Finalmente ho trovato il tempo per riposarmi e...,Negative
67,La mia squadra del cuore ha appena pareggiato ...,Negative
68,"Oh, fantastico, ho dimenticato il mio portafog...",Negative
69,Sono così fortunato ad avere una sveglia così ...,Negative


In [36]:
negative_count = irony_df[irony_df['predicted_sentiment'] == 'Negative'].shape[0]
total_count = irony_df.shape[0]
percentage_negative = (negative_count / total_count) * 100

print(f"Percentage of negative sentiments detected in irony_df after training: {percentage_negative:.2f}%")

Percentage of negative sentiments detected in irony_df after training: 100.00%


# **Sentiment Index Creation**

## **Tweets Production Set Reading**

In [48]:
all_dataframes = []
directory_path = "./Sentiment_Dataset/2020_index_tweets_set/"
month_directories = [os.path.join(directory_path, d) for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))]
month_directories.sort(key=month_sort_key) # Sort the directories chronologically

for month_dir in month_directories:
    print("Month File: ", month_dir)
    csv_files = [f for f in os.listdir(month_dir) if f.endswith('.csv')]
    # Sort csv files by date in filename
    csv_files.sort(key=lambda x: x.split('.')[0])
    for file_name in csv_files:
        print("Day File: ", file_name)
        file_path = os.path.join(month_dir, file_name)
        try:
            df = pd.read_csv(file_path, encoding='utf-8', nrows = n_tweets_per_day, sep = ';')
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='latin1', nrows = n_tweets_per_day, sep = ';')
        all_dataframes.append((file_name.split('.')[0], df["testo"]))

Month File:  ./Sentiment_Dataset/2020_index_tweets_set/jan_tweets_social_mood
Day File:  2020-01-01.csv
Day File:  2020-01-02.csv
Day File:  2020-01-03.csv
Day File:  2020-01-04.csv
Day File:  2020-01-05.csv
Day File:  2020-01-06.csv
Day File:  2020-01-07.csv
Day File:  2020-01-08.csv
Day File:  2020-01-09.csv
Day File:  2020-01-10.csv
Day File:  2020-01-11.csv
Day File:  2020-01-12.csv
Day File:  2020-01-13.csv
Day File:  2020-01-14.csv
Day File:  2020-01-15.csv
Day File:  2020-01-16.csv
Day File:  2020-01-17.csv
Day File:  2020-01-18.csv
Day File:  2020-01-19.csv
Day File:  2020-01-20.csv
Day File:  2020-01-21.csv
Day File:  2020-01-22.csv
Day File:  2020-01-23.csv
Day File:  2020-01-24.csv
Day File:  2020-01-25.csv
Day File:  2020-01-26.csv
Day File:  2020-01-27.csv
Day File:  2020-01-28.csv
Day File:  2020-01-29.csv
Day File:  2020-01-30.csv
Day File:  2020-01-31.csv
Month File:  ./Sentiment_Dataset/2020_index_tweets_set/february_tweets_social_mood
Day File:  2020-02-01.csv
Day Fil

In [49]:
all_dataframes[0]

('2020-01-01',
 0    per la cronaca canova fu uno dei primi monumen...
 1    l opera d arte del giorno antonio canova amore...
 2    rt borghi claudio iniziare l anno con uno che ...
 3    tutta salute approda in seconda serata per ini...
 4    un anno di lavoro se permetti lo auguro io a t...
 5    senatore salvini sicuro che con quota restitui...
 6    aleguerani davidevignes luzziandrea certamente...
 7    sto andando al pranzo di famiglia in mezzo ai ...
 8    la raggi chiude l anno con bilancio in positiv...
 9    ullallà il simpatico senatore castiello che ha...
 Name: testo, dtype: object)

In [50]:
daily_sentiment_counts = []

i = 0
for date, series in all_dataframes:
    print(f"Computing day {date}")
    df = series.to_frame(name='text')
    dataset = Dataset.from_pandas(df)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    predictions = trainer.predict(tokenized_dataset)
    predicted_labels = predictions.predictions.argmax(-1)
    label_mapping = np.load("labels_mapping.npy", allow_pickle=True).item()
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    predicted_sentiment = [reverse_label_mapping[label] for label in predicted_labels]
    positive_count = predicted_sentiment.count('Positive')
    negative_count = predicted_sentiment.count('Negative')
    daily_sentiment_counts.append({'date': date, 'positive': positive_count, 'negative': negative_count, "sentiment_index": (positive_count-negative_count)/(positive_count+negative_count)})
    i += 1
    if i>=sentiment_index_limit:
      break

sentiment_counts_df = pd.DataFrame(daily_sentiment_counts)
display(sentiment_counts_df)

Computing day 2020-01-01


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Computing day 2020-01-02


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Computing day 2020-01-03


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Computing day 2020-01-04


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Computing day 2020-01-05


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Unnamed: 0,date,positive,negative,sentiment_index
0,2020-01-01,6,2,0.5
1,2020-01-02,7,2,0.555556
2,2020-01-03,10,0,1.0
3,2020-01-04,9,1,0.8
4,2020-01-05,7,1,0.75


In [None]:
import matplotlib.pyplot as plt

# Convert 'date' column to datetime objects for proper plotting
sentiment_counts_df['date'] = pd.to_datetime(sentiment_counts_df['date'])

plt.figure(figsize=(12, 6))
plt.plot(sentiment_counts_df['date'], sentiment_counts_df['sentiment_index'])
plt.xlabel('Date')
plt.ylabel('Sentiment Index')
plt.title('Daily Sentiment Index')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()