# **Sentiment Index Generation: Llama**

# **All Installations**

In [1]:
%pip install transformers datasets peft accelerate evaluate torch --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install plotly

Note: you may need to restart the kernel to use updated packages.


In [4]:
import os
import platform

def data_download(file_to_download, gdrive_code, OS, uncompress = True):
  if not os.path.exists(file_to_download):
    os.system('gdown --id "'+gdrive_code+'" --output '+file_to_download)
    if OS == "Linux" and uncompress:
        os.system('unzip -o -n "./'+file_to_download+'" -d "./"')
    return True
  else:
    return None

OS = platform.system()

out = data_download("./Sentiment_Dataset.zip", "1OEQpkzN6HMcZc3yPg8G0ahvh6lOCMlIR", OS)

Downloading...
From (original): https://drive.google.com/uc?id=1OEQpkzN6HMcZc3yPg8G0ahvh6lOCMlIR
From (redirected): https://drive.google.com/uc?id=1OEQpkzN6HMcZc3yPg8G0ahvh6lOCMlIR&confirm=t&uuid=3c2baebc-bee3-4c42-869b-d32035cd7cfc
To: /mnt/batch/tasks/shared/LS_root/mounts/clusters/frpuglie2-a100single/code/Users/frpuglie/talks_material/Sentiment_Dataset.zip
100%|██████████| 1.21G/1.21G [00:09<00:00, 128MB/s] 


Archive:  ././Sentiment_Dataset.zip
   creating: ./Sentiment_Dataset/
 extracting: ./Sentiment_Dataset/.env  
   creating: ./Sentiment_Dataset/2020_index_tweets_set/
   creating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-01.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-02.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-03.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-04.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-05.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-06.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social_mood/2020-04-07.csv  
  inflating: ./Sentiment_Dataset/2020_index_tweets_set/april_tweets_social

# **All Imports**

In [5]:
#General
import timeit
import torch
import random
import calendar
import numpy as np
import pandas as pd
from dotenv import load_dotenv
import plotly.express as px
import shutil

# LLMs
import evaluate
from datasets import Dataset
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model

# Machine Learning
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# **All Globals**

In [43]:
seed = 42
dataset_limit = 3000
sentiment_index_limit = 100
n_tweets_per_day = 1000
n_epochs = 10
moving_average_window = 7
llm_model_parameters_number = '1B'                # number of billion parameters of the LLM mode, 1B is the smaller llama3.2 1 billion of parameters
#model_name = f"meta-llama/Llama-3.2-{llm_model_parameters_number}-Instruct"
model_name = f"meta-llama/Llama-3.2-{llm_model_parameters_number}"
dataset_path = "./Sentiment_Dataset/"
chart_from_local = False

# **Initializations**

In [7]:
os.environ["WANDB_MODE"] = "disabled"

start_global_time = timeit.default_timer()

!mv "{dataset_path}.env" ./
random.seed(seed)
np.random.seed(seed)

load_dotenv()

True

# **Functions Definition**

In [8]:
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

def df_predictions(df, tokenized_df, trainer, label_mapping):
    predictions = trainer.predict(tokenized_df)
    predicted_labels = predictions.predictions.argmax(-1)
    label_mapping = np.load("labels_mapping.npy", allow_pickle=True).item()
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    predicted_sentiment = [reverse_label_mapping[label] for label in predicted_labels]
    df['predicted_sentiment'] = predicted_sentiment
    return df

def month_sort_key(month_dir):
    month_name = os.path.basename(month_dir).split('_')[0]
    try:
        return list(calendar.month_name).index(month_name.capitalize())
    except ValueError:
        return 0

# **Dataset Reading**

In [9]:
irony_df = pd.read_csv("./Sentiment_Dataset/irony_dataset.txt", header=None).rename(columns={0: 'text'})["text"].to_frame()

In [10]:
df = pd.read_csv("./Sentiment_Dataset/training_set.csv", sep = ";")
df = df[["text", "pol"]].dropna()         # select the columns interesting for us and drop rows with NaN

In [11]:
if dataset_limit is not None:
    df = df.sample(dataset_limit).reset_index(drop=True)

In [12]:
print("Numnber of data-points:", len(df))

Numnber of data-points: 3000


In [13]:
df.head()

Unnamed: 0,text,pol
0,Arbasina/Giavazzi sul Corriere a proposito del...,0.0
1,il trading è un amp azienda ecco come risparmi...,1.0
2,"RT TAG mi hanno unfollowato in massa, mortacci...",2.0
3,modalità prezzi e biglietti per la gara al man...,0.0
4,donnastef shop t shirt gonna borsa spedizione ...,0.0


1: 'Positive', 2: 'Negative', 0: 'Neutral'

In [14]:
print("Number of classes", df["pol"].nunique())
print("Types of classes", df["pol"].unique())

Number of classes 3
Types of classes [0. 1. 2.]


# **Dataset Recoding**

In [15]:
df['pol'] = df['pol'].map({1: 'Positive', 2: 'Negative', 0: 'Neutral'})
df = df.rename(columns={'pol': 'label'})

In [16]:
df.head()

Unnamed: 0,text,label
0,Arbasina/Giavazzi sul Corriere a proposito del...,Neutral
1,il trading è un amp azienda ecco come risparmi...,Positive
2,"RT TAG mi hanno unfollowato in massa, mortacci...",Negative
3,modalità prezzi e biglietti per la gara al man...,Neutral
4,donnastef shop t shirt gonna borsa spedizione ...,Neutral


# **Label Conversion**

In [17]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
np.save("labels_mapping.npy", label_mapping)
print("Label mapping:", label_mapping)

Label mapping: {'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}


# **Train-Test Split**

In [18]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# **Dataset Conversion for Transformers**

In [19]:
irony_dataset = Dataset.from_pandas(irony_df)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
unique_labels = train_dataset.unique('label')
num_labels = len(unique_labels)
print("Number of Labels withing the Training Set:", len(unique_labels))

Number of Labels withing the Training Set: 3


In [20]:
train_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 2400
})

In [21]:
test_dataset

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 600
})

# **Tokenization and Padding**

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [23]:
tokenized_irony = irony_dataset.map(tokenize_function, batched=True)
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/71 [00:00<?, ? examples/s]

Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

## **Load the LLM Model**

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    torch_dtype=torch.bfloat16,
    device_map="auto"                           # auto: for 1 GPU, and balanced_low_0 for multi-gput
)

`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Definition of QLora**

In [25]:
# === 7. Applica LoRA (fine-tuning efficiente) ===
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(model, peft_config)

 # **Metrics Definition**

In [26]:
accuracy = evaluate.load("accuracy")

In [27]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(-1)
    return accuracy.compute(predictions=preds, references=labels)

# **Training Parameters Definition**

In [28]:
training_args = TrainingArguments(
    output_dir="./llama3_finetuned",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=n_epochs,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    report_to=[]  # Disactivate the W&B
)

# **Training Model Definition**

In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(
The model is already on multiple devices. Skipping the move to device specified in `args`.


In [30]:
#irony_df = df_predictions(irony_df, tokenized_irony, trainer, label_mapping)

In [31]:
#irony_df

In [32]:
'''
negative_count = irony_df[irony_df['predicted_sentiment'] == 'Negative'].shape[0]
total_count = irony_df.shape[0]
percentage_negative = (negative_count / total_count) * 100

print(f"Percentage of negative sentiments detected in irony_df after training: {percentage_negative:.2f}%")
'''

'\nnegative_count = irony_df[irony_df[\'predicted_sentiment\'] == \'Negative\'].shape[0]\ntotal_count = irony_df.shape[0]\npercentage_negative = (negative_count / total_count) * 100\n\nprint(f"Percentage of negative sentiments detected in irony_df after training: {percentage_negative:.2f}%")\n'

# **Model Training**

In [33]:
start_time = timeit.default_timer()
trainer.train()
print("Elapsed Training Time: ", timeit.default_timer() - start_time)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 128001}.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9567,0.968241,0.63
2,0.9692,1.268231,0.695
3,0.7636,1.133078,0.715
4,0.6193,1.488602,0.723333
5,1.0271,1.486402,0.718333
6,0.8849,1.629036,0.713333
7,0.581,1.860713,0.701667
8,0.4688,1.922889,0.705
9,0.2089,1.93579,0.698333
10,0.2684,1.955863,0.703333


Elapsed Training Time:  806.4149403800002


# **Model Evaluation**

In [34]:
results = trainer.evaluate()
print(f"\n✅ Accuracy sul test set: {results['eval_accuracy']:.4f}")


✅ Accuracy sul test set: 0.6300


# **Model Testing on Irony Dataset**

In [35]:
irony_df = df_predictions(irony_df, tokenized_irony, trainer, label_mapping)

In [36]:
irony_df

Unnamed: 0,text,predicted_sentiment
0,"Oh, che meraviglia, ancora una volta il treno ...",Positive
1,La mia giornata è stata così fantastica che st...,Positive
2,Sono così grato al mio capo per avermi assegna...,Positive
3,"Wow, sono così felice che il mio vicino abbia ...",Positive
4,Adoro quando il mio computer decide di bloccar...,Negative
...,...,...
66,Finalmente ho trovato il tempo per riposarmi e...,Positive
67,La mia squadra del cuore ha appena pareggiato ...,Positive
68,"Oh, fantastico, ho dimenticato il mio portafog...",Positive
69,Sono così fortunato ad avere una sveglia così ...,Positive


In [37]:
negative_count = irony_df[irony_df['predicted_sentiment'] == 'Negative'].shape[0]
total_count = irony_df.shape[0]
percentage_negative = (negative_count / total_count) * 100

print(f"Percentage of negative sentiments detected in irony_df after training: {percentage_negative:.2f}%")

Percentage of negative sentiments detected in irony_df after training: 18.31%


# **Sentiment Index Creation**

## **Tweets Production Set Reading**

In [38]:
start_time = timeit.default_timer()
all_dataframes = []
directory_path = "./Sentiment_Dataset/2020_index_tweets_set/"
month_directories = [os.path.join(directory_path, d) for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))]
month_directories.sort(key=month_sort_key) # Sort the directories chronologically

for month_dir in month_directories:
    print("Month File: ", month_dir)
    csv_files = [f for f in os.listdir(month_dir) if f.endswith('.csv')]
    # Sort csv files by date in filename
    csv_files.sort(key=lambda x: x.split('.')[0])
    for file_name in csv_files:
        print("Day File: ", file_name)
        file_path = os.path.join(month_dir, file_name)
        try:
            df = pd.read_csv(file_path, encoding='utf-8', nrows = n_tweets_per_day, sep = ';')
        except UnicodeDecodeError:
            df = pd.read_csv(file_path, encoding='latin1', nrows = n_tweets_per_day, sep = ';')
        all_dataframes.append((file_name.split('.')[0], df["testo"]))
print("Elapsed Tweets Reading time: ", timeit.default_timer() - start_time)

Month File:  ./Sentiment_Dataset/2020_index_tweets_set/jan_tweets_social_mood
Day File:  2020-01-01.csv
Day File:  2020-01-02.csv
Day File:  2020-01-03.csv
Day File:  2020-01-04.csv
Day File:  2020-01-05.csv
Day File:  2020-01-06.csv
Day File:  2020-01-07.csv
Day File:  2020-01-08.csv
Day File:  2020-01-09.csv
Day File:  2020-01-10.csv
Day File:  2020-01-11.csv
Day File:  2020-01-12.csv
Day File:  2020-01-13.csv
Day File:  2020-01-14.csv
Day File:  2020-01-15.csv
Day File:  2020-01-16.csv
Day File:  2020-01-17.csv
Day File:  2020-01-18.csv
Day File:  2020-01-19.csv
Day File:  2020-01-20.csv
Day File:  2020-01-21.csv
Day File:  2020-01-22.csv
Day File:  2020-01-23.csv
Day File:  2020-01-24.csv
Day File:  2020-01-25.csv
Day File:  2020-01-26.csv
Day File:  2020-01-27.csv
Day File:  2020-01-28.csv
Day File:  2020-01-29.csv
Day File:  2020-01-30.csv
Day File:  2020-01-31.csv
Month File:  ./Sentiment_Dataset/2020_index_tweets_set/february_tweets_social_mood
Day File:  2020-02-01.csv
Day Fil

In [39]:
all_dataframes[0]

('2020-01-01',
 0      per la cronaca canova fu uno dei primi monumen...
 1      l opera d arte del giorno antonio canova amore...
 2      rt borghi claudio iniziare l anno con uno che ...
 3      tutta salute approda in seconda serata per ini...
 4      un anno di lavoro se permetti lo auguro io a t...
                              ...                        
 995    ruttosporc i coniglietti che cancellano il twe...
 996    chi vota cazzola vota bonaccini in emilia roma...
 997    cominciamo il con una bella adozione questo ca...
 998    ma si deve ancora assistere a queste vergognos...
 999    il papa niente maiale al pranzo con i poveri c...
 Name: testo, Length: 1000, dtype: object)

## **Sentiment Index Predictions**

In [45]:
start_time = timeit.default_timer()
daily_sentiment_counts = []

i = 0
for date, series in all_dataframes:
    print(f"Computing day {date}")
    df = series.to_frame(name='text')
    dataset = Dataset.from_pandas(df)
    tokenized_dataset = dataset.map(tokenize_function, batched=True)
    predictions = trainer.predict(tokenized_dataset)
    predicted_labels = predictions.predictions.argmax(-1)
    label_mapping = np.load("labels_mapping.npy", allow_pickle=True).item()
    reverse_label_mapping = {v: k for k, v in label_mapping.items()}
    predicted_sentiment = [reverse_label_mapping[label] for label in predicted_labels]
    positive_count = predicted_sentiment.count('Positive')
    negative_count = predicted_sentiment.count('Negative')
    neutral_count = predicted_sentiment.count('Neutral')
    daily_sentiment_counts.append({'date': date, 'positive': positive_count, 'negative': negative_count, "sentiment_index": (positive_count-negative_count)/(positive_count+neutral_count+negative_count)})
    i += 1
    if i>=sentiment_index_limit and sentiment_index_limit is not None:
      break

sentiment_counts_df = pd.DataFrame(daily_sentiment_counts)
#display(sentiment_counts_df)
print("Elapsed Sentimet Index Predictions time: ", timeit.default_timer() - start_time)

Computing day 2020-01-01


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Computing day 2020-01-02


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Computing day 2020-01-03


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Computing day 2020-01-04


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
if chart_from_local is False:
  sentiment_counts_df.to_excel("./sentiment_index.xlsx")

## **Plot of the Setiment Index**

In [None]:
if chart_from_local is True:
  sentiment_counts_df = pd.read_excel("./sentiment_index.xlsx")

In [None]:
sentiment_counts_df['rolling_avg'] = sentiment_counts_df['sentiment_index'].rolling(window=moving_average_window).mean()

fig = px.line(sentiment_counts_df, x='date', y='sentiment_index', title='Daily Sentiment Index')
fig.add_scatter(x=sentiment_counts_df['date'], y=sentiment_counts_df['rolling_avg'], mode='lines', name='7-Day Rolling Average', line=dict(color='red'))
fig.update_layout(xaxis_title='Date', yaxis_title='Sentiment Index')
fig.update_xaxes(tickformat="%Y-%m-%d", tickangle=45) # Format date and rotate labels
fig.write_html("./sentiment_index_plot.html")
#fig.write_image("./sentiment_index_plot.jpeg")
fig.show()

In [None]:
if os.path.exists("./Sentiment_Dataset"):
    shutil.rmtree("Sentiment_Dataset")
    print("Removed Sentiment_Dataset directory.")

if os.path.exists("./llama3_finetuned"):
    shutil.rmtree("llama3_finetuned")
    print("Removed llama3_finetuned directory.")

if os.path.exists("./Sentiment_Dataset.zip"):
    os.remove("Sentiment_Dataset.zip")
    print("Removed Sentiment_Dataset.zip file.")



In [None]:
print("Global Program Execution time: ", timeit.default_timer() - start_global_time)