# Sentiment analysis with Llama 2

In [1]:
# Google colab, once
# !pip install imblearn
# !wget "https://raw.githubusercontent.com/marciobda/PortugueseEmotionRecognitionWeakSupervision/refs/heads/main/test.csv"

In [2]:
#general purpose
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ast
import logging
import time
from typing import Dict, Optional, List

#data processing
import re, string

from sklearn.model_selection import train_test_split

#transformers
from transformers import LlamaTokenizer, LlamaForCausalLM

# PyTorch
import torch

#metrics
from sklearn.metrics import classification_report, confusion_matrix

# Seed for reproducibility. The seed is applied to NumPy and PyTorch so that
# sampled generations can be reproduced after a kernel restart.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

MODEL_NAME = 'llama2_no_mask'

# Set style for plots.
# The previous sns.despine() call was dropped: with no figure open it only
# created (and displayed) an empty figure.
sns.set_style("whitegrid")
plt.style.use("seaborn-v0_8")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)


2024-11-16 00:11:26.789875: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731726686.801724  104409 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731726686.804993  104409 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-16 00:11:26.820795: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<Figure size 640x480 with 0 Axes>

In [3]:
# Emotion names; the position of a name in this list is its category id.
emotions = [
    'Admiração', 'Diversão', 'Raiva', 'Aborrecimento',
    'Aprovação', 'Confusão', 'Curiosidade', 'Desejo',
    'Decepção', 'Nojo', 'Vergonha', 'Entusiasmo',
    'Medo', 'Gratidão', 'Luto', 'Alegria',
    'Amor', 'Nervosismo', 'Otimismo', 'Orgulho',
    'Alívio', 'Remorso', 'Tristeza', 'Surpresa',
    'Saudade', 'Inveja', 'Compaixão', 'Desaprovação',
]

In [4]:
def conf_matrix(y, y_pred, title):
    """Plot a labelled confusion-matrix heatmap for the emotion classes.

    Args:
        y: True class indices. Each value is expected to be a position in
            the module-level ``emotions`` list.
        y_pred: Predicted class indices, using the same encoding as ``y``.
        title: Text shown above the heatmap.
    """
    labels = emotions
    # Pin the matrix to one row/column per emotion. Without ``labels``,
    # scikit-learn only uses the classes that occur in y or y_pred, so the
    # tick labels set below would not line up with the cells whenever a
    # class is missing from both.
    matrix = confusion_matrix(y, y_pred, labels=list(range(len(labels))))

    fig, ax = plt.subplots(figsize=(40, 40))
    sns.heatmap(
        matrix,
        annot=True,
        cmap="Blues",
        fmt='g',
        cbar=False,
        annot_kws={"size": 20},
        ax=ax,
    )
    ax.set_title(title, fontsize=30)
    ax.xaxis.set_ticklabels(labels, fontsize=20)
    ax.yaxis.set_ticklabels(labels, fontsize=20)
    ax.set_ylabel('Test', fontsize=22)
    ax.set_xlabel('Predicted', fontsize=22)
    plt.show()


## Loading the data

In [5]:
# Tab-separated file. quoting=3 is csv.QUOTE_NONE, so quote characters inside
# tweets are read literally. Rows with any missing value are dropped.
df_test = pd.read_csv('./../test.csv', sep='\t', quoting=3, engine='python').dropna()
df_test.head()

Unnamed: 0,tweet_id,tweet,categoria
0,1407769371955634180,nossa sério eu daria tudo p saber das fofocas ...,6
1,1407860353598427138,Sem palavras p agradecer tudo o que tem aconte...,13
2,1407855425782005771,tava respondendo tudo isso na minha cabeça e m...,12
3,1407740654428561414,eu achei que era possível terminar sem pegar n...,5
4,1407865324989521921,Sim mas n o amor romântico,16


### Remove duplicated tweets

In [6]:
# Print the frame summary before and after dropping rows that repeat a tweet
# text (the first occurrence of each text is kept).
df_test.info()
df_test = df_test.drop_duplicates(subset='tweet')
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2657 entries, 0 to 2681
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   2657 non-null   int64 
 1   tweet      2657 non-null   object
 2   categoria  2657 non-null   object
dtypes: int64(1), object(2)
memory usage: 83.0+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 2648 entries, 0 to 2681
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   2648 non-null   int64 
 1   tweet      2648 non-null   object
 2   categoria  2648 non-null   object
dtypes: int64(1), object(2)
memory usage: 82.8+ KB


## Tweets analysis

In [7]:
def strip_all_entities(text):
    """Lower-case a tweet and strip mentions, links, symbols and punctuation.

    Steps, in order: drop carriage returns and turn newlines into spaces,
    lower-case, remove @mentions and http(s) URLs, remove non-ASCII
    characters that are not letters (emoji, stray control bytes), and
    finally delete every character in ``string.punctuation``.

    Accented letters are kept. The previous filter removed every non-ASCII
    character, which turned Portuguese words such as "sério" into "srio".

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text. Whitespace left behind by removed tokens is not
        collapsed here.
    """
    text = text.replace('\r', '').replace('\n', ' ').lower()
    text = re.sub(r'(?:\@|https?\://)\S+', '', text)
    # Keep ASCII untouched and keep non-ASCII letters (é, ç, ã, ...); drop
    # every other non-ASCII character.
    text = ''.join(ch for ch in text if ch.isascii() or ch.isalpha())

    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

def filter_chars(text):
    """Blank out every space-separated token that contains '$' or '&'.

    Offending tokens are replaced by an empty string rather than removed, so
    the spaces around them are still present in the result.
    """
    kept = [
        '' if ('$' in token or '&' in token) else token
        for token in text.split(' ')
    ]
    return ' '.join(kept)

def remove_mult_spaces(text):
    """Collapse every run of two or more whitespace characters into one space."""
    whitespace_run = re.compile(r'\s\s+')
    return whitespace_run.sub(' ', text)

In [8]:
# Run the three cleaning steps (innermost call first) over every tweet.
new_texts_test = [
    remove_mult_spaces(filter_chars(strip_all_entities(text)))
    for text in df_test.tweet
]

# NOTE(review): `clean_tweet` is filled with the raw tweets, not with
# `new_texts_test`, so the cleaning above does not reach the dataframe.
# Confirm whether this is intentional.
df_test['clean_tweet'] = df_test.tweet

In [9]:
# Preview the first rows of the clean_tweet column.
df_test['clean_tweet'].head()

0    nossa sério eu daria tudo p saber das fofocas ...
1    Sem palavras p agradecer tudo o que tem aconte...
2    tava respondendo tudo isso na minha cabeça e m...
3    eu achei que era possível terminar sem pegar n...
4                           Sim mas n o amor romântico
Name: clean_tweet, dtype: object

## Emotion category analysis

In [10]:
# Count rows per value of the categoria column.
df_test['categoria'].value_counts()

categoria
5        320
7        222
1        204
2        193
0        134
12       126
13       117
22       109
16       108
26        88
23        81
18        76
17        73
20        73
6         69
9         67
15        66
11        61
19        60
24        60
21        58
8         57
3         50
10        45
4         44
27        43
25        36
14         7
20,21      1
Name: count, dtype: int64

In [11]:
# `categoria` is read as text: single labels look like "5", multi-label rows
# like "20,21". literal_eval turns them into an int or a tuple of ints, and
# explode then gives every label of a multi-label tweet its own row.
# Only strings are parsed, so re-running this cell on already-parsed values is
# a no-op instead of raising inside ast.literal_eval.
df_test['categoria'] = df_test['categoria'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
df_test = df_test.explode('categoria')

In [12]:
# Count rows per value of the categoria column again.
df_test['categoria'].value_counts()

categoria
5     320
7     222
1     204
2     193
0     134
12    126
13    117
22    109
16    108
26     88
23     81
18     76
20     74
17     73
6      69
9      67
15     66
11     61
19     60
24     60
21     59
8      57
3      50
10     45
4      44
27     43
25     36
14      7
Name: count, dtype: int64

### Train - Validation - Test split

### Llama modeling

In [19]:
class LlamaTorchInferece:
    """Load a Llama causal language model and generate text from prompts.

    Attributes set in ``__init__``:
        device: Device string the tokenized inputs are moved to.
        logger: Logger used for the load-time messages.
        tokenizer: ``LlamaTokenizer`` loaded from ``model_name``.
        model: ``LlamaForCausalLM`` loaded from ``model_name``.
    """

    def __init__(
        self,
        model_name: str,
        device: str = "cuda" if torch.cuda.is_available() else "cpu",
        load_in_8bit: bool = False,
        torch_dtype: torch.dtype = torch.float16
    ):
        """Load the tokenizer and the model.

        Args:
            model_name: Checkpoint name or path passed to ``from_pretrained``.
            device: "cuda" or "cpu". Defaults to "cuda" when available.
            load_in_8bit: Request 8-bit loading; only honoured on "cuda".
            torch_dtype: dtype passed to ``from_pretrained``.
        """
        self.device = device
        self.logger = self.__setup_logger()

        self.logger.info(f"Loading model {model_name} on {device}")
        self.logger.info(f"Using dtype: {torch_dtype}")

        self.tokenizer = LlamaTokenizer.from_pretrained(model_name)

        model_kwargs = {
            # On GPU the weights are placed via device_map; on CPU the model is
            # moved explicitly after loading (see below).
            "device_map": "auto" if device == "cuda" else None,
            "torch_dtype": torch_dtype,
        }

        if load_in_8bit and device == "cuda":
            self.logger.info("Loading model in 8bit precision") 
            model_kwargs["load_in_8bit"] = True

        # local_files_only=True: the model files must already be available
        # locally; nothing is downloaded here.
        self.model = LlamaForCausalLM.from_pretrained(
            model_name,
            **model_kwargs,
            local_files_only = True
        )

        if device == "cpu":
            self.model = self.model.to(device)

        self.logger.info("Model loaded successfully")

    def __setup_logger(self) -> logging.Logger:
        """Configure root logging at INFO level and return this module's logger."""
        logging.basicConfig(level=logging.INFO)
        return logging.getLogger(__name__)
    
    @torch.inference_mode()
    def generate_response(
        self,
        prompt: str,
        max_new_tokens: int = 256,
        temperature: float = 0.7,
        top_p: float = 0.95,
        top_k: int = 50,
        num_return_sequences: int = 1,
        do_sample: bool = True,
        stop_sequences: Optional[list[str]] = None,
    ) -> Dict:
        """Generate a continuation for ``prompt``.

        Args:
            prompt: Text fed to the model.
            max_new_tokens: Maximum number of generated tokens.
            temperature: Sampling temperature (used only when ``do_sample``).
            top_p: Nucleus-sampling threshold (used only when ``do_sample``).
            top_k: Top-k sampling cutoff (used only when ``do_sample``).
            num_return_sequences: Number of sequences requested from the
                model. Only the first one is decoded and returned.
            do_sample: Sample when True; otherwise the sampling parameters
                are not sent to ``generate``.
            stop_sequences: Optional strings; the response is cut right
                before the earliest occurrence of any of them.

        Returns:
            A dict with:
                "response": the generated text, without the prompt.
                "generation_time": elapsed wall time as "<seconds> seconds".
                "memory_usage": None unless ``device`` is "cuda", else a dict
                    with "allocated" and "cached" CUDA memory in MiB.
        """
        start_time = time.time()

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        input_length = inputs.input_ids.shape[1]

        # The tokenizer may have no pad token; fall back to EOS so that
        # generate() receives a concrete id instead of None.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        generation_kwargs = {
            "max_new_tokens": max_new_tokens,
            # Previously accepted but never forwarded, so callers could not
            # switch sampling off.
            "do_sample": do_sample,
            "num_return_sequences": num_return_sequences,
            "pad_token_id": pad_token_id,
            "eos_token_id": self.tokenizer.eos_token_id,
        }
        if do_sample:
            # Sampling knobs only make sense (and are only sent) when sampling.
            generation_kwargs.update(
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
            )

        outputs = self.model.generate(**inputs, **generation_kwargs)

        # Skip the prompt tokens so only the newly generated part is decoded.
        generated_text = self.tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

        if stop_sequences:
            for stop_sequence in stop_sequences:
                if stop_sequence in generated_text:
                    generated_text = generated_text[:generated_text.index(stop_sequence)]
        
        generation_time = time.time() - start_time

        memory_usage = None
        if self.device == "cuda":
            # Bytes -> MiB.
            memory_usage = {
                "allocated": torch.cuda.memory_allocated() / 1024**2,
                "cached": torch.cuda.memory_reserved() / 1024**2,
            }
        
        return {
            "response": generated_text,
            "generation_time": f"{generation_time:.2f} seconds",
            "memory_usage": memory_usage,
        }
    
    def get_model_info(self) -> Dict:
        """Return selected fields of the model config plus the device string."""
        return {
        "model_type": self.model.config.model_type,
        "vocab_size": self.model.config.vocab_size,
        "hidden_size": self.model.config.hidden_size,
        "num_attention_heads": self.model.config.num_attention_heads,
        "num_hidden_layers": self.model.config.num_hidden_layers,
        "device": self.device,
    }

In [22]:
# NOTE: this rebinds MODEL_NAME (set to 'llama2_no_mask' in the setup cell) to
# the checkpoint id that is loaded below.
MODEL_NAME = 'meta-llama/Llama-2-7b-chat-hf'

# Half-precision weights, on the GPU when one is available.
llama = LlamaTorchInferece(
    model_name=MODEL_NAME,
    device="cuda" if torch.cuda.is_available() else "cpu",
    load_in_8bit=False,
    torch_dtype=torch.float16,
)

print("Model information:", llama.get_model_info(), sep="\n")

INFO:__main__:Loading model meta-llama/Llama-2-7b-chat-hf on cuda
INFO:__main__:Using dtype: torch.float16
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:Model loaded successfully


Model information:
{'model_type': 'llama', 'vocab_size': 32000, 'hidden_size': 4096, 'num_attention_heads': 32, 'num_hidden_layers': 32, 'device': 'cuda'}


In [None]:
# Sanity check: a plain English question in a Human/Assistant prompt layout.
prompt = "Human: What is the capital of France?\nAssistant: "

result = llama.generate_response(
    prompt=prompt,
    max_new_tokens=256,
    temperature=0.7,
    # Cut the answer if the model starts writing another turn.
    stop_sequences=["Human:", "Assistant:"],
)

print("\nGenerated response:", result["response"])
print("Generation Time:", result['generation_time'])
# memory_usage is None when the model does not run on CUDA.
if result['memory_usage']:
    print(f"GPU Memory Usage: {result['memory_usage']['allocated']:.2f} MB allocated")

INFO:__main__:Loading model meta-llama/Llama-2-7b-chat-hf on cuda
INFO:__main__:Using dtype: torch.float16
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

INFO:__main__:Model loaded successfully
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Model information:
{'model_type': 'llama', 'vocab_size': 32000, 'hidden_size': 4096, 'num_attention_heads': 32, 'num_hidden_layers': 32, 'device': 'cuda'}

Generated response: 🇫🇷 The capital of France is Paris! 😊 Would you like to know more about Paris or France in general?
Generation Time: 24.44 seconds
GPU Memory Usage: 9122.71 MB allocated


In [23]:
# Check whether the model answers a Portuguese question.
# NOTE(review): this prompt has no "Human:" turn marker, although "Human:" is
# one of the stop sequences below. Confirm the intended prompt layout.
prompt = "Você fala português?\nAssistant: "

result = llama.generate_response(
    prompt=prompt,
    max_new_tokens=256,
    temperature=0.7,
    # Cut the answer if the model starts writing another turn.
    stop_sequences=["Human:", "Assistant:"],
)

print("\nGenerated response:", result["response"])
print("Generation Time:", result['generation_time'])
# memory_usage is None when the model does not run on CUDA.
if result['memory_usage']:
    print(f"GPU Memory Usage: {result['memory_usage']['allocated']:.2f} MB allocated")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated response: 😊 I'm just an AI, I don't have a physical location or a specific language, but I can help you with any questions or tasks you may have! How can I assist you today? 😊
Generation Time: 122.39 seconds
GPU Memory Usage: 258.14 MB allocated


In [26]:
# Zero-shot emotion classification: list the emotion set, quote one sentence and
# ask for the single most prominent emotion.
# NOTE(review): "emocões" in the prompt looks like a typo for "emoções", and the
# prompt has no "Human:" turn marker. Left unchanged here; confirm before
# running the evaluation.
prompt = """Considere o conjunto de emoções: Admiração, Diversão, Raiva, Aborrecimento, Aprovação, Confusão, Curiosidade, Desejo, Decepção, Nojo, Vergonha, Entusiasmo, Medo, Gratidão, Luto, Alegria, Amor, Nervosismo, Otimismo, Orgulho, Alívio, Remorso, Tristeza, Surpresa, Saudade, Inveja, Compaixão, Desaprovação. Na frase 'Finalmente dei uma faxina bonita na casa! A sensação de alívio é maravilhosa' qual das emocões você consegue identificar? Responda apenas a emoção mais proeminente.
Assistant: """

result = llama.generate_response(
    prompt=prompt,
    max_new_tokens=256,
    temperature=0.7,
    # Cut the answer if the model starts writing another turn.
    stop_sequences=["Human:", "Assistant:"],
)

print("\nGenerated response:", result["response"])
print("Generation Time:", result['generation_time'])
# memory_usage is None when the model does not run on CUDA.
if result['memory_usage']:
    print(f"GPU Memory Usage: {result['memory_usage']['allocated']:.2f} MB allocated")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated response: Of course, I can help you with that! In the given sentence, the most proeminent emotion is Alívio (Relief).
Generation Time: 69.06 seconds
GPU Memory Usage: 258.15 MB allocated


In [None]:
# conf_matrix(y_test.argmax(1), y_pred_bert.argmax(1),f'{MODEL_NAME} Sentiment Analysis\nConfusion Matrix')

In [None]:
# print(f'\tClassification Report for {MODEL_NAME}:\n\n',classification_report(y_test, y_pred_bert, target_names=emotions))

### Testing the model

In [None]:
# text = "Eu estou empolgado com o trabalho"
# predicted_category = predict_category(text, model, tokenizer, ohe)

# print(f"Predicted categoy: {predicted_category}")