In [10]:
import pandas as pd

from tqdm.notebook import tqdm
import torch.nn.functional as F
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import os

In [11]:
DATASETS_FOLDER = "datasets balanceados"
OUTPUT_FOLDER = "datasets com score"

In [12]:
MODEL = f"cardiffnlp/xlm-roberta-base-tweet-sentiment-pt"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [13]:
def polarity_scores(text: str):
    try:
        encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512).to(device)
        with torch.no_grad():  # Disable gradient computation for efficiency
            output = model(**encoded)
        scores = output.logits[0].cpu().numpy()
        scores = F.softmax(torch.tensor(scores), dim=0).numpy()  # Apply softmax
        negative, neutral, positive = scores

        # Normalize negative and positive scores
        norm_factor = negative + positive
        normalized_scores = {
            'negative': negative / norm_factor,
            'positive': positive / norm_factor
        }
        return normalized_scores['positive']
    except OverflowError as e:
        print(f"OverflowError: {e} for text: {text}")

In [14]:
polarity_scores("comida boa")  # Test the function

  return forward_call(*args, **kwargs)


np.float32(0.97456425)

In [15]:
def load_datasets(datasets_folder):
    datasets = []
    datasets_full_path = []
    for root, dirs, files in os.walk(datasets_folder):
        for file in files:
            if file.endswith(".csv"):
                datasets.append(file)
                datasets_full_path.append(os.path.join(root, file))
    return datasets, datasets_full_path

In [16]:
datasets, datasets_full_path = load_datasets(DATASETS_FOLDER)
print(f"Found {len(datasets)} datasets:")
print(datasets)

Found 7 datasets:
['bourbon.csv', 'continental.csv', 'foz_plaza.csv', 'nadai.csv', 'taroba.csv', 'viale_cataratas.csv', 'viale_tower.csv']


In [17]:
tqdm.pandas()
for dataset, dataset_full_path in tqdm(zip(datasets, datasets_full_path), total=len(datasets), desc="Processing datasets"):
    print(f"Processing {dataset}")
    df = pd.read_csv(dataset_full_path)
    df['score'] = df['text'].progress_apply(polarity_scores)
    df.to_csv(os.path.join(OUTPUT_FOLDER, dataset), index=False)

Processing datasets:   0%|          | 0/7 [00:00<?, ?it/s]

Processing bourbon.csv


  0%|          | 0/2064 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Processing continental.csv


  0%|          | 0/510 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Processing foz_plaza.csv


  0%|          | 0/742 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Processing nadai.csv


  0%|          | 0/1214 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Processing taroba.csv


  0%|          | 0/7044 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Processing viale_cataratas.csv


  0%|          | 0/1312 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)


Processing viale_tower.csv


  0%|          | 0/1994 [00:00<?, ?it/s]

  return forward_call(*args, **kwargs)
