**Temat:** Analiza sentymentu w tekstach internetowych w oparciu o sieci typu Transformer

**Wprowadzenie:** Analiza sentymentu to technika przetwarzania języka naturalnego (NLP), która identyfikuje ton emocjonalny tekstu, klasyfikując go jako pozytywny, negatywny lub neutralny. Wykorzystuje się ją do badania opinii klientów, monitorowania reputacji marki czy analizy treści mediów społecznościowych.

**Cel projektu:** Celem projektu jest opracowanie i implementacja modelu analizy sentymentu, który pozwoli na klasyfikację opinii użytkowników na podstawie tekstów pochodzących z Internetu. Należy przeanalizować dane tekstowe, przygotować odpowiedni model oraz zaprezentować wyniki analizy.

In [5]:
# Install the third-party packages for this notebook; --quiet trims pip's output.
%pip install datasets transformers torch langdetect scikit-learn pandas --quiet

### Tokenizacja

In [None]:
from datasets import concatenate_datasets

# Language codes whose per-language splits are merged into the combined datasets.
languages_to_process = ['en', 'es', 'zh']
# Maps each string sentiment label to the integer class id stored in 'label_id'.
labels_id = {'negative': 0, 'neutral': 1, 'positive': 2}

# Convert labels to IDs
def convert_labels_to_ids(batch):
    """Attach a 'label_id' column with the integer id of every label in a batch.

    Args:
        batch: Mapping of column name to list of values (the shape a batched
            ``Dataset.map`` passes in); its 'label' entry holds string labels.

    Returns:
        The same ``batch`` object, now carrying the extra 'label_id' list.

    Raises:
        KeyError: If a label is not a key of ``labels_id``.
    """
    source_labels = batch['label']
    batch['label_id'] = [labels_id[name] for name in source_labels]
    return batch

# Each split follows the same pipeline: gather the split from every selected
# language, concatenate, add integer label ids, then shuffle with a fixed seed
# so the ordering is reproducible.

# Training split
train_ds_list = [datasets[lang]['train'] for lang in languages_to_process]
train_ds = (
    concatenate_datasets(train_ds_list)
    .map(convert_labels_to_ids, batched=True, num_proc=4)
    .shuffle(seed=42)
)

# Validation split
eval_ds_list = [datasets[lang]['validation'] for lang in languages_to_process]
eval_ds = (
    concatenate_datasets(eval_ds_list)
    .map(convert_labels_to_ids, batched=True, num_proc=4)
    .shuffle(seed=42)
)

# Test split
test_ds_list = [datasets[lang]['test'] for lang in languages_to_process]
test_ds = (
    concatenate_datasets(test_ds_list)
    .map(convert_labels_to_ids, batched=True, num_proc=4)
    .shuffle(seed=42)
)

In [None]:
# Sanity check: show the first row of the training set.
print(train_ds[0])

In [6]:
import pandas as pd
from datasets import Dataset

# Load the training CSV with pandas
df = pd.read_csv("train_ds.csv")

# Convert to a HuggingFace Dataset
train_ds = Dataset.from_pandas(df)

# Load the validation CSV with pandas (rebinds `df`)
df = pd.read_csv("eval_ds.csv")

# Convert to a HuggingFace Dataset
eval_ds = Dataset.from_pandas(df)


In [7]:
from transformers import BertTokenizer, BertForSequenceClassification

# The tokenizer and the classifier are both loaded from the same pretrained
# multilingual BERT checkpoint; the classifier is configured for 3 labels.
checkpoint_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=3)

def tokenize_and_encode(examples):
    """Tokenize the 'text' field of a batch into fixed-length model inputs.

    Every sequence is truncated or padded to exactly 128 tokens.

    Args:
        examples: Batch mapping whose 'text' entry holds the raw text(s).

    Returns:
        The encoding produced by the module-level ``tokenizer``.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,  # chosen sequence length for this notebook
    }
    return tokenizer(examples['text'], **encode_options)



# Options shared by the training and validation tokenization passes.
tokenize_map_options = dict(
    batched=True,
    batch_size=1000,  # rows handed to tokenize_and_encode per call
    num_proc=4,       # number of worker processes
    remove_columns=['text', 'language', 'label'],  # drop the raw columns
)

# After tokenization the integer 'label_id' column is renamed to 'label'.
tokenized_train_ds = train_ds.map(
    tokenize_and_encode, **tokenize_map_options
).rename_column("label_id", "label")

tokenized_eval_ds = eval_ds.map(
    tokenize_and_encode, **tokenize_map_options
).rename_column("label_id", "label")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map (num_proc=4):   0%|          | 0/15560 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map (num_proc=4):   0%|          | 0/1959 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
# Sanity check: show the first tokenized training row.
print(tokenized_train_ds[0])

In [4]:
import os
# Disable Weights & Biases logging via an environment variable.
# NOTE(review): the training output flags this variable as deprecated (removal
# planned for v5) and recommends the `report_to` setting instead.
os.environ["WANDB_DISABLED"] = "true"


In [8]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` along axis 1.

    Returns:
        Dict with the keys 'accuracy' and 'f1'.
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=1)
    return {
        'accuracy': accuracy_score(references, predicted),
        'f1': f1_score(references, predicted, average='weighted'),
    }

# Training configuration: 3 epochs, evaluation after every epoch, and no
# intermediate checkpoints.
training_args = TrainingArguments(
    output_dir="./multilingual_bert_sentiment",
    overwrite_output_dir=True,
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="no",
    # Explicitly turn off external logging integrations (e.g. W&B) rather than
    # relying on the deprecated WANDB_DISABLED environment variable.
    report_to="none",
)

# Fine-tune the classifier, save the final weights, then report metrics on the
# validation set.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_eval_ds,
    # `processing_class` is the current name of the deprecated `tokenizer`
    # argument of Trainer.
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()
trainer.save_model("./multilingual_bert_sentiment")
trainer.evaluate()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6419,0.780892,0.663604,0.659149
2,0.4729,0.81668,0.685554,0.67501
3,0.407,0.850559,0.694742,0.692645


{'eval_loss': 0.8505592942237854,
 'eval_accuracy': 0.6947422154160285,
 'eval_f1': 0.6926448209179255,
 'eval_runtime': 14.881,
 'eval_samples_per_second': 131.644,
 'eval_steps_per_second': 16.464,
 'epoch': 3.0}

In [14]:
# Load the test CSV with pandas
df = pd.read_csv("test_ds.csv")

# Convert to a HuggingFace Dataset
test_ds = Dataset.from_pandas(df)

In [15]:
# Tokenize the test set in batches, drop the raw columns, and rename the
# integer 'label_id' column to 'label'.
tokenized_test_ds = test_ds.map(
    tokenize_and_encode,
    batched=True,
    batch_size=1000,  # rows handed to tokenize_and_encode per call
    num_proc=4,       # number of worker processes
    remove_columns=['text', 'language', 'label'],
).rename_column("label_id", "label")

Map (num_proc=4):   0%|          | 0/1942 [00:00<?, ? examples/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [20]:
import torch

def process(example):
    """Predict the sentiment class id for a single raw test example.

    Args:
        example: One dataset row; only its 'text' field is read.

    Returns:
        The same row with an added 'pred_id' entry holding the argmax class id.
    """
    # Tokenize with the same settings as the training data (fixed length 128)
    tokens = tokenizer(
        example['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )

    # Move the inputs to the device the model lives on
    device = model.device
    tokens = {k: v.to(device) for k, v in tokens.items()}

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(**tokens)
    example['pred_id'] = torch.argmax(outputs.logits, dim=1).item()

    return example  # .map stores the returned row, including 'pred_id'


# Put the model on the GPU when one is available, otherwise on the CPU
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()  # inference mode: disables dropout

# Predictions are stored as a dataset column instead of being counted through a
# global mutated inside the .map callback, so the result does not depend on how
# (or how often) .map invokes the function.
predicted_ds = test_ds.map(process)

# Accuracy = share of rows whose predicted id equals the reference 'label_id'
num_examples = len(predicted_ds)
num_correct = sum(
    pred == gold
    for pred, gold in zip(predicted_ds['pred_id'], predicted_ds['label_id'])
)
acc = num_correct / num_examples if num_examples else 0.0  # guard empty set
print(f"Accuracy: {acc:.4f}")


Map:   0%|          | 0/1942 [00:00<?, ? examples/s]

Accuracy: 0.6905


In [23]:
# Write the fine-tuned model and its tokenizer to the same export directory.
export_dir = "./sentiment-bert-multilingual"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('./sentiment-bert-multilingual/tokenizer_config.json',
 './sentiment-bert-multilingual/special_tokens_map.json',
 './sentiment-bert-multilingual/vocab.txt',
 './sentiment-bert-multilingual/added_tokens.json')

In [24]:
# Pack the exported model directory into a zip archive.
# NOTE(review): the absolute /content path presumably assumes a Colab working
# directory; the archive entries are stored under content/... as a result.
!zip -r sentiment-bert-multilingual.zip /content/sentiment-bert-multilingual

  adding: content/sentiment-bert-multilingual/ (stored 0%)
  adding: content/sentiment-bert-multilingual/special_tokens_map.json (deflated 42%)
  adding: content/sentiment-bert-multilingual/tokenizer_config.json (deflated 75%)
  adding: content/sentiment-bert-multilingual/model.safetensors (deflated 7%)
  adding: content/sentiment-bert-multilingual/config.json (deflated 55%)
  adding: content/sentiment-bert-multilingual/vocab.txt (deflated 45%)


In [25]:
from google.colab import files
# Hand the zip archive to the google.colab download helper.
files.download("sentiment-bert-multilingual.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>