**Temat:** Analiza sentymentu w tekstach internetowych w oparciu o sieci typu Transformer

**Wprowadzenie:** Analiza sentymentu to technika przetwarzania języka naturalnego (NLP), która identyfikuje ton emocjonalny w tekście, klasyfikując go na pozytywny, negatywny lub neutralny. Wykorzystuje się ją do badania opinii klientów, monitorowania reputacji marki czy analizy treści mediów społecznościowych.

**Cel projektu:** Celem projektu jest opracowanie i implementacja modelu analizy sentymentu, który pozwoli na klasyfikację opinii użytkowników na podstawie tekstów pochodzących z Internetu. Należy przeanalizować dane tekstowe, przygotować odpowiedni model oraz zaprezentować wyniki analizy.

In [35]:
!pip3 install datasets transformers torch 'numpy<2' accelerate peft --quiet

### Ładowanie danych

In [3]:
from datasets import load_dataset

ds = load_dataset("clapAI/MultiLingualSentiment")

In [4]:
# what languages are available
languages = ds['train'].unique('language')
print("Available languages:", languages)
ds_types = ['train', 'validation', 'test']
# Create dictionary to store datasets for each language
datasets_by_language = {}

# # Split train, validation and test for each language
for lang in languages:
    datasets_by_language[lang] = {}
    for ds_type in ds_types:
        datasets_by_language[lang][ds_type] = ds[ds_type].filter(
            lambda batch: [x == lang for x in batch['language']],
            batched = True,
            num_proc=4
            )
        
        # Reduce dataset by 100 times
        rows_counter = datasets_by_language[lang][ds_type].num_rows
        new_num_rows = round(rows_counter*0.01)
        datasets_by_language[lang][ds_type] = datasets_by_language[lang][ds_type].shuffle(seed=42)
        datasets_by_language[lang][ds_type] = datasets_by_language[lang][ds_type].select(range(new_num_rows))
    

Available languages: ['en', 'es', 'ja', 'ar', 'tr', 'fr', 'vi', 'zh', 'de', 'ru', 'ko', 'id', 'multilingual', 'pt', 'ms', 'hi', 'it']


### Tworzenie testowego datasetu

In [5]:
from datasets import concatenate_datasets

test_languages = ['en', 'es', 'zh']
test_ds_list = [datasets_by_language[lang]['test'] for lang in test_languages]
test_dataset = concatenate_datasets(test_ds_list)

## Zero-shot Prompting

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


model_name = 'Qwen/Qwen3-0.6B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# For MacBooks with CPU Intel you have to set device_map as cpu and torch_dtype as torch.float32 
# otherwise it doesn't compile
zero_shot_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,
    device_map={"": "cpu"})
zero_shot_model.eval()

  from .autonotebook import tqdm as notebook_tqdm


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layernorm): Qwe

In [6]:
# Prompt template
def build_prompt(text):
    return f"Just define in one word the sentiment of this text as positive, negative or neutral:\n\"{text}\"\nAnswer(positive/negative/neutral):\n"

def predict_sentiment(text):
    prompt = build_prompt(text)
    zero_shot_model_inputs = tokenizer(prompt, return_tensors="pt").to(zero_shot_model.device)
    generated_ids = zero_shot_model.generate(**zero_shot_model_inputs, max_new_tokens=3)
    answer = tokenizer.batch_decode(generated_ids)[0][len(prompt):]
    return answer

In [7]:
print(test_dataset[0]['text'])
print(predict_sentiment(test_dataset[0]['text']))
print(f'\nReal answer:\n{test_dataset[0]['label']}')

Biden turns to Wall Street for campaign funds
Answer:
positive

Real answer:
neutral


### Test Zero-shot prompting

In [8]:

# Mapping dataset and get estimates from LLM
def process(result, row):
    text = row['text']
    answer = predict_sentiment(text)
    if row['label'] in answer:
        result[row['language']] += 1

# Calculate total and for each language accuracy
def calc_accuracy(result, ds):
    accuracy = {}
    for lang in result:
        lang_total = ds.filter(
            lambda batch: [x == lang for x in batch['language']],
            batched = True,
            num_proc = 4
        ).num_rows
        accuracy[lang] = result[lang]/lang_total
    total_accuracy = sum(result.values()) / ds.num_rows
    accuracy['total'] = total_accuracy
    return accuracy


In [8]:
zero_shot_results = {lang:0 for lang in test_languages}
test_dataset.map(lambda row: process(zero_shot_results, row))
zero_shot_accuracy = calc_accuracy(zero_shot_results, test_dataset)

Map: 100%|██████████| 2186/2186 [1:30:10<00:00,  2.48s/ examples]     
Filter (num_proc=4): 100%|██████████| 2186/2186 [00:00<00:00, 5776.67 examples/s]
Filter (num_proc=4): 100%|██████████| 2186/2186 [00:00<00:00, 6192.13 examples/s]
Filter (num_proc=4): 100%|██████████| 2186/2186 [00:00<00:00, 6291.48 examples/s]


Dokładność dla zero-shot prompting

In [None]:
print(f"{'Język':<10} {'Dokładność':<10}")
print('-' * 22)
for lang, acc in zero_shot_accuracy.items():
    print(f"{lang:<10} {acc:<10.2f}")


Język      Dokładność
----------------------
en         0.47      
es         0.56      
zh         0.45      
total      0.48      


### LoRA

In [20]:
train_languages = ['en', 'es', 'zh']
train_ds_list = [datasets_by_language[lang]['train'] for lang in train_languages]
train_dataset = concatenate_datasets(train_ds_list)
eval_ds_list = [datasets_by_language[lang]['validation'] for lang in train_languages]
eval_dataset = concatenate_datasets(eval_ds_list)

In [19]:
def prepare_prompt(row):
    text = row['text']
    label = row['label']
    return f"<|user|>What is the sentiment of the following sentence?\n\n\"{text}\"\n<|assistant|>{label}"

In [40]:
def lora_process(row):
    prompt = prepare_prompt(row)
    tokenized = tokenizer(
        prompt,
        truncation = True,
        padding = 'max_length',
        max_length = 128
    )
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

tokenized_train_ds = train_dataset.map(lora_process)
tokenized_train_ds = tokenized_train_ds.remove_columns(set(tokenized_train_ds.column_names) - {"input_ids", "attention_mask", "labels"})
tokenized_train_num_rows = round(tokenized_train_ds.num_rows * 0.1)
tokenized_train_ds = tokenized_train_ds.select(range(tokenized_train_num_rows))
tokenized_eval_ds = eval_dataset.map(lora_process)
tokenized_eval_ds = tokenized_eval_ds.remove_columns(set(tokenized_eval_ds.column_names) - {"input_ids", "attention_mask", "labels"})
tokenized_eval_num_rows = round(tokenized_eval_ds.num_rows * 0.1)
tokenized_eval_ds = tokenized_eval_ds.select(range(tokenized_eval_num_rows))

Map: 100%|██████████| 2187/2187 [00:01<00:00, 1245.08 examples/s]


In [41]:
print(tokenized_train_ds[0])
print(tokenized_train_ds.num_rows)

{'input_ids': [27, 91, 872, 91, 29, 3838, 374, 279, 25975, 315, 279, 2701, 11652, 1939, 29133, 127368, 128848, 296, 31580, 71526, 472, 2819, 72, 8339, 15248, 12376, 3777, 15643, 40890, 131477, 10040, 27, 91, 77091, 91, 29, 59568, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1

In [17]:
# Checking target_modules in qwen
proj_dict = {
    "q_proj": False,
    "v_proj": False
}
q_proj = 'q_proj'
v_proj = 'v_proj'
for name, module in zero_shot_model.named_modules():
    if q_proj in name.lower() and not proj_dict[q_proj]:
        proj_dict[q_proj] = True
    if v_proj in name.lower() and not proj_dict[v_proj]:
        proj_dict[v_proj] = True

for key, value in proj_dict.items():
    print(f'{key}: {value}')

q_proj: True
v_proj: True


In [18]:
from peft import get_peft_model, LoraConfig, TaskType

proj_list = list(proj_dict.keys())

lora_config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    target_modules = proj_list,
    lora_dropout = 0.05,
    bias = 'none',
    task_type = TaskType.CAUSAL_LM
)

lora_model = get_peft_model(zero_shot_model, lora_config)

In [42]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./qwen-sentiment-lora",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    weight_decay=0.01,
    save_strategy="epoch",
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=False,
    fp16=False,
    bf16=False,
    report_to="none",
    remove_unused_columns=False,
    push_to_hub=False,
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm = False
)

trainer = Trainer(
    model = lora_model,
    args = training_args,
    train_dataset = tokenized_train_ds,
    eval_dataset = tokenized_eval_ds,
    tokenizer = tokenizer,
    data_collator = data_collator
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,2.8338,2.919358
2,2.8077,2.898644
3,2.7707,2.896948


TrainOutput(global_step=657, training_loss=2.8345217741061925, metrics={'train_runtime': 10149.9392, 'train_samples_per_second': 0.517, 'train_steps_per_second': 0.065, 'total_flos': 1780589002752000.0, 'train_loss': 2.8345217741061925, 'epoch': 3.0})

In [43]:
# Prompt template
def build_prompt(text):
    return f"What is the sentiment of the following sentence?\n\n\"{text}\"\n"

def predict_sentiment(text):
    prompt = build_prompt(text)
    lora_model_inputs = tokenizer(prompt, return_tensors="pt").to(lora_model.device)
    generated_ids = lora_model.generate(**lora_model_inputs, max_new_tokens=10)
    answer = tokenizer.batch_decode(generated_ids)[0][len(prompt):]
    return answer

In [44]:
# Mapping dataset and get estimates from LLM
def process(result, row):
    text = row['text']
    answer = predict_sentiment(text)
    if row['label'] in answer:
        result[row['language']] += 1

# Calculate total and for each language accuracy
def calc_accuracy(result, ds):
    accuracy = {}
    for lang in result:
        lang_total = ds.filter(
            lambda batch: [x == lang for x in batch['language']],
            batched = True,
            num_proc = 4
        ).num_rows
        accuracy[lang] = result[lang]/lang_total
    total_accuracy = sum(result.values()) / ds.num_rows
    accuracy['total'] = total_accuracy
    return accuracy

In [47]:
lora_model.to('cpu')
lora_results = {lang:0 for lang in test_languages}
test_dataset.map(lambda row: process(lora_results, row))
lora_accuracy = calc_accuracy(lora_results, test_dataset)

Map: 100%|██████████| 2186/2186 [1:19:15<00:00,  2.18s/ examples]


In [49]:
print(f"{'Język':<10} {'Dokładność':<10}")
print('-' * 22)
for lang, acc in lora_accuracy.items():
    print(f"{lang:<10} {acc:<10.2f}")


Język      Dokładność
----------------------
en         0.20      
es         0.12      
zh         0.20      
total      0.19      
