In [1]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
from model.encoder_model import EncoderModel
from model.lightning_wrapper import LightningWrapper
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import EarlyStopping
from random import seed
from datasets import concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("clarin-pl/polemo2-official", trust_remote_code=True)
print("train shape", dataset["train"].shape)
print("val shape", dataset["validation"].shape)
print("test shape", dataset["test"].shape)

train shape (6573, 2)
val shape (823, 2)
test shape (820, 2)


In [3]:
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

# Step 1: Filter out examples where target == 3
dataset = dataset.filter(lambda example: example["target"] != 3)
dataset = dataset.filter(lambda example: example["target"] != 0)

def map_and_filter_targets(example):
    example["target"] = {
        1: 0.0,
        2: 1.0
    }.get(example["target"], example["target"])
    return example

# Step 2: Map the remaining target values
dataset = dataset.map(map_and_filter_targets)

In [4]:
num_class_0 = len(dataset.filter(lambda example: example["target"] == 0)["train"])
num_class_1 = len(dataset.filter(lambda example: example["target"] == 1)["train"])
print("0:", num_class_0)
print("1:", num_class_1)

0: 2469
1: 1824


In [5]:
print("train shape", dataset["train"].shape)
print("val shape", dataset["validation"].shape)
print("test shape", dataset["test"].shape)

train shape (4293, 2)
val shape (540, 2)
test shape (566, 2)


In [6]:
print(dataset["train"]["target"])

[0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 

In [7]:
seq_len = 32

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        max_length=seq_len,
        truncation=True,
    )

tokenized_dataset = dataset.map(tokenize_function, batched=False)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "target"])

In [8]:
tokenized_dataset.column_names

{'train': ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
 'validation': ['text',
  'target',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'test': ['text', 'target', 'input_ids', 'token_type_ids', 'attention_mask']}

In [9]:
train_dataset = tokenized_dataset["train"]

# Step 2: Count the examples for each class in the train split
num_class_0 = len(train_dataset.filter(lambda example: example["target"] == 0))
num_class_1 = len(train_dataset.filter(lambda example: example["target"] == 1))

# Step 3: Balance the classes
minority_count = min(num_class_0, num_class_1)  # Adjust to the minority count

class_0 = train_dataset.filter(lambda example: example["target"] == 0).shuffle(seed=42)
class_1 = train_dataset.filter(lambda example: example["target"] == 1)

class_0_balanced = class_0.select(range(minority_count))
class_1_balanced = class_1.select(range(minority_count))

# Combine both classes using concatenate_datasets
balanced_train = concatenate_datasets([class_0_balanced, class_1_balanced])

# Step 4: Shuffle the balanced train dataset
balanced_train = balanced_train.shuffle(seed=42)

# Step 5: Replace the train split with the balanced one
tokenized_dataset["train"] = balanced_train

In [10]:
train_dataset = tokenized_dataset["train"]

# Count the number of examples for each class
num_class_0 = len(train_dataset.filter(lambda example: example["target"] == 0))
num_class_1 = len(train_dataset.filter(lambda example: example["target"] == 1))

print("Class 0:", num_class_0)
print("Class 1:", num_class_1)

Class 0: 1824
Class 1: 1824


In [11]:
batch_size = 16

train_loader = DataLoader(tokenized_dataset["train"], batch_size=batch_size, shuffle=True)
val_loader = DataLoader(tokenized_dataset["validation"], batch_size=batch_size)
test_loader = DataLoader(tokenized_dataset["test"], batch_size=batch_size)

In [12]:
sentiment_model = EncoderModel(
    vocab_size=tokenizer.vocab_size,
    seq_dim=seq_len,
    embed_dim=64,
    num_heads=4,
    num_transformer_blocks=3,
    dropout_p = .01
)

lightning_model = LightningWrapper(sentiment_model, lr=1e-3)
logger = TensorBoardLogger("lightning_logs", name="sentiment_model")

early_stopping = EarlyStopping(
    monitor="val_f1",
    mode="max",
    patience=3,
    verbose=True
)

trainer = Trainer(
    max_epochs=20,
    enable_progress_bar=True,
    logger=logger,
    callbacks=[early_stopping],
)

trainer.fit(lightning_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs






  | Name           | Type              | Params | Mode 
-------------------------------------------------------------
0 | model          | EncoderModel      | 3.4 M  | train
1 | criterion      | BCEWithLogitsLoss | 0      | train
2 | train_accuracy | BinaryAccuracy    | 0      | train
3 | val_accuracy   | BinaryAccuracy    | 0      | train
4 | test_accuracy  | BinaryAccuracy    | 0      | train
5 | train_f1       | BinaryF1Score     | 0      | train
6 | val_f1         | BinaryF1Score     | 0      | train
7 | test_f1        | BinaryF1Score     | 0      | train
-------------------------------------------------------------
3.4 M     Trainable params
0         Non-trainable params
3.4 M     Total params
13.409    Total estimated model params size (MB)
46        Modules in train mode
0         Modules in eval mode


                                                                           

C:\Users\maciek\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.
C:\Users\maciek\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\LocalCache\local-packages\Python310\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: 100%|██████████| 228/228 [00:14<00:00, 15.50it/s, v_num=0, train_loss_step=0.712, val_loss=0.696, val_acc=0.563, val_f1=0.000, train_loss_epoch=0.694, train_acc=0.500, train_f1=0.00146]

Metric val_f1 improved. New best score: 0.000


Epoch 1: 100%|██████████| 228/228 [00:14<00:00, 15.47it/s, v_num=0, train_loss_step=0.641, val_loss=0.695, val_acc=0.578, val_f1=0.225, train_loss_epoch=0.684, train_acc=0.533, train_f1=0.158]  

Metric val_f1 improved by 0.225 >= min_delta = 0.0. New best score: 0.225


Epoch 2: 100%|██████████| 228/228 [00:14<00:00, 15.67it/s, v_num=0, train_loss_step=0.600, val_loss=0.668, val_acc=0.644, val_f1=0.370, train_loss_epoch=0.647, train_acc=0.655, train_f1=0.532]

Metric val_f1 improved by 0.145 >= min_delta = 0.0. New best score: 0.370


Epoch 3: 100%|██████████| 228/228 [00:14<00:00, 15.27it/s, v_num=0, train_loss_step=0.582, val_loss=0.693, val_acc=0.650, val_f1=0.547, train_loss_epoch=0.620, train_acc=0.726, train_f1=0.651]

Metric val_f1 improved by 0.178 >= min_delta = 0.0. New best score: 0.547


Epoch 4: 100%|██████████| 228/228 [00:14<00:00, 15.50it/s, v_num=0, train_loss_step=0.596, val_loss=0.690, val_acc=0.659, val_f1=0.584, train_loss_epoch=0.598, train_acc=0.783, train_f1=0.742]

Metric val_f1 improved by 0.037 >= min_delta = 0.0. New best score: 0.584


Epoch 5: 100%|██████████| 228/228 [00:14<00:00, 15.67it/s, v_num=0, train_loss_step=0.542, val_loss=0.666, val_acc=0.700, val_f1=0.595, train_loss_epoch=0.579, train_acc=0.823, train_f1=0.790]

Metric val_f1 improved by 0.011 >= min_delta = 0.0. New best score: 0.595


Epoch 6: 100%|██████████| 228/228 [00:14<00:00, 15.74it/s, v_num=0, train_loss_step=0.556, val_loss=0.679, val_acc=0.680, val_f1=0.604, train_loss_epoch=0.561, train_acc=0.866, train_f1=0.843]

Metric val_f1 improved by 0.009 >= min_delta = 0.0. New best score: 0.604


Epoch 8: 100%|██████████| 228/228 [00:14<00:00, 15.75it/s, v_num=0, train_loss_step=0.537, val_loss=0.680, val_acc=0.696, val_f1=0.647, train_loss_epoch=0.553, train_acc=0.890, train_f1=0.878]

Metric val_f1 improved by 0.043 >= min_delta = 0.0. New best score: 0.647


Epoch 11: 100%|██████████| 228/228 [00:14<00:00, 15.67it/s, v_num=0, train_loss_step=0.551, val_loss=0.666, val_acc=0.702, val_f1=0.600, train_loss_epoch=0.536, train_acc=0.924, train_f1=0.914]

Monitored metric val_f1 did not improve in the last 3 records. Best score: 0.647. Signaling Trainer to stop.


Epoch 11: 100%|██████████| 228/228 [00:14<00:00, 15.52it/s, v_num=0, train_loss_step=0.551, val_loss=0.666, val_acc=0.702, val_f1=0.600, train_loss_epoch=0.536, train_acc=0.924, train_f1=0.914]


In [16]:
trainer.test(lightning_model, test_loader)

Testing DataLoader 0: 100%|██████████| 36/36 [00:00<00:00, 69.63it/s]


[{'test_loss': 0.6778662204742432,
  'test_acc': 0.7049469947814941,
  'test_f1': 0.5779510736465454}]

In [37]:
tokens = tokenizer(
    "Hotel był brzydki, ale trochę fajny",
    padding="max_length",
    max_length=seq_len,
    truncation=True,
    return_tensors="pt",
)
sentiment_model.eval()
sentiment_model(tokens)
        

tensor([3.3550e-14], grad_fn=<SqueezeBackward1>)

In [15]:
for name, module in sentiment_model.named_modules():
    print(name, module.training)

 False
token_embedding_layer False
position_embedding_layer False
norm_embed_layer False
transformer_blocks False
transformer_blocks.0 False
transformer_blocks.0.multihead_attn_layer False
transformer_blocks.0.multihead_attn_layer.out_proj False
transformer_blocks.0.norm_attn_layer False
transformer_blocks.0.feed_forward_layer False
transformer_blocks.0.feed_forward_layer.0 False
transformer_blocks.0.feed_forward_layer.1 False
transformer_blocks.0.feed_forward_layer.2 False
transformer_blocks.0.feed_forward_layer.3 False
transformer_blocks.0.norm_feed_layer False
transformer_blocks.1 False
transformer_blocks.1.multihead_attn_layer False
transformer_blocks.1.multihead_attn_layer.out_proj False
transformer_blocks.1.norm_attn_layer False
transformer_blocks.1.feed_forward_layer False
transformer_blocks.1.feed_forward_layer.0 False
transformer_blocks.1.feed_forward_layer.1 False
transformer_blocks.1.feed_forward_layer.2 False
transformer_blocks.1.feed_forward_layer.3 False
transformer_block