# Initilization

In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch
import time
import numpy as np

import multiprocessing

In [2]:
# from huggingface_hub import notebook_login
# notebook_login()
# #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

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_cpus = multiprocessing.cpu_count()
num_gpus = torch.cuda.device_count()
optimal_workers = min(num_cpus, num_gpus*4) if num_gpus else num_cpus - 1
print(f'device: {device} CPU count: {num_cpus} GPU count:{num_gpus}  Workers count: {optimal_workers}')

device: cuda CPU count: 4 GPU count:2  Workers count: 4


In [4]:
roberta_name = "xlm-roberta-base"
bert_name = "bert-base-multilingual-cased"
pars_bert  = "HooshvareLab/bert-base-parsbert-uncased"
model_name = pars_bert

# Data

## Data Loading

In [5]:
train_df = pd.read_csv('/kaggle/input/taghche5-1/train.csv')
val_df = pd.read_csv('/kaggle/input/taghche5-1/val.csv')
test_df = pd.read_csv('/kaggle/input/taghche5-1/test.csv')

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

## Data Tokenization

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

max_length = 511

def truncate_and_tokenize(examples):
    truncated_texts = []
    for text in examples['text']:
        if len(text) > max_length:
            truncated_text = text[-max_length:]  # Truncate from the beginning
        else:
            truncated_text = text
        truncated_texts.append(truncated_text)
    return tokenizer(truncated_texts, padding='max_length', truncation=True, max_length=max_length)



config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

In [7]:
train_dataset = train_dataset.map(truncate_and_tokenize, batched=True)
val_dataset = val_dataset.map(truncate_and_tokenize, batched=True)
test_dataset = test_dataset.map(truncate_and_tokenize, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/24177 [00:00<?, ? examples/s]

Map:   0%|          | 0/3022 [00:00<?, ? examples/s]

Map:   0%|          | 0/3023 [00:00<?, ? examples/s]

# Model

In [8]:
# def freeze(model, freeze_layer = [1,2,3,4,5,6]):
#     base_name = 'bert.encoder.layer.'
#     freeze_name = [base_name+str(layer) for layer in freeze_layer]
#     for name, param in model.named_parameters():
#         if name in freeze_name:
#             param.requires_grad = False

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# freeze(model)
model.to(device)

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at HooshvareLab/bert-base-parsbert-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(100000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

# Train

## Setup

In [10]:
batch_size = 16
epochs = 5
logging_steps = 100
save_steps_perc = 0.25
learning_rate = 5e-5

data loading

In [11]:
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

optimizer and scheduler

In [12]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)



## Training Loop

In [13]:
for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(range(len(train_dataloader)))
    total_loss = 0
    start_time = time.time()
    for step, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'], labels=batch['label'])
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        if step % logging_steps == 0:
            elapsed_time = time.time() - start_time
            print(f"Epoch {epoch + 1}, Step {step}, Loss: {total_loss / (step + 1)}, Time elapsed: {elapsed_time}s")

        # Save model at checkpoints
#         if step % int(save_steps_perc * len(train_dataloader)) == 0 and step > 0:
#             model.save_pretrained(f'checkpoint-epoch{epoch+1}-step{step}')

    # Save model after each epoch
    model.save_pretrained(f'model-epoch{epoch+1}')

    # Validation
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'], labels=batch['label'])
            loss = outputs.loss
            val_loss += loss.item()
            predictions = outputs.logits.argmax(dim=-1)
            correct_predictions += (predictions == batch['label']).sum().item()
            total_predictions += predictions.size(0)
    val_accuracy = correct_predictions / total_predictions
    print(f"Validation Loss: {val_loss / len(val_dataloader)}, Validation Accuracy: {val_accuracy}")

# Save final model
model.save_pretrained('final_model')

  0%|          | 0/1512 [00:00<?, ?it/s]

Epoch 1, Step 0, Loss: 1.1853638887405396, Time elapsed: 1.881455898284912s
Epoch 1, Step 100, Loss: 0.9744102724707953, Time elapsed: 149.50642323493958s
Epoch 1, Step 200, Loss: 0.9463496774583313, Time elapsed: 301.6115825176239s
Epoch 1, Step 300, Loss: 0.9176276824205025, Time elapsed: 453.6811854839325s
Epoch 1, Step 400, Loss: 0.9033391772957514, Time elapsed: 605.5723161697388s
Epoch 1, Step 500, Loss: 0.8887903521398822, Time elapsed: 757.5965549945831s
Epoch 1, Step 600, Loss: 0.8798926728537396, Time elapsed: 909.6063575744629s
Epoch 1, Step 700, Loss: 0.8737546200844089, Time elapsed: 1061.7793610095978s
Epoch 1, Step 800, Loss: 0.8650559948922394, Time elapsed: 1213.9115104675293s
Epoch 1, Step 900, Loss: 0.8611969533030122, Time elapsed: 1365.9815592765808s
Epoch 1, Step 1000, Loss: 0.8521491503501153, Time elapsed: 1518.013792514801s
Epoch 1, Step 1100, Loss: 0.8469088039920072, Time elapsed: 1670.0845091342926s
Epoch 1, Step 1200, Loss: 0.8402772699664971, Time elapsed:

  0%|          | 0/1512 [00:00<?, ?it/s]

Epoch 2, Step 0, Loss: 0.4166103005409241, Time elapsed: 1.509018898010254s
Epoch 2, Step 100, Loss: 0.5434559940701664, Time elapsed: 153.5653429031372s
Epoch 2, Step 200, Loss: 0.539132923999829, Time elapsed: 305.5818374156952s
Epoch 2, Step 300, Loss: 0.5448346668204596, Time elapsed: 457.7375502586365s
Epoch 2, Step 400, Loss: 0.550174992912428, Time elapsed: 609.7947974205017s
Epoch 2, Step 500, Loss: 0.5524498683041679, Time elapsed: 761.6352255344391s
Epoch 2, Step 600, Loss: 0.5521537426133918, Time elapsed: 913.6674420833588s
Epoch 2, Step 700, Loss: 0.5501981111570193, Time elapsed: 1065.5277814865112s
Epoch 2, Step 800, Loss: 0.5479251732596297, Time elapsed: 1217.4557683467865s
Epoch 2, Step 900, Loss: 0.5489338846684958, Time elapsed: 1369.4279174804688s
Epoch 2, Step 1000, Loss: 0.5478365097146529, Time elapsed: 1521.2990458011627s
Epoch 2, Step 1100, Loss: 0.545974379670988, Time elapsed: 1673.225212097168s
Epoch 2, Step 1200, Loss: 0.5473004518189994, Time elapsed: 182

  0%|          | 0/1512 [00:00<?, ?it/s]

Epoch 3, Step 0, Loss: 0.2047414779663086, Time elapsed: 1.5047500133514404s
Epoch 3, Step 100, Loss: 0.3135318465825945, Time elapsed: 153.23969435691833s
Epoch 3, Step 200, Loss: 0.3074536438734822, Time elapsed: 305.33949851989746s
Epoch 3, Step 300, Loss: 0.3120837196695349, Time elapsed: 457.4166853427887s
Epoch 3, Step 400, Loss: 0.3135397174532648, Time elapsed: 609.4349083900452s
Epoch 3, Step 500, Loss: 0.3051797678199654, Time elapsed: 761.4036045074463s
Epoch 3, Step 600, Loss: 0.30909332404604273, Time elapsed: 913.5021042823792s
Epoch 3, Step 700, Loss: 0.3032824690460351, Time elapsed: 1065.4364037513733s
Epoch 3, Step 800, Loss: 0.3054786461722873, Time elapsed: 1217.5156679153442s
Epoch 3, Step 900, Loss: 0.3034862200279909, Time elapsed: 1369.4119563102722s
Epoch 3, Step 1000, Loss: 0.3027011516851026, Time elapsed: 1521.6512577533722s
Epoch 3, Step 1100, Loss: 0.30454663935368653, Time elapsed: 1673.5545728206635s
Epoch 3, Step 1200, Loss: 0.30401511311835044, Time el

  0%|          | 0/1512 [00:00<?, ?it/s]

Epoch 4, Step 0, Loss: 0.30755457282066345, Time elapsed: 1.5118045806884766s
Epoch 4, Step 100, Loss: 0.16687722552088227, Time elapsed: 153.3324146270752s
Epoch 4, Step 200, Loss: 0.1740215540868207, Time elapsed: 305.2374963760376s
Epoch 4, Step 300, Loss: 0.1710775881956186, Time elapsed: 457.2559998035431s
Epoch 4, Step 400, Loss: 0.17755230610488804, Time elapsed: 609.1530914306641s
Epoch 4, Step 500, Loss: 0.1749596530822535, Time elapsed: 761.2998158931732s
Epoch 4, Step 600, Loss: 0.1748064287338474, Time elapsed: 913.392391204834s
Epoch 4, Step 700, Loss: 0.17500556658783373, Time elapsed: 1065.3070340156555s
Epoch 4, Step 800, Loss: 0.17627199269370752, Time elapsed: 1217.2051086425781s
Epoch 4, Step 900, Loss: 0.17594988491038868, Time elapsed: 1369.357264995575s
Epoch 4, Step 1000, Loss: 0.17266714386502421, Time elapsed: 1521.256141424179s
Epoch 4, Step 1100, Loss: 0.17433268432467364, Time elapsed: 1673.2305731773376s
Epoch 4, Step 1200, Loss: 0.17426571635285906, Time e

  0%|          | 0/1512 [00:00<?, ?it/s]

Epoch 5, Step 0, Loss: 0.1973457932472229, Time elapsed: 1.502058506011963s
Epoch 5, Step 100, Loss: 0.10306693712655123, Time elapsed: 153.39920735359192s
Epoch 5, Step 200, Loss: 0.11620940614505601, Time elapsed: 305.42307901382446s
Epoch 5, Step 300, Loss: 0.11484113315729876, Time elapsed: 457.5634870529175s
Epoch 5, Step 400, Loss: 0.11606254077172766, Time elapsed: 609.6333525180817s
Epoch 5, Step 500, Loss: 0.11506120879630634, Time elapsed: 761.7331352233887s
Epoch 5, Step 600, Loss: 0.11473677561214711, Time elapsed: 913.6837117671967s
Epoch 5, Step 700, Loss: 0.11288173845288252, Time elapsed: 1065.79323720932s
Epoch 5, Step 800, Loss: 0.1128658884117431, Time elapsed: 1217.8306653499603s
Epoch 5, Step 900, Loss: 0.11499260495105174, Time elapsed: 1369.8120226860046s
Epoch 5, Step 1000, Loss: 0.11375178289346184, Time elapsed: 1521.8579206466675s
Epoch 5, Step 1100, Loss: 0.11207143414607415, Time elapsed: 1673.7943880558014s
Epoch 5, Step 1200, Loss: 0.11373362678093843, Ti

# Testing

In [14]:
model.eval()
test_loss = 0
correct_predictions = 0
total_predictions = 0
prediction_list = []
ground_truth = []
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.cuda.amp.autocast():
            outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'], labels=batch['label'])
            loss = outputs.loss
            test_loss += loss.item()
            predictions = outputs.logits.argmax(dim=-1)
            prediction_list.append(predictions)
            ground_truth.append(batch['label'])
            correct_predictions += (predictions == batch['label']).sum().item()
            total_predictions += predictions.size(0)
test_accuracy = correct_predictions / total_predictions
print(f"Test Loss: {test_loss / len(test_dataloader)}, Test Accuracy: {test_accuracy}")

Test Loss: 0.8508582648146089, Test Accuracy: 0.800198478332782


In [15]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

# Custom compute metrics function
def compute_metrics(true_labels, pred_labels):
    metrics = {
        'f1': f1_score(true_labels, pred_labels, average='macro'),
        'accuracy': accuracy_score(true_labels, pred_labels),
        'precision': precision_score(true_labels, pred_labels, average='macro'),
        'recall': recall_score(true_labels, pred_labels, average='macro'),
        'confusion_matrix': confusion_matrix(true_labels, pred_labels)
    }

    return metrics
metrics = compute_metrics(np.concatenate([tensor.flatten().cpu() for tensor in ground_truth]),
                          np.concatenate([tensor.flatten().cpu() for tensor in prediction_list]))

# Print the computed metrics
print(f"F1 Score: {metrics['f1']}")
print(f"Accuracy: {metrics['accuracy']}")
print(f"Precision: {metrics['precision']}")
print(f"Recall: {metrics['recall']}")
print(f"Confusion Matrix:\n{metrics['confusion_matrix']}")

F1 Score: 0.7998780909248557
Accuracy: 0.800198478332782
Precision: 0.8010233476512046
Recall: 0.8001073787190052
Confusion Matrix:
[[831 128  90]
 [ 82 756 116]
 [ 57 131 832]]
