# Initialization

In [5]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import torch
import time
import numpy as np

import multiprocessing

In [None]:
# from huggingface_hub import notebook_login
# notebook_login()
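# # NOTE: an access token that was pasted here has been redacted. Revoke it, and load tokens from a Kaggle secret or an environment variable instead of committing them.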

In [None]:
# Inventory the available hardware, then derive the compute device and a worker budget.
num_gpus = torch.cuda.device_count()
num_cpus = multiprocessing.cpu_count()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

if num_gpus:
    # With GPUs present: four workers per GPU, capped at the CPU count.
    optimal_workers = min(num_cpus, num_gpus*4)
else:
    # CPU-only machine: keep one core free.
    optimal_workers = num_cpus - 1
print(f'device: {device} CPU count: {num_cpus} GPU count:{num_gpus}  Workers count: {optimal_workers}')

In [8]:
# Hugging Face Hub identifiers of the candidate pretrained checkpoints.
roberta_name = "xlm-roberta-base"
bert_name = "bert-base-multilingual-cased"
pars_name = "HooshvareLab/bert-base-parsbert-uncased"
# Checkpoint selected for this run; it is passed to the tokenizer and model loaders below.
model_name = bert_name

# Data

## Data Loading

In [9]:
# Read the three CSV splits (train, val, test — in that order) from the Kaggle input directory.
train_df, val_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/taghche/train.csv',
        '/kaggle/input/taghche/val.csv',
        '/kaggle/input/taghche/test.csv',
    )
]

# Wrap each DataFrame in a `datasets.Dataset` so it can be tokenized with `.map()` later.
train_dataset, val_dataset, test_dataset = [
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
]

## Data Tokenization

In [None]:
# Tokenizer belonging to the checkpoint selected in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with padding and truncation enabled.

    No explicit ``max_length`` is passed, so the tokenizer's own length limit
    governs both the padding target and the truncation point.

    Args:
        examples: mapping with a 'text' key holding the strings to encode.

    Returns:
        Whatever the tokenizer returns for the batch.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, padding='max_length')
    return encoded

max_length = 511

def truncate_and_tokenize(examples):
    """Keep the tail of every text, then tokenize the batch to a fixed length.

    Each string in ``examples['text']`` is cut down to its last ``max_length``
    *characters* (the beginning is dropped). The result is tokenized with
    padding and truncation to ``max_length`` *tokens*.

    Args:
        examples: mapping with a 'text' key holding a list of strings.

    Returns:
        The tokenizer output for the shortened texts.
    """
    # A negative-start slice returns the whole string when it is already short
    # enough, so no explicit length check is needed.
    tail_texts = [text[-max_length:] for text in examples['text']]
    return tokenizer(tail_texts, padding='max_length', truncation=True, max_length=max_length)

In [None]:
# Tokenize every split in batched mode (train, val, test — in that order).
train_dataset, val_dataset, test_dataset = [
    split.map(truncate_and_tokenize, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Have each split return torch tensors for the model inputs and the label only.
for tokenized_split in (train_dataset, val_dataset, test_dataset):
    tokenized_split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Model

In [12]:
def freeze(model, freeze_layer=(1, 2, 3, 4, 5, 6), base_name='bert.encoder.layer.'):
    """Disable gradient updates for the selected encoder layers of ``model``.

    ``model.named_parameters()`` yields full dotted paths such as
    ``bert.encoder.layer.1.attention.self.query.weight``. Parameters are
    therefore matched by *prefix*; comparing the full path for equality with
    ``bert.encoder.layer.1`` never matches and would freeze nothing.

    Args:
        model: module exposing ``named_parameters()``.
        freeze_layer: iterable of layer indices whose parameters are frozen.
        base_name: dotted prefix that precedes the layer index in parameter
            names. The default fits models whose encoder lives under ``bert``;
            pass a different prefix for other architectures.

    Returns:
        None. ``requires_grad`` is set to ``False`` in place on every matching
        parameter.
    """
    # The trailing dot stops layer 1 from also matching layers 10, 11, ...
    frozen_prefixes = tuple(f'{base_name}{layer}.' for layer in freeze_layer)
    for name, param in model.named_parameters():
        if name.startswith(frozen_prefixes):
            param.requires_grad = False

In [None]:
# Load the selected checkpoint with a 3-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
# Apply freeze() with its default layer list before training starts.
freeze(model)
# Move the model to the selected device; as the cell's last expression, the returned module is also displayed.
model.to(device)

# Train

## Setup

In [18]:
# Number of examples per batch for the train/val/test DataLoaders.
batch_size = 16
# Number of full passes over the training set.
epochs = 4
# The running training loss is printed every `logging_steps` steps.
logging_steps = 100
# Fraction of an epoch between intermediate checkpoints; only referenced by the
# commented-out checkpoint code in the training loop.
save_steps_perc = 0.25
# Initial learning rate handed to the optimizer.
learning_rate = 5e-5

Data loading

In [19]:
# Batch the tokenized splits. Only the training loader shuffles.
# NOTE(review): `optimal_workers`, computed in the device-setup cell, is not passed
# as `num_workers` here — confirm whether that is intentional.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

Optimizer and scheduler

In [20]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated in
# favour of torch.optim.AdamW. `eps` and `weight_decay` are pinned to the defaults
# of the transformers implementation (1e-6 and 0.0), because torch.optim.AdamW
# would otherwise default to 1e-8 and 0.01 and change the training behaviour.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The training loop steps the scheduler once per batch, so the schedule spans
# every batch of every epoch.
num_training_steps = epochs * len(train_dataloader)
# Linear schedule over the whole run, with no warmup steps.
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

## Training Loop

In [None]:
# Fine-tuning loop: for each epoch, train on every batch, save a checkpoint,
# then report loss and accuracy on the validation split.
for epoch in range(epochs):
    model.train()
    # A fresh progress bar per epoch, advanced manually once per batch.
    progress_bar = tqdm(range(len(train_dataloader)))
    # Sum of per-batch losses within this epoch (used for the running mean below).
    total_loss = 0
    start_time = time.time()
    for step, batch in enumerate(train_dataloader):
        # Move every tensor of the batch to the compute device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # Passing `labels` makes the model output expose `.loss`.
        outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'], labels=batch['label'])
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

        # Periodic log (including step 0): mean loss over the steps seen so far
        # in this epoch and the time elapsed since the epoch started.
        if step % logging_steps == 0:
            elapsed_time = time.time() - start_time
            print(f"Epoch {epoch + 1}, Step {step}, Loss: {total_loss / (step + 1)}, Time elapsed: {elapsed_time}s")

        # Save model at checkpoints (currently disabled)
#         if step % int(save_steps_perc * len(train_dataloader)) == 0 and step > 0:
#             model.save_pretrained(f'checkpoint-epoch{epoch+1}-step{step}')

    # Save model after each epoch
    model.save_pretrained(f'model-epoch{epoch+1}')

    # Validation: evaluation mode, no gradient tracking.
    model.eval()
    val_loss = 0
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'], labels=batch['label'])
            loss = outputs.loss
            val_loss += loss.item()
            # Predicted class = index of the largest logit.
            predictions = outputs.logits.argmax(dim=-1)
            correct_predictions += (predictions == batch['label']).sum().item()
            total_predictions += predictions.size(0)
    # Accuracy is over all validation examples; the reported loss is the mean of
    # the per-batch losses.
    val_accuracy = correct_predictions / total_predictions
    print(f"Validation Loss: {val_loss / len(val_dataloader)}, Validation Accuracy: {val_accuracy}")

# Save final model
model.save_pretrained('final_model')

# Testing

In [None]:
# Final evaluation on the test split: evaluation mode, no gradient tracking.
model.eval()
test_loss = 0
correct_predictions = 0
total_predictions = 0
with torch.no_grad():
    for batch in test_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # torch.autocast replaces the deprecated torch.cuda.amp.autocast().
        # Mixed precision is enabled only on CUDA; on the CPU fallback the
        # context is disabled instead of emitting a warning.
        with torch.autocast(device_type=device.type, enabled=device.type == "cuda"):
            outputs = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask'], labels=batch['label'])
        # Only the forward pass needs autocast; the bookkeeping below does not.
        loss = outputs.loss
        test_loss += loss.item()
        # Predicted class = index of the largest logit.
        predictions = outputs.logits.argmax(dim=-1)
        correct_predictions += (predictions == batch['label']).sum().item()
        total_predictions += predictions.size(0)
# Accuracy is over all test examples; the reported loss is the mean of the
# per-batch losses.
test_accuracy = correct_predictions / total_predictions
print(f"Test Loss: {test_loss / len(test_dataloader)}, Test Accuracy: {test_accuracy}")