In [2]:
!pip install transformers datasets sacrebleu sentencepiece --quiet

In [3]:
print('hello')

hello


In [4]:
import torch
from datasets import load_dataset
from transformers import (
    MBartForConditionalGeneration,
    MBart50TokenizerFast,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

Using device: cuda


In [6]:
dataset = load_dataset("persiannlp/parsinlu_translation_en_fa", cache_dir="./cache")

0000.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

parsinlu-repo/validation/0000.parquet:   0%|          | 0.00/242k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1621665 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/48359 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2137 [00:00<?, ? examples/s]

In [7]:
max_samples = 100000

train_dataset = dataset['train'].shuffle(seed=42).select(range(max_samples))
# validation_dataset = dataset['validation'].shuffle(seed=42).select(range(int(max_samples * 0.1)))
validation_dataset = dataset['validation'].shuffle(seed=42)

In [8]:
def preprocess_function(examples):
    inputs = [f"translate English to Persian: {source}" for source in examples['source']]

    targets = [target[0] if isinstance(target, list) else target for target in examples['targets']]

    model_inputs = tokenizer(inputs, max_length=64, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=64, truncation=True, padding="max_length").input_ids


    labels = [[(label if label != tokenizer.pad_token_id else -100) for label in label_set] for label_set in labels]


    model_inputs["labels"] = labels

    return model_inputs

In [9]:
model_name = 'facebook/mbart-large-50-many-to-many-mmt'

tokenizer = MBart50TokenizerFast.from_pretrained(model_name)
model = MBartForConditionalGeneration.from_pretrained(model_name, low_cpu_mem_usage=True)

tokenizer.src_lang = 'en_XX'
tokenizer.tgt_lang = 'fa_IR'

model.to(device)

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): MBartScaledWordEmbedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
        

In [10]:
tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=["category"])
tokenized_valid = validation_dataset.map(preprocess_function, batched=True, remove_columns=["category"])

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2137 [00:00<?, ? examples/s]

In [11]:
sample_sentence = "This is a test sentence for tokenization."

tokenized_output = tokenizer(sample_sentence)

print("Tokenized Output:", tokenized_output)
print("Token IDs:", tokenized_output['input_ids'])
print("Attention Mask:", tokenized_output['attention_mask'])

tokens = tokenizer.convert_ids_to_tokens(tokenized_output['input_ids'])
print("Tokens:", tokens)

Tokenized Output: {'input_ids': [250004, 3293, 83, 10, 3034, 149357, 100, 47, 1098, 47691, 5, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Token IDs: [250004, 3293, 83, 10, 3034, 149357, 100, 47, 1098, 47691, 5, 2]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens: ['en_XX', '▁This', '▁is', '▁a', '▁test', '▁sentence', '▁for', '▁to', 'ken', 'ization', '.', '</s>']


In [12]:
# Freeze all encoder layers
# for param in model.model.encoder.parameters():
#     param.requires_grad = False

# # Unfreeze the last 5 layers of the encoder
# for param in model.model.encoder.layers[-5:].parameters():
#     param.requires_grad = True

# Now the model's last 5 encoder layers are trainable, and the rest are frozen.

In [13]:
import logging
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

In [14]:
from torch.cuda.amp import autocast, GradScaler

writer = SummaryWriter(log_dir='./logs')

logging.basicConfig(filename='./logs/training.log',
                    filemode='a',
                    format='%(asctime)s - %(message)s', 
                    level=logging.INFO)

logger = logging.getLogger()

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_valid.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

batch_size = 16
train_dataloader = torch.utils.data.DataLoader(tokenized_train, batch_size=batch_size, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(tokenized_valid, batch_size=batch_size)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

accumulation_steps = 2
num_epochs = 2


scaler = GradScaler()


checkpoint_dir = '/kaggle/working/'

for epoch in range(num_epochs):
    logger.info(f"Epoch {epoch + 1}/{num_epochs} started")

    model.train()
    total_train_loss = 0
    optimizer.zero_grad()

    for step, batch in enumerate(tqdm(train_dataloader), start=1):
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        with autocast():
            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss / accumulation_steps

        scaler.scale(loss).backward()

        if (step + 1) % accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        total_train_loss += loss.item() * accumulation_steps

        if step % 100 == 0:
            logger.info(f"Step {step}: Training loss = {loss.item()}")
            writer.add_scalar('Training Loss', loss.item(), global_step=step + (epoch * len(train_dataloader)))

    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info(f"Training loss after epoch {epoch + 1}: {avg_train_loss}")
    writer.add_scalar('Average Training Loss', avg_train_loss, global_step=epoch)

    model.eval()
    total_eval_loss = 0
    for batch in valid_dataloader:
        with torch.no_grad():
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            with autocast():
                outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(valid_dataloader)
    logger.info(f"Validation loss after epoch {epoch + 1}: {avg_eval_loss}")
    writer.add_scalar('Validation Loss', avg_eval_loss, global_step=epoch)

    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': avg_train_loss,
        'valid_loss': avg_eval_loss,
    }
    torch.save(checkpoint, checkpoint_dir + f'model_epoch_{epoch + 1}.pth')
    logger.info(f"Checkpoint saved for epoch {epoch + 1}")

final_model_path = checkpoint_dir + 'final_model.pth'
torch.save(model.state_dict(), final_model_path)
logger.info("Final model saved for inference.")
writer.close()

  scaler = GradScaler()
  with autocast():
100%|██████████| 6250/6250 [1:13:00<00:00,  1.43it/s]
  with autocast():
100%|██████████| 6250/6250 [1:12:58<00:00,  1.43it/s]


In [None]:
torch.cuda.empty_cache()

In [15]:
!pip install mega.py

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting mega.py
  Downloading mega.py-1.0.8-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pathlib==1.0.1 (from mega.py)
  Downloading pathlib-1.0.1-py3-none-any.whl.metadata (5.1 kB)
Collecting tenacity<6.0.0,>=5.1.5 (from mega.py)
  Downloading tenacity-5.1.5-py2.py3-none-any.whl.metadata (1.2 kB)
Downloading mega.py-1.0.8-py2.py3-none-any.whl (19 kB)
Downloading pathlib-1.0.1-py3-none-any.whl (14 kB)
Downloading tenacity-5.1.5-py2.py3-none-any.whl (34 kB)
Installing collected packages: pathlib, tenacity, mega.py
  Attempting uninstall: tenacity
    Found existing installation: tenacity 8.3.0
    Uninstalling tenacity-8.3.0:
      Successfully uninstalled tenacity-8.3.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
plotly 5.22.0 requires tenacity>=6.2.0, but you have tenacity 5.1.5 which is incompatible.[0m[31m
[0mSuccessfully installed mega.p

In [19]:
from mega import Mega

mega = Mega()

email = 'abtin.mansouri2003@gmail.com'
password = 'zyszid-4rygro-tImkeq'

m = mega.login(email, password)

In [None]:
file_path = '/kaggle/working/final_model.pth'

file = m.upload(file_path)

link = m.get_upload_link(file)

print(link)

In [52]:
def translate(text, tokenizer, model, source_lang='en_XX', target_lang='fa_IR', device='cpu', max_length=512):
    tokenizer.src_lang = source_lang
    tokenizer.tgt_lang = target_lang
    
    inputs = tokenizer(text, return_tensors='pt', max_length=max_length, truncation=True).to(device)

    with torch.no_grad():
        generated_tokens = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            num_beams=10,
            length_penalty=2.0,
            early_stopping=True
        )

    translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    return translation

if __name__ == "__main__":
    model_path = '/kaggle/working/final_model.pth'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    tokenizer, model = load_model_and_tokenizer(model_path)
    model.to(device)

    text = " for what feels like an eternity."
    translation = translate(text, tokenizer, model, source_lang='en_XX', target_lang='fa_IR', device=device)
    print(f"Original Text: {text}")
    print(f"Translated Text: {translation}")


Original Text:  for what feels like an eternity.
Translated Text: براي چيزي كه مثل يک ابديت احساس ميشه
