# Training an Urdu Language Model using ALBERT Architecture

In this notebook, we'll explore the process of training a language model for the Urdu language using the ALBERT architecture. We will use the Hugging Face Transformers library to perform tokenization, model configuration, and training.

## Imports and Setup

In [None]:
# Pakcages
!pip install transformers sentencepiece datasets

# libraries
import sentencepiece as sp
from datasets import load_dataset
from transformers import (
    AlbertForMaskedLM,
    AlbertConfig,
    AlbertTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments
)

import os

import huggingface_hub
from huggingface_hub import HfFolder

import wandb

# Credentials are read from the environment (for example Kaggle/Colab secrets
# exported as HF_TOKEN and WANDB_API_KEY) instead of being embedded in the notebook.
# When a variable is unset, None is passed and each library falls back to its own
# interactive / cached-credential login flow.
# SECURITY: the access tokens that used to be hard-coded in this cell have been
# exposed and must be revoked and regenerated.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))
wandb.login(key=os.environ.get("WANDB_API_KEY"))

## Downloading and Preprocessing Dataset

In [None]:
dataset = load_dataset("anuragshas/ur_opus100_processed")

train_dataset = dataset['train']
validation_dataset = dataset['train'].train_test_split(test_size=0.1)['test']

In [None]:
train_data = train_dataset['text']
validation_data = validation_dataset['text']

In [None]:
with open('ur.txt', 'w', encoding='utf-8') as f:
    for item in train_data:
        f.write(item + '\n')
        
with open('val_ur.txt', 'w', encoding='utf-8') as f:
    for item in validation_data:
        f.write(item + '\n')

## Tokenization 
Using SentencePiece to create a custom tokenizer for our Urdu dataset.

In [None]:
sp.SentencePieceTrainer.train(input="/kaggle/working/ur.txt",model_prefix='spiece', vocab_size=23319)

In [None]:
sp.SentencePieceTrainer.train(input="/kaggle/working/val_ur.txt", model_prefix='spiece_val', vocab_size=14354)

In [None]:
import os

# Gather the SentencePiece artefacts in the directory the tokenizer is loaded from.
# The cell is safe to re-run: the directory may already exist, a freshly trained
# artefact overwrites the previous copy, and an artefact that was already moved
# (source gone, destination present) is left alone. If neither source nor
# destination exists, os.replace still raises FileNotFoundError.
os.makedirs('Urdu_Model', exist_ok=True)
for artefact in ('spiece.model', 'spiece.vocab'):
    destination = os.path.join('Urdu_Model', artefact)
    if os.path.exists(artefact) or not os.path.exists(destination):
        os.replace(artefact, destination)

In [None]:
# Wrap the trained SentencePiece model in an ALBERT tokenizer.
# keep_accents=True is required for Urdu: with the default (False) the tokenizer
# NFKD-decomposes the text and drops combining marks before tokenizing, which
# removes diacritics and alters letters written with hamza/madda, so the input
# would no longer match the text the SentencePiece model was trained on.
urdu_tokenizer = AlbertTokenizer.from_pretrained('Urdu_Model', keep_accents=True)

# Persist the tokenizer configuration (including keep_accents) next to spiece.model.
urdu_tokenizer.save_pretrained('Urdu_Model')

## Model Initialization and Config

In [None]:
# Start from the albert-large-v2 hyper-parameters, but size the embedding table
# for the Urdu tokenizer instead of the original 30000-entry English vocabulary.
# len(tokenizer) counts the SentencePiece pieces plus any added special tokens,
# so every id the tokenizer can emit has an embedding row.
config = AlbertConfig.from_pretrained('albert-large-v2', vocab_size=len(urdu_tokenizer))

# Store the configuration alongside the tokenizer files.
config.save_pretrained('Urdu_Model')

In [None]:
# Build the masked-LM model from the configuration object alone
# (no `from_pretrained` call is made for the model here).
urdu_model = AlbertForMaskedLM(config)

## Datasets and Data Collator

In [None]:
# Build one dataset per text file; block_size is passed as the per-example token
# limit. The paths are relative so they resolve to the same files that were
# written with open('ur.txt') / open('val_ur.txt'), whatever the working directory.
train_line_by_line_dataset = LineByLineTextDataset(
    tokenizer=urdu_tokenizer,
    file_path="ur.txt",
    block_size=256,
)

validation_line_by_line_dataset = LineByLineTextDataset(
    tokenizer=urdu_tokenizer,
    file_path="val_ur.txt",
    block_size=256,
)

# Masked-language-modelling collator configured with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=urdu_tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

## Training Config

In [None]:
# Used below both as the local output directory of the training run and as the
# Hub model id the results are pushed to.
repository_id = "mwz/UrduALBERT"

In [None]:
training_args = TrainingArguments(
    # Local locations for checkpoints and logs.
    output_dir=repository_id,
    logging_dir=f"{repository_id}/logs",
    # Optimisation schedule.
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=1000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and log on a step basis, every 500 steps each.
    evaluation_strategy="steps",
    eval_steps=500,
    logging_strategy="steps",
    logging_steps=500,
    report_to="tensorboard",
    # Checkpoint once per epoch, with the checkpoint limit set to two.
    save_strategy="epoch",
    save_total_limit=2,
    # Hub upload settings; the token is the one stored by the earlier login.
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repository_id,
    hub_token=HfFolder.get_token(),
)

## Training

In [None]:
# Passing the tokenizer lets the Trainer save it next to every checkpoint and
# upload it with the model; without it the pushed repository contains the model
# weights but no tokenizer files (those were only written to 'Urdu_Model').
# The explicit data_collator is still the one used for batching.
trainer = Trainer(
    model=urdu_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_line_by_line_dataset,
    eval_dataset=validation_line_by_line_dataset,
    tokenizer=urdu_tokenizer,
)

# Run the training loop configured by training_args.
trainer.train()

### Pushing it to [Hub](https://huggingface.co/mwz/UrduALBERT)

In [None]:
# Ask the Trainer to generate a model card, then upload the results to the Hub.
trainer.create_model_card()
trainer.push_to_hub()