<a href="https://colab.research.google.com/github/mwzkhalil/paraphrase-bert-ur/blob/main/para_bert_ur.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForMaskedLM

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
csv_path = '/content/drive/MyDrive/ur_paraphrase_30k.csv'
df = pd.read_csv(csv_path)

In [None]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
class UrduParaphraseDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['sentence1']
        paraphrase = self.data.iloc[index]['sentence2']

        encoded = self.tokenizer.encode_plus(
            text,
            paraphrase,
            add_special_tokens=True,
            padding='max_length',
            max_length=self.max_length,
            truncation='only_first',  # Update truncation strategy
            return_tensors='pt'
        )

        input_ids = encoded['input_ids'].squeeze()
        attention_mask = encoded['attention_mask'].squeeze()

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForMaskedLM.from_pretrained('bert-base-multilingual-cased')

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [None]:
epochs = 5
batch_size = 16
learning_rate = 2e-5

In [None]:
train_dataset = UrduParaphraseDataset(train_df, tokenizer, max_length=128)
val_dataset = UrduParaphraseDataset(val_df, tokenizer, max_length=128)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
model.train()

for epoch in range(epochs):
  
    running_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits.view(-1, logits.shape[-1]), input_ids.view(-1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    epoch_loss = running_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f}')

Epoch 1/5 - Loss: 0.0574
Epoch 2/5 - Loss: 0.0001
Epoch 3/5 - Loss: 0.0000
Epoch 4/5 - Loss: 0.0000
Epoch 5/5 - Loss: 0.0000


In [None]:
model.eval()

val_loss = 0.0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        loss = criterion(logits.view(-1, logits.shape[-1]), input_ids.view(-1))
        val_loss += loss.item()

val_loss /= len(val_loader)
print(f'Validation Loss: {val_loss:.4f}')

Validation Loss: 0.0000


In [None]:
input_text = "تصوراتی طور پر کریم سکمنگ کی دو بنیادی جہتیں ہیں - مصنوعات اور جغرافیہ۔"

In [None]:
tokenized_input = tokenizer.encode_plus(
    input_text,
    add_special_tokens=True,
    padding='max_length',
    max_length=128,
    truncation='only_first',
    return_tensors='pt'
)
input_ids = tokenized_input['input_ids'].to(device)
attention_mask = tokenized_input['attention_mask'].to(device)

In [None]:
model.eval()
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,  # Adjust as needed
        num_return_sequences=1  # Set num_return_sequences to 1
    )

paraphrases = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Input length of input_ids is 128, but `max_length` is set to 128. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [None]:
print("Original Input Text:", input_text)
print("Generated Paraphrases:")
for paraphrase in paraphrases:
    print(paraphrase)


Original Input Text: تصوراتی طور پر کریم سکمنگ کی دو بنیادی جہتیں ہیں - مصنوعات اور جغرافیہ۔
Generated Paraphrases:
تصوراتی طور پر کریم سکمنگ کی دو بنیادی جہتیں ہیں - مصنوعات اور جغرافیہ ۔


In [None]:
save_path = '/content/drive/MyDrive/ur_paraphrasing/'
model.save_pretrained(save_path)