# **Text Preprocessing**


In [1]:
import re
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from tokenizers import Tokenizer
from collections import Counter

In [None]:
def clean_text(text):
    char_map = {
        'ñ': 'n', 'ā': 'a', 'ī': 'i', 'ū': 'u', 'ḳ': 'k', 'ġ': 'g', 'Ḍ': 'D', 'D': 'D', 'ṣ': 's', 'ż': 'z', 'ḷ': 'l',
        'ḥ': 'h', 'ṭ': 't', 'ḍ': 'd', 'ś': 's', 'ṁ': 'm', 'ṣ': 's', 'é': 'e', 'ó': 'o'
    }

    # Replace special characters
    for special_char, normal_char in char_map.items():
        text = text.replace(special_char, normal_char)

    # Remove dots, commas, quotes, and question marks
    text = re.sub(r'[.,"\'?]', '', text)

    # Remove extra spaces but preserve new lines
    text = re.sub(r'[ \t]+', ' ', text).strip()

    return text

# Read CSV file
df = pd.read_csv('Roman-Urdu-Poetry.csv')

# Apply cleaning function to the 'Poetry' column
df['Poetry'] = df['Poetry'].astype(str).apply(clean_text)

# Print cleaned dataframe
print(df.loc[0, 'Poetry'])
# Save the cleaned DataFrame to a new CSV file
df.to_csv('Cleaned_Roman_Urdu_Poetry.csv', index=False)

# Download the file to your PC
# from google.colab import files
# files.download('Cleaned_Roman_Urdu_Poetry.csv')



aankh se duur na ho dil se utar jaega 
vaqt ka kya hai guzarta hai guzar jaega 
itna manus na ho khalvat-e-gham se apni 
tu kabhi khud ko bhi dekhega to Dar jaega 
Dubte Dubte kashti ko uchhala de duun 
main nahin koi to sahil pe utar jaega 
zindagi teri ata hai to ye jaane vaala 
teri bakhshish tiri dahliz pe dhar jaega 
zabt lazim hai magar dukh hai qayamat ka faraz 
zalim ab ke bhi na roega to mar jaega


# With Charater Pair Encoding

In [None]:
# Character Tokenizer
def char_tokenizer(text):
    vocab = sorted(set(text))
    token_to_id = {char: idx + 1 for idx, char in enumerate(vocab)}  # Reserve 0 for padding
    token_to_id['<PAD>'] = 0
    return token_to_id

# Tokenize the entire dataset
all_text = ' '.join(df['Poetry'].dropna().tolist())
token_to_id = char_tokenizer(all_text)
vocab_size = len(token_to_id)

# Encode Poetry
def encode_poetry(poem):
    return [token_to_id.get(char, 0) for char in poem]  # 0 for unknown characters

poetry_data = df['Poetry'].dropna().tolist()
encoded_poetry = [encode_poetry(poem) for poem in poetry_data]

# Dataset Class
class ShayariDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        seq = self.data[idx]
        return torch.tensor(seq[:-1]), torch.tensor(seq[1:])  # Input-Target pair

# Custom Collate Function for Padding
def collate_fn(batch):
    inputs, targets = zip(*batch)
    inputs_padded = pad_sequence(inputs, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=0)
    return inputs_padded, targets_padded

# DataLoader
dataset = ShayariDataset(encoded_poetry)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True, collate_fn=collate_fn)

# LSTM Model
class ShayariLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers, dropout=0.2):
        super(ShayariLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        output, hidden = self.lstm(x, hidden)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size, hidden_dim, num_layers, device):
        return (torch.zeros(num_layers, batch_size, hidden_dim).to(device),
                torch.zeros(num_layers, batch_size, hidden_dim).to(device))

# Model Initialization
embed_dim = 256
hidden_dim = 512
num_layers = 2
dropout = 0.3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ShayariLSTM(vocab_size, embed_dim, hidden_dim, num_layers, dropout).to(device)

# Training Loop
num_epochs = 100
learning_rate = 0.001
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(1, num_epochs + 1):
    model.train()
    hidden = model.init_hidden(batch_size=32, hidden_dim=hidden_dim, num_layers=num_layers, device=device)

    epoch_loss = 0
    epoch_correct = 0
    epoch_total = 0

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch}/{num_epochs}", leave=False)

    for inputs, targets in progress_bar:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        hidden = tuple(h.detach() for h in hidden)

        output, hidden = model(inputs, hidden)
        output = output.view(-1, output.size(2))
        targets = targets.view(-1)

        loss = criterion(output, targets)
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(output, dim=1)
        mask = targets != 0
        correct_predictions = (predicted[mask] == targets[mask]).sum().item()
        total_valid_tokens = mask.sum().item()

        epoch_loss += loss.item()
        epoch_correct += correct_predictions
        epoch_total += total_valid_tokens

        progress_bar.set_postfix({
            'Loss': loss.item(),
            'Accuracy': f"{correct_predictions / total_valid_tokens:.4f}" if total_valid_tokens > 0 else "N/A"
        })

    avg_loss = epoch_loss / len(dataloader)
    epoch_accuracy = epoch_correct / epoch_total if epoch_total > 0 else 0
    print(f"Epoch {epoch}, Average Loss: {avg_loss:.4f}, Accuracy: {epoch_accuracy:.4f}")
    if epoch % 10 == 0:
        torch.save(model.state_dict(), f"shayari_lstm_epoch{epoch}.pth")
        torch.save(optimizer.state_dict(), f"optimizer_epoch{epoch}.pth")
        print(f"✅ Model saved at epoch {epoch}")

torch.save(model.state_dict(), "shayari_lstm_final.pth")
torch.save(optimizer.state_dict(), "optimizer_final.pth")
print("✅ Final model saved successfully!")



Epoch 1, Average Loss: 2.5046, Accuracy: 0.3100




Epoch 2, Average Loss: 1.9382, Accuracy: 0.4259




Epoch 3, Average Loss: 1.7655, Accuracy: 0.4681




Epoch 4, Average Loss: 1.6620, Accuracy: 0.4948




Epoch 5, Average Loss: 1.5920, Accuracy: 0.5132




Epoch 6, Average Loss: 1.5377, Accuracy: 0.5270




Epoch 7, Average Loss: 1.4982, Accuracy: 0.5364




Epoch 8, Average Loss: 1.4649, Accuracy: 0.5447




Epoch 9, Average Loss: 1.4391, Accuracy: 0.5516




Epoch 10, Average Loss: 1.4165, Accuracy: 0.5567
✅ Model saved at epoch 10




Epoch 11, Average Loss: 1.3913, Accuracy: 0.5635




Epoch 12, Average Loss: 1.3729, Accuracy: 0.5681




Epoch 13, Average Loss: 1.3562, Accuracy: 0.5723




Epoch 14, Average Loss: 1.3417, Accuracy: 0.5760




Epoch 15, Average Loss: 1.3281, Accuracy: 0.5796




Epoch 16, Average Loss: 1.3127, Accuracy: 0.5836




Epoch 17, Average Loss: 1.3020, Accuracy: 0.5862




Epoch 18, Average Loss: 1.2889, Accuracy: 0.5893




Epoch 19, Average Loss: 1.2791, Accuracy: 0.5926




Epoch 20, Average Loss: 1.2689, Accuracy: 0.5953
✅ Model saved at epoch 20




Epoch 21, Average Loss: 1.2561, Accuracy: 0.5987




Epoch 22, Average Loss: 1.2435, Accuracy: 0.6020




Epoch 23, Average Loss: 1.2343, Accuracy: 0.6048




Epoch 24, Average Loss: 1.2255, Accuracy: 0.6075




Epoch 25, Average Loss: 1.2134, Accuracy: 0.6110




Epoch 26, Average Loss: 1.2026, Accuracy: 0.6139




Epoch 27, Average Loss: 1.1927, Accuracy: 0.6171




Epoch 28, Average Loss: 1.1834, Accuracy: 0.6199




Epoch 29, Average Loss: 1.1708, Accuracy: 0.6235




Epoch 30, Average Loss: 1.1639, Accuracy: 0.6257
✅ Model saved at epoch 30




Epoch 31, Average Loss: 1.1495, Accuracy: 0.6299




Epoch 32, Average Loss: 1.1406, Accuracy: 0.6329




Epoch 33, Average Loss: 1.1297, Accuracy: 0.6360




Epoch 34, Average Loss: 1.1198, Accuracy: 0.6392




Epoch 35, Average Loss: 1.1073, Accuracy: 0.6430




Epoch 36, Average Loss: 1.0971, Accuracy: 0.6462




Epoch 37, Average Loss: 1.0863, Accuracy: 0.6492




Epoch 38, Average Loss: 1.0741, Accuracy: 0.6532




Epoch 39, Average Loss: 1.0639, Accuracy: 0.6563




Epoch 40, Average Loss: 1.0506, Accuracy: 0.6607
✅ Model saved at epoch 40




Epoch 41, Average Loss: 1.0395, Accuracy: 0.6641




Epoch 42, Average Loss: 1.0300, Accuracy: 0.6674




Epoch 43, Average Loss: 1.0181, Accuracy: 0.6714




Epoch 44, Average Loss: 1.0045, Accuracy: 0.6753




Epoch 45, Average Loss: 0.9962, Accuracy: 0.6779




Epoch 46, Average Loss: 0.9832, Accuracy: 0.6823




Epoch 47, Average Loss: 0.9738, Accuracy: 0.6853




Epoch 48, Average Loss: 0.9625, Accuracy: 0.6888




Epoch 49, Average Loss: 0.9500, Accuracy: 0.6929




Epoch 50, Average Loss: 0.9382, Accuracy: 0.6970
✅ Model saved at epoch 50




Epoch 51, Average Loss: 0.9283, Accuracy: 0.7003




Epoch 52, Average Loss: 0.9170, Accuracy: 0.7032




Epoch 53, Average Loss: 0.9060, Accuracy: 0.7076




Epoch 54, Average Loss: 0.8969, Accuracy: 0.7098




Epoch 55, Average Loss: 0.8890, Accuracy: 0.7125




Epoch 56, Average Loss: 0.8742, Accuracy: 0.7170




Epoch 57, Average Loss: 0.8667, Accuracy: 0.7195




Epoch 58, Average Loss: 0.8560, Accuracy: 0.7231




Epoch 59, Average Loss: 0.8466, Accuracy: 0.7258




Epoch 60, Average Loss: 0.8374, Accuracy: 0.7290
✅ Model saved at epoch 60




Epoch 61, Average Loss: 0.8274, Accuracy: 0.7319




Epoch 62, Average Loss: 0.8190, Accuracy: 0.7350




Epoch 63, Average Loss: 0.8076, Accuracy: 0.7381




Epoch 64, Average Loss: 0.8000, Accuracy: 0.7404




Epoch 65, Average Loss: 0.7901, Accuracy: 0.7440




Epoch 66, Average Loss: 0.7822, Accuracy: 0.7460




Epoch 67, Average Loss: 0.7748, Accuracy: 0.7487




Epoch 68, Average Loss: 0.7669, Accuracy: 0.7510




Epoch 69, Average Loss: 0.7578, Accuracy: 0.7538




Epoch 70, Average Loss: 0.7532, Accuracy: 0.7551
✅ Model saved at epoch 70




Epoch 71, Average Loss: 0.7438, Accuracy: 0.7578




Epoch 72, Average Loss: 0.7379, Accuracy: 0.7602




Epoch 73, Average Loss: 0.7283, Accuracy: 0.7630




Epoch 74, Average Loss: 0.7213, Accuracy: 0.7652




Epoch 75, Average Loss: 0.7165, Accuracy: 0.7665




Epoch 76, Average Loss: 0.7114, Accuracy: 0.7685




Epoch 77, Average Loss: 0.7043, Accuracy: 0.7710




Epoch 78, Average Loss: 0.6965, Accuracy: 0.7727




Epoch 79, Average Loss: 0.6903, Accuracy: 0.7747




Epoch 80, Average Loss: 0.6848, Accuracy: 0.7765
✅ Model saved at epoch 80




Epoch 81, Average Loss: 0.6780, Accuracy: 0.7787




Epoch 82, Average Loss: 0.6735, Accuracy: 0.7800




Epoch 83, Average Loss: 0.6666, Accuracy: 0.7823




Epoch 84, Average Loss: 0.6581, Accuracy: 0.7849




Epoch 85, Average Loss: 0.6561, Accuracy: 0.7849




Epoch 86, Average Loss: 0.6491, Accuracy: 0.7872




Epoch 87, Average Loss: 0.6446, Accuracy: 0.7894




Epoch 88, Average Loss: 0.6383, Accuracy: 0.7909




Epoch 89, Average Loss: 0.6341, Accuracy: 0.7920




Epoch 90, Average Loss: 0.6287, Accuracy: 0.7935
✅ Model saved at epoch 90




Epoch 91, Average Loss: 0.6231, Accuracy: 0.7954




Epoch 92, Average Loss: 0.6191, Accuracy: 0.7967




Epoch 93, Average Loss: 0.6152, Accuracy: 0.7979




Epoch 94, Average Loss: 0.6101, Accuracy: 0.7994




Epoch 95, Average Loss: 0.6085, Accuracy: 0.8000




Epoch 96, Average Loss: 0.6030, Accuracy: 0.8017




Epoch 97, Average Loss: 0.5976, Accuracy: 0.8034




Epoch 98, Average Loss: 0.5940, Accuracy: 0.8043




Epoch 99, Average Loss: 0.5893, Accuracy: 0.8055


                                                                                           

Epoch 100, Average Loss: 0.5860, Accuracy: 0.8069
✅ Model saved at epoch 100
✅ Final model saved successfully!




In [None]:
# Load Model
model = ShayariLSTM(vocab_size, embed_dim, hidden_dim, num_layers, dropout).to(device)
model.load_state_dict(torch.load("shayari_lstm_final.pth"))  # Load saved weights
model.eval()  # Set model to evaluation mode

# Text Generation Function
def generate_text(start_seq, gen_length=50):
    model.eval()  # Ensure model is in evaluation mode
    tokens = [token_to_id.get(char, 0) for char in start_seq]
    input_seq = torch.tensor(tokens).unsqueeze(0).to(device)
    hidden = model.init_hidden(1, hidden_dim, num_layers, device)

    generated = tokens

    with torch.no_grad():
        for _ in range(gen_length):
            output, hidden = model(input_seq, hidden)
            prob = torch.softmax(output[:, -1, :], dim=-1)
            next_token = torch.argmax(prob, dim=-1).item()

            generated.append(next_token)
            input_seq = torch.tensor([[next_token]]).to(device)

    # Convert IDs back to characters
    id_to_token = {v: k for k, v in token_to_id.items()}
    return ''.join([id_to_token.get(token, '') for token in generated])

# Generate Sample Shayari
sample_shayari = generate_text("tujh pe uthi hain", gen_length=300)
print("\nGenerated Shayari:\n", sample_shayari)



Generated Shayari:
 tujh pe uthi hain sab ke sab khvab bhi ho gaya 
us ki tegh-e-nigah ko abhi tabassum uzr 
yuun to sab kuchh andaz-e-gul-e-tar ka taara 
ki jaise abru-e-asar-e-bahar se vo sun 
main apne aap men sar tha aur bulandi tu 
vasl bhi saaya surashon ka arman ho gaya hasil 
abhi faqirana na main nashsha-e-mai bahut thi 
main 
