Cleaning the dataset

In [2]:
import pandas as pd
import tensorflow as tf
import numpy as np
import os

In [None]:
# 1. Load and Preprocess Data (Modified for Couplet Dataset)
# Each CSV row is one couplet whose two lines live in 'misra1' and 'misra2'.
SEQ_CHUNK_LEN = 100  # characters per chunk; split below into 99-char input/target pairs
BATCH_SIZE = 64      # training pairs per batch

raw_df = pd.read_csv("Ghazal_ur.csv")  # Update path

# Drop couplets where either line is missing; the raw frame stays available
# under its own name so this cell can be re-run safely.
df = raw_df.dropna(subset=['misra1', 'misra2'])

# Flatten the two columns row by row (misra1, misra2, misra1, misra2, ...)
# without a Python-level iterrows() loop. Values are cast to str so a stray
# non-string cell cannot break the join below.
text_entries = df[['misra1', 'misra2']].astype(str).to_numpy().ravel().tolist()
if not text_entries:
    raise ValueError("No complete couplets found in Ghazal_ur.csv")
urdu_text = '\n'.join(text_entries)  # Each line is either misra1 or misra2

# Character-level vocabulary and the two lookup directions.
vocab = sorted(set(urdu_text))
char_to_index = {u: i for i, u in enumerate(vocab)}
index_to_char = np.array(vocab)
text_as_int = np.array([char_to_index[c] for c in urdu_text])

# Cut the id stream into fixed-size chunks; each chunk becomes an
# (input, target) pair where the target is the input shifted by one character.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(SEQ_CHUNK_LEN, drop_remainder=True)
dataset = sequences.map(lambda chunk: (chunk[:-1], chunk[1:])).batch(BATCH_SIZE, drop_remainder=True)

In [4]:
from tensorflow.keras import metrics

# Model hyperparameters.
vocab_size = len(vocab)   # one output logit per distinct character
embedding_dim = 256
rnn_units = 1024

# Training model: character embedding -> GRU emitting an output at every
# timestep -> dense projection to vocabulary-sized logits.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
model.add(tf.keras.layers.GRU(rnn_units, return_sequences=True))
model.add(tf.keras.layers.Dense(vocab_size))

def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    Args:
        labels: integer class ids (the target characters).
        logits: unnormalized scores produced by the model; ``from_logits=True``
            tells Keras they have not been passed through a softmax.

    Returns:
        The value returned by ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels,
        y_pred=logits,
        from_logits=True,
    )
    return crossentropy

# Adam optimizer, the logits-based loss defined above, and sparse categorical
# accuracy as the reported metric.
model.compile(
    loss=loss,
    optimizer='adam',
    metrics=[metrics.SparseCategoricalAccuracy()],
)


In [5]:
# Set up checkpoints
# Weights are written to ./training_checkpoints; "{epoch}" in the file name is
# a placeholder that the ModelCheckpoint callback fills in.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}.weights.h5")
os.makedirs(checkpoint_dir, exist_ok=True)

checkpoint_options = {
    "filepath": checkpoint_prefix,
    "save_weights_only": True,  # store weights only, not the full model
}
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [6]:
EPOCHS = 30
training_callbacks = [checkpoint_callback]
history = model.fit(dataset, callbacks=training_callbacks, epochs=EPOCHS)

# 4. Text Generation (Same with Urdu Script Support)
# Inference copy of the network: same layer stack as the training model, but
# the GRU is stateful so its hidden state carries over between calls.
model_2 = tf.keras.Sequential()
model_2.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
model_2.add(tf.keras.layers.GRU(rnn_units, return_sequences=True, stateful=True))
model_2.add(tf.keras.layers.Dense(vocab_size))

# Build for a batch of one sequence of arbitrary length.
inference_input_shape = tf.TensorShape([1, None])
model_2.build(inference_input_shape)

Epoch 1/30
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 58ms/step - loss: 3.2018 - sparse_categorical_accuracy: 0.2652
Epoch 2/30
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 56ms/step - loss: 2.2610 - sparse_categorical_accuracy: 0.3670
Epoch 3/30
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 57ms/step - loss: 2.0720 - sparse_categorical_accuracy: 0.4066
Epoch 4/30
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 58ms/step - loss: 1.9402 - sparse_categorical_accuracy: 0.4365
Epoch 5/30
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 59ms/step - loss: 1.8494 - sparse_categorical_accuracy: 0.4594
Epoch 6/30
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - loss: 1.7802 - sparse_categorical_accuracy: 0.4768
Epoch 7/30
[1m145/145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 60ms/step - loss: 1.7222 - sparse_categorical_accuracy: 0.4915
Epoch 8/30
[1m145/

In [11]:
import re

# Only files that exactly follow the "ckpt_<epoch>.weights.h5" naming scheme
# are considered, so an unrelated *.weights.h5 file in the directory cannot
# crash the epoch extraction.
CHECKPOINT_NAME_RE = re.compile(r'ckpt_(\d+)\.weights\.h5')

checkpoint_epochs = {}
for file_name in os.listdir(checkpoint_dir):
    name_match = CHECKPOINT_NAME_RE.fullmatch(file_name)
    if name_match:
        checkpoint_epochs[file_name] = int(name_match.group(1))

# Sort by epoch number (numeric part of filename), so ckpt_10 follows ckpt_9.
checkpoint_files = sorted(checkpoint_epochs, key=checkpoint_epochs.get)

print(checkpoint_files)
if not checkpoint_files:
    # Raise instead of calling exit(): in a notebook an exception stops the
    # cell with a readable error, while exit() ends the whole session.
    raise FileNotFoundError(f"No checkpoint found! (looked in {checkpoint_dir!r})")

# Highest epoch number wins.
latest_checkpoint = os.path.join(checkpoint_dir, checkpoint_files[-1])
model_2.load_weights(latest_checkpoint)
print(f"Loaded weights from: {latest_checkpoint}")

['ckpt_1.weights.h5', 'ckpt_2.weights.h5', 'ckpt_3.weights.h5', 'ckpt_4.weights.h5', 'ckpt_5.weights.h5', 'ckpt_6.weights.h5', 'ckpt_7.weights.h5', 'ckpt_8.weights.h5', 'ckpt_9.weights.h5', 'ckpt_10.weights.h5', 'ckpt_11.weights.h5', 'ckpt_12.weights.h5', 'ckpt_13.weights.h5', 'ckpt_14.weights.h5', 'ckpt_15.weights.h5', 'ckpt_16.weights.h5', 'ckpt_17.weights.h5', 'ckpt_18.weights.h5', 'ckpt_19.weights.h5', 'ckpt_20.weights.h5', 'ckpt_21.weights.h5', 'ckpt_22.weights.h5', 'ckpt_23.weights.h5', 'ckpt_24.weights.h5', 'ckpt_25.weights.h5', 'ckpt_26.weights.h5', 'ckpt_27.weights.h5', 'ckpt_28.weights.h5', 'ckpt_29.weights.h5', 'ckpt_30.weights.h5']
Loaded weights from: ./training_checkpoints/ckpt_30.weights.h5


In [20]:
# Generate poetry with Urdu script seed
num_generate = 130  # number of characters to generate after the seed
seed_text = "دل کی بات لبوں پہ نہ لانا کبھی"  #

# Clean seed text using dataset vocabulary: characters the model has never
# seen are replaced by a space.
seed_text = ''.join(c if c in char_to_index else ' ' for c in seed_text)
if not seed_text:
    raise ValueError("seed_text must contain at least one character")
input_eval = [char_to_index[c] for c in seed_text]
input_eval = tf.expand_dims(input_eval, 0)  # add the batch dimension -> (1, len(seed))

# Start from a clean recurrent state so earlier runs of this cell do not leak in.
model_2.layers[1].reset_states()

text_generated = [seed_text]
for _ in range(num_generate):
    # Call the model directly rather than model.predict(): predict() is meant
    # for batch inference over large inputs and adds per-call overhead when
    # invoked once per generated character.
    predictions = model_2(input_eval, training=False)
    predictions = tf.squeeze(predictions, 0)  # drop the batch dimension
    # Greedy decoding: take the most likely character at the last timestep.
    # A plain int keeps the fed-back tensor's dtype the same as the seed's.
    predicted_id = int(np.argmax(predictions[-1]))
    text_generated.append(index_to_char[predicted_id])
    # Feed only the new character back; the stateful GRU keeps the context.
    input_eval = tf.expand_dims([predicted_id], 0)

generated_poetry = ''.join(text_generated)
print("\n📜 Generated Poetry:\n")
print(generated_poetry)


📜 Generated Poetry:

دل کی بات لبوں پہ نہ لانا کبھی تمام شب
دل خواب جو پنساں در بار دوست 
دور سبز دشت عشق تو دیکھوں گا جسے آہ تو مر چھائے ہوئے
احباب پا سفر در کیا ہماریاں
دشت میں جا
