In [None]:
import requests
from collections import deque
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from rich.progress import Progress, BarColumn, TextColumn, TimeElapsedColumn, TimeRemainingColumn

# Site root: the crawl starts here and only links under it are followed
Base_Url = "https://www.rekhta.org/"
# A link is followed only if its normalized URL contains this substring (poetry pages)
Allowed_Keyword = "shayari"
# A link is skipped if its normalized URL contains this substring (original note: keep Urdu only)
Excluded_Keyword = "lang"
# Upper bound on the number of pages the crawler processes
Max_Pages = 300

# Normalize a URL so that equivalent links compare equal (used for de-duplication and keyword checks)
def Normalize_Url(Url):
    """Return a canonical ``scheme://host/path`` form of ``Url``.

    The host is lower-cased, trailing slashes are removed from the path, and
    the params, query string and fragment are discarded.
    """
    Parts = urlparse(Url)
    Host = Parts.netloc.lower()
    Trimmed_Path = Parts.path.rstrip('/')
    return "{}://{}{}".format(Parts.scheme, Host, Trimmed_Path)

# Extract poetry from a page (the selectors were chosen by inspecting the site's HTML)
def Extract_Poetry(Url):
    """Download ``Url`` and return the shers found on that page.

    Returns:
        A list of strings, one per sher (see ``_Parse_Poetry``), or an empty
        list when the server does not answer with HTTP 200.

    Raises:
        requests.RequestException: if the request itself fails or times out.
    """
    Response = requests.get(Url, timeout=10)
    if Response.status_code != 200:
        return []
    return _Parse_Poetry(BeautifulSoup(Response.content, 'html.parser'))


def _Parse_Poetry(Soup):
    """Collect the shers from an already-parsed page.

    Every ``div.sherSection`` that contains a ``div`` with ``data-roman="on"``
    yields one string: for each ``<p>`` the texts of its ``<span>`` children
    are joined with spaces, and the ``<p>`` lines are joined with newlines.
    """
    Poetry_Data = []
    for Section in Soup.find_all('div', class_='sherSection'):
        Roman_Lines = Section.find('div', {'data-roman': 'on'})
        if not Roman_Lines:
            continue
        Sher = []
        for Line in Roman_Lines.find_all('p'):
            Spans = Line.find_all('span')
            Line_Roman_Text = " ".join(Span.get_text(strip=True) for Span in Spans)
            Sher.append(Line_Roman_Text.strip())
        Poetry_Data.append("\n".join(Sher))
    return Poetry_Data

# Crawling Websites Function
def Crawl_Website(Start_Url):
    """Breadth-first crawl starting at ``Start_Url``; return every sher found.

    A link is queued only when its normalized URL has not been seen before,
    contains ``Allowed_Keyword``, does not contain ``Excluded_Keyword`` and
    lives under ``Base_Url``. The crawl ends when the queue is empty or
    ``Max_Pages`` pages have been fetched successfully. Pages that fail to
    download or do not answer with HTTP 200 are skipped and not counted.

    Returns:
        A list of sher strings in the order the pages were visited.
    """
    Normalized_Start = Normalize_Url(Start_Url)
    Normalized_Base = Normalize_Url(Base_Url)
    # Normalize_Url strips the trailing slash, so a bare startswith() on the
    # base would also accept look-alike hosts ("https://www.rekhta.org.x.com").
    # Requiring the "/" separator pins the match to the real site.
    Base_Prefix = Normalized_Base + "/"

    Seen = {Normalized_Start}
    Queue = deque([Normalized_Start])
    All_Poetry = []
    Crawled_Pages = 0

    with Progress(
        TextColumn("Crawling : "),  # Static label in front of the bar
        BarColumn(),
        TextColumn("{task.percentage:>3.0f}%"),
    ) as Progress_Indicator:

        Progress_Task = Progress_Indicator.add_task("Progress", total=Max_Pages)

        while Queue and Crawled_Pages < Max_Pages:
            Url = Queue.popleft()
            try:
                response = requests.get(Url, timeout=10)
            except requests.RequestException as e:
                print(f"Request failed: {Url} - {e}")
                continue
            if response.status_code != 200:
                continue

            # Parse once and reuse the tree for both the poetry and the links,
            # instead of downloading the same page a second time.
            soup = BeautifulSoup(response.content, 'html.parser')
            All_Poetry.extend(_Parse_Poetry(soup))
            Crawled_Pages += 1

            # Find All Link on that Page
            for Link in soup.find_all('a', href=True):
                try:
                    Clean_Url = Normalize_Url(urljoin(Url, Link['href']))
                except ValueError:
                    # urllib rejects some malformed hrefs (e.g. broken IPv6
                    # literals); one bad link must not abort the whole crawl.
                    continue
                Is_On_Site = Clean_Url == Normalized_Base or Clean_Url.startswith(Base_Prefix)
                # Conditions to follow the link:
                if (
                    Clean_Url not in Seen and
                    Allowed_Keyword in Clean_Url and
                    Excluded_Keyword not in Clean_Url and
                    Is_On_Site
                ):
                    Seen.add(Clean_Url)
                    Queue.append(Clean_Url)
            Progress_Indicator.update(Progress_Task, advance=1)
    return All_Poetry

# Kick off the crawl from the site root
All_Poetry = Crawl_Website(Start_Url=Base_Url)

# Crawl finished
print("Rekhta.org Poetry Scraped Successfully!")

# Persist the collected shers as UTF-8 text, separated by newlines
File_Path = "RekhtaScrapedPoetry.txt"
with open(File_Path, mode="w", encoding="utf-8") as File:
    print("\n".join(All_Poetry), end="", file=File)

print("Scraped Poetry Saved Successfully to {}".format(File_Path))

Output()

Rekhta.org Poetry Scraped Successfully!
Scraped Poetry Saved Successfully to RekhtaScrapedPoetry.txt


In [None]:
import re

# Clean Scraped Poetry Text
def Clean_Poetry(RekhtaScrapedPoetry, RekhtaCleanedPoetry):
    """Normalize the scraped poetry file into one line of plain lowercase words.

    Args:
        RekhtaScrapedPoetry: path of the UTF-8 text file to read.
        RekhtaCleanedPoetry: path of the UTF-8 text file to write (overwritten).

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, and all whitespace runs (including line breaks)
    are collapsed to a single space with no leading or trailing space.
    """
    with open(RekhtaScrapedPoetry, "r", encoding="utf-8") as f:
        ScrapedPoetryText = f.read()

    CleanedPoetry = ScrapedPoetryText.lower()                                                               # Convert to Lowercase
    CleanedPoetry = re.sub(r'[^\w\s]', '', CleanedPoetry)                                                   # Remove Special characters
    # Collapse whitespace only AFTER punctuation is gone: dropping a stand-alone
    # token such as "-" would otherwise leave a double space behind.
    CleanedPoetry = re.sub(r'\s+', ' ', CleanedPoetry).strip()                                              # Remove Extra Spaces

    # Save cleaned text to output file
    with open(RekhtaCleanedPoetry, "w", encoding="utf-8") as File:
        File.write(CleanedPoetry)

    print(f"Rekhta.org Poetry Cleaned Successfully!")
    print(f"Cleaned Poetry Saved Successfully to {RekhtaCleanedPoetry}")


# Input and output files of the cleaning step
ScrapedPoetryFile_Path = "RekhtaScrapedPoetry.txt"
CleanedPoetryFile_Path = "RekhtaCleanedPoetry.txt"

Clean_Poetry(
    RekhtaScrapedPoetry=ScrapedPoetryFile_Path,
    RekhtaCleanedPoetry=CleanedPoetryFile_Path,
)

Rekhta.org Poetry Cleaned Successfully!
Cleaned Poetry Saved Successfully to RekhtaCleanedPoetry.txt


In [None]:
import pickle
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer                                         # type: ignore
from tensorflow.keras.preprocessing.sequence import pad_sequences                                 # type: ignore
from rich.progress import Progress, BarColumn, TextColumn

with open("RekhtaCleanedPoetry.txt", "r", encoding="utf-8") as File:
    CleanedPoetry = File.read()                                                                   # Read Cleaned Poetry

# Train on the first 5001 words only: 5000 (context, next word) pairs
Maximum_Poetry_Words = 5000 + 1
CleanedPoetryWords = CleanedPoetry.split()
CleanedPoetry = " ".join(CleanedPoetryWords[:Maximum_Poetry_Words])

# Tokenization
PoetryTokenizer = Tokenizer()
PoetryTokenizer.fit_on_texts([CleanedPoetry])
Total_Words = len(PoetryTokenizer.word_index) + 1                                                 # add 1 for Padding Token

# Convert Text to Sequences
Input_Sequences = []
Words = CleanedPoetry.split()

# Upper bound for the training window | Increase For Better Result
Sequence_Length_Cap = 50

# Encode every word exactly once. Re-joining and re-tokenizing each growing
# prefix (Words[:i+1]) is quadratic in the corpus size; the token ids of a
# prefix are simply the concatenation of the ids of its words.
Word_Token_Lists = PoetryTokenizer.texts_to_sequences(Words)

with Progress(
    TextColumn("Tokenizing : "),  # Static label in front of the bar
    BarColumn(),
    TextColumn("{task.percentage:>3.0f}%"),
) as Progress_Indicator:
    # One sample per word after the first, so the bar can actually reach 100%
    Progress_Task = Progress_Indicator.add_task("Progress", total=max(len(Words) - 1, 0))

    Running_Tokens = list(Word_Token_Lists[0]) if Word_Token_Lists else []
    for Word_Tokens in Word_Token_Lists[1:]:
        Running_Tokens.extend(Word_Tokens)
        # pad_sequences truncates from the front by default, so only the last
        # Sequence_Length_Cap ids of each prefix would survive anyway; storing
        # just that tail keeps memory linear without changing the result.
        Input_Sequences.append(Running_Tokens[-Sequence_Length_Cap:])
        Progress_Indicator.update(Progress_Task, advance=1)

if not Input_Sequences:
    raise ValueError("RekhtaCleanedPoetry.txt must contain at least two words to build training samples")

# Set Max Sequence Length to 50 | Increase For Better Result
Max_Sequence_Length = min(Sequence_Length_Cap, max(len(seq) for seq in Input_Sequences))

# Padding Sequences
Input_Sequences = pad_sequences(Input_Sequences, maxlen=Max_Sequence_Length, padding="pre")

# Split into Features 'X' and Labels 'y': the last id of each row is the word to predict
X, Y = Input_Sequences[:, :-1], Input_Sequences[:, -1]

print(f"Total Unique Words : {Total_Words}")
print(f"Max Sequence Length : {Max_Sequence_Length}")
print(f"Number of Training Samples : {len(X)}")


# Save the fitted tokenizer so generation can map words <-> ids the same way
Tokenizer_Path = "RomanUrduPoetryTokenizer.pkl"
with open(Tokenizer_Path, "wb") as File:
    pickle.dump(PoetryTokenizer, File)
print(f"Poetry Tokenizer Saved Successfully to: {Tokenizer_Path}")

Output()

Total Unique Words : 1106
Max Sequence Length : 50
Number of Training Samples : 5000
Poetry Tokenizer Saved Successfully to: RomanUrduPoetryTokenizer.pkl


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential                                                  # type: ignore
from tensorflow.keras.layers import Embedding, GRU, Dense                                       # type: ignore

# Assemble the GRU language model one layer at a time
model = Sequential()
model.add(Embedding(Total_Words, 100))                      # Word ids -> 100-dimensional embeddings
model.add(GRU(256, return_sequences=True))                  # First GRU layer passes the full sequence on
model.add(GRU(256))                                         # Second GRU layer keeps only the final state
model.add(Dense(Total_Words, activation="softmax"))         # Probability for every id in the vocabulary

# Labels are integer word ids, hence the sparse cross-entropy loss
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Fit on the (X, Y) pairs built in the tokenization step
epochs = 50                                                 # Increase for Better Result
history = model.fit(X, Y, epochs=epochs, verbose=1)

# Write the trained model to disk
Model_Path = "RomanUrduPoetryModel.keras"
model.save(Model_Path)

print("Poetry Generator Model Saved Successfully to: {}".format(Model_Path))

Epoch 1/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 349ms/step - accuracy: 0.0359 - loss: 6.4830
Epoch 2/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 340ms/step - accuracy: 0.0423 - loss: 5.8254
Epoch 3/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 336ms/step - accuracy: 0.0658 - loss: 5.4406
Epoch 4/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 344ms/step - accuracy: 0.1057 - loss: 4.9427
Epoch 5/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 340ms/step - accuracy: 0.1869 - loss: 4.1473
Epoch 6/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 339ms/step - accuracy: 0.3026 - loss: 3.3608
Epoch 7/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 344ms/step - accuracy: 0.4595 - loss: 2.5477
Epoch 8/50
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 345ms/step - accuracy: 0.6203 - loss: 1.8745
Epoch 9/50
[1m1

In [None]:
import tensorflow as tf
import numpy as np
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences                                 # type: ignore

# Window length used below; the model is fed Max_Sequence_Length - 1 token ids.
# NOTE(review): must match the value printed by the tokenization step - confirm after retraining.
Max_Sequence_Length = 50

# Restore the trained model from its saved file
model = tf.keras.models.load_model("RomanUrduPoetryModel.keras")

# Restore the fitted tokenizer from its saved file.
# pickle.load can execute code embedded in the file: only load a .pkl you created yourself.
with open("RomanUrduPoetryTokenizer.pkl", mode="rb") as File:
    PoetryTokenizer = pickle.load(File)

# Generated Poetry Function
def Generate_Poetry(Seed_Text, Next_Words_Length = 20, Temperature = 0.8):
    """Extend ``Seed_Text`` by sampling words from the trained model.

    Args:
        Seed_Text: starting text; words unknown to the tokenizer are ignored
            as context but kept in the returned string.
        Next_Words_Length: maximum number of words to append.
        Temperature: sampling temperature, must be > 0. Lower values make the
            most likely words more dominant.

    Returns:
        ``Seed_Text`` followed by the sampled words, separated by spaces.

    Raises:
        ValueError: if ``Temperature`` is not greater than 0.
    """
    if Temperature <= 0:
        raise ValueError("Temperature must be greater than 0")

    # Build the id -> word table once instead of scanning word_index per step
    Index_To_Word = {Index: Word for Word, Index in PoetryTokenizer.word_index.items()}

    # Keep the running context as a plain list of ids; padding is applied to a
    # separate array so the context never changes type between iterations.
    Token_List = list(PoetryTokenizer.texts_to_sequences([Seed_Text])[0])

    for _ in range(int(Next_Words_Length)):
        Padded_Tokens = pad_sequences([Token_List], maxlen=Max_Sequence_Length - 1, padding="pre")

        # float64 keeps the renormalized probabilities summing to 1 for np.random.choice
        Predictions = np.asarray(model.predict(Padded_Tokens, verbose=0)[0], dtype="float64")
        Scaled_Logits = np.log(Predictions + 1e-7) / Temperature
        # Id 0 is the padding slot and has no word; never sample it, otherwise
        # generation would stop early on an empty lookup.
        Scaled_Logits[0] = -np.inf
        Exp_Preds = np.exp(Scaled_Logits - np.max(Scaled_Logits))
        Probabilities = Exp_Preds / np.sum(Exp_Preds)

        Predicted = int(np.random.choice(len(Probabilities), p=Probabilities))

        Output_Poetry_Word = Index_To_Word.get(Predicted, "")
        if not Output_Poetry_Word:
            break

        Seed_Text += " " + Output_Poetry_Word
        Token_List.append(Predicted)
    return Seed_Text

# Generate a sample from a fixed seed and show both seed and result
SeedText = "Dil jo tuta"
Generated_Poetry = Generate_Poetry(SeedText, 20, 0.8)
print("Seed Text : {}".format(SeedText))
print("Generated Poetry : {}".format(Generated_Poetry))

Seed Text : Dil jo tuta
Generated Poetry : Dil jo tuta wahshaten hisse mein apne aai hain ki tere ghar bhi pahunch kar sakun na paen hum kaisa firaq kaisi judai


In [None]:
import os
import pickle
import numpy as np
import gradio as gr
import tensorflow as tf
from gradio.themes.base import Base
from tensorflow.keras.preprocessing.sequence import pad_sequences                                 # type: ignore

# Window length used below; the model is fed Max_Sequence_Length - 1 token ids.
# NOTE(review): must match the value used when the model was trained - confirm after retraining.
Max_Sequence_Length = 50

# Restore the trained model from its saved file
model = tf.keras.models.load_model("RomanUrduPoetryModel.keras")

# Restore the fitted tokenizer from its saved file.
# pickle.load can execute code embedded in the file: only load a .pkl you created yourself.
with open("RomanUrduPoetryTokenizer.pkl", mode="rb") as File:
    PoetryTokenizer = pickle.load(File)

def Generate_Poetry(Seed_Text, Poetry_Words_Length=20, Temperature=0.5):
    """Extend ``Seed_Text`` with sampled words and save the result to a file.

    Args:
        Seed_Text: starting text; words unknown to the tokenizer are ignored
            as context but kept in the returned string.
        Poetry_Words_Length: maximum number of words to append (converted with
            ``int`` because it is supplied by a UI slider).
        Temperature: sampling temperature, must be > 0. Lower values make the
            most likely words more dominant.

    Returns:
        A ``(text, file_path)`` tuple: the generated text and the path of the
        UTF-8 file it was written to.

    Raises:
        ValueError: if ``Temperature`` is not greater than 0.
    """
    if Temperature <= 0:
        raise ValueError("Temperature must be greater than 0")

    # Build the id -> word table once instead of scanning word_index per step
    Index_To_Word = {Index: Word for Word, Index in PoetryTokenizer.word_index.items()}

    # Keep the running context as a plain list of ids; padding is applied to a
    # separate array so the context never changes type between iterations.
    Token_List = list(PoetryTokenizer.texts_to_sequences([Seed_Text])[0])

    for _ in range(int(Poetry_Words_Length)):
        Padded_Tokens = pad_sequences([Token_List], maxlen=Max_Sequence_Length - 1, padding="pre")

        # float64 keeps the renormalized probabilities summing to 1 for np.random.choice
        Predictions = np.asarray(model.predict(Padded_Tokens, verbose=0)[0], dtype="float64")
        Scaled_Logits = np.log(Predictions + 1e-7) / Temperature
        # Id 0 is the padding slot and has no word; never sample it, otherwise
        # generation would stop early on an empty lookup.
        Scaled_Logits[0] = -np.inf
        Exp_Preds = np.exp(Scaled_Logits - np.max(Scaled_Logits))
        Probabilities = Exp_Preds / np.sum(Exp_Preds)

        Predicted = int(np.random.choice(len(Probabilities), p=Probabilities))

        output_word = Index_To_Word.get(Predicted, "")
        if not output_word:
            break

        Seed_Text += " " + output_word
        Token_List.append(Predicted)                                            # Update token list

    # Create Generated Poetry File (fixed name, overwritten on every call)
    File_Path = "ShaayerGeneratedPoetry.txt"
    with open(File_Path, "w", encoding="utf-8") as File:
        File.write(Seed_Text)

    return Seed_Text, File_Path

# Custom theme for the UI
class Seafoam(Base):
    """Gradio theme that inherits everything from ``Base``; it is customized only via constructor arguments."""


seafoam = Seafoam(font=gr.themes.GoogleFont("Plus Jakarta Sans"))

# Extra CSS handed to gr.Blocks(css=...). The svelte-* selectors target
# generated class names and may need updating when Gradio is upgraded.
style = """
    .gradio-primary-button {
        background: #007bff;
        color: white;
        font-weight: bold;
        border: none;
        border-radius: 20px;
    }
    .gradio-primary-button:hover {
        background: #0056b3;
    }
    .gradio-dropdown {
        background: #00000000;
    }
    .gradio-secondary-button {
        background: transparent;
        border: 1.5px solid var(--input-border-color);
        font-weight: bold;
        border-radius: 20px;
    }
    .gradio-secondary-button:hover {
        background: var(--input-border-color);
    }
    label.container.show_textbox_border.svelte-173056l textarea.svelte-173056l {
        background:transparent;
        border-radius: 20px;
    }
    div.svelte-633qhp {
        border-radius: 15px;
        overflow-y: hidden;
    }
    span.svelte-1gfkn6j {
        padding-left: 20px;
        font-size:16px;
        font-weight: bold;
    }
    .gradio-container.gradio-container-5-16-0 .contain span.svelte-1gfkn6j {
        padding-left: 12px;
    }
    .icon-button-wrapper.hide-top-corner.svelte-1jx2rq3 {
        border-radius: 20px;
        margin: 5px 6.09px 0px 0px;
        padding: 6px 5.5px 5px 5.5px;
    }
    label.svelte-173056l.svelte-173056l {
        display: block;
        width: 100%;
        padding-left: 10px;
    }
"""

# Build the Gradio page: inputs on one row, then the result, download and generate controls
with gr.Blocks(theme=seafoam, css=style) as app:

    def generate_download_links(seed, word_count, temperature):
        """Click handler: generate the poem and reveal the download button for its file."""
        poem, poem_file = Generate_Poetry(seed, word_count, temperature)
        revealed_button = gr.update(value=poem_file, visible=True)
        return poem, revealed_button

    gr.Markdown("# Shaayer")

    with gr.Row():
        seed_input = gr.Textbox(
            label="Poetry Seed",
            placeholder="Enter your poetry seed here ...",
        )
        num_words = gr.Slider(10, 50, step=5, label="Number of Words", value=20)
        temp = gr.Slider(0.2, 1.0, step=0.1, label="Creativity (Temperature)", value=0.5)

    poetry_output = gr.Textbox(
        label="Generated Poetry",
        placeholder="Generated Poetry will appear here ...",
    )
    # Hidden until the first poem has been generated
    download_btn = gr.DownloadButton(
        "Download Generated Poetry",
        value="generated_poetry.txt",
        visible=False,
        elem_classes=["gradio-secondary-button"],
    )

    generate_button = gr.Button(
        "Generate",
        variant="primary",
        elem_classes=["gradio-primary-button"],
    )

    generate_button.click(
        generate_download_links,
        inputs=[seed_input, num_words, temp],
        outputs=[poetry_output, download_btn],
    )

app.launch(share=True, inbrowser=True)