In [99]:
# The code is importing various modules and helper functions from different files.

import re
import time

from bs4 import BeautifulSoup, NavigableString, ResultSet, Tag

import rokomari_scrap_helper as rsh
from helpers import common_helper as ch
from helpers import json_helper as jh
from helpers import scraping_helper as sh
from models.book import Book
from models.book_category import BookCategory

In [100]:
book_category_list: list[BookCategory] = rsh.get_book_category_list()

print("Total Book Categories Count: ", book_category_list.__len__())

dynamic_book_category_list = rsh.get_book_categories_containing_url_segment(
    book_category_list, "/book/category"
)
print("Total Dynamic Book Categories Count: ", dynamic_book_category_list.__len__())

static_book_category_list = rsh.get_book_categories_not_containing_url_segment(
    book_category_list, "/book/category"
)
print("Total Static Book Categories Count: ", static_book_category_list.__len__())

Total Book Categories Count:  58
Total Dynamic Book Categories Count:  54
Total Static Book Categories Count:  4


In [101]:
# iterate over dynamic_book_category_list to get the desired book list from all available categories
book_category = dynamic_book_category_list[0]

pagination_response = sh.get_http_response(book_category.url)

book_list: list[Book] = []

if pagination_response.status_code != 200:
    book_list = []
    
else:
    pagination_soup = sh.parse_html_content_as_string(pagination_response.text)

In [102]:
pagination_element = sh.find_one_by_class_name(pagination_soup, 'div', 'pagination')

pagination_list = pagination_element.text.strip().split()
numbers = [int(x) for x in pagination_list if x.isdigit()]
largest_page_number = max(numbers)
print(f"Total Page Numbers: {largest_page_number}")

Total Page Numbers: 167


In [104]:
book_list: list[Book] = []

for value in range(largest_page_number):
    list_page_url = f"{book_category.url}&page={value+1}"

    book_list_page_response = sh.get_http_response(list_page_url)

    book_card_list: list[Tag] = []


    if book_list_page_response.status_code != 200:
        book_list = book_list


    else:

        book_list_page_soup = sh.parse_html_content_as_string(
            book_list_page_response.text
        )


        book_card_list: list[Tag] = sh.find_all_by_class_name(
            book_list_page_soup, "div", "books-wrapper__item"
        )

        for book_card_item in book_card_list:

            # book_url
            tag_item = sh.find_all_by_tag_name(book_card_item, "a")[1]
            all_book_urls = sh.get_value_of_attributes([tag_item], "href")
            book_url = rsh.base_url + all_book_urls[0].get("value").strip()

            # image_url
            book_image_item = sh.find_all_by_tag_name(book_card_item, "img")[1]
            all_image_urls = sh.get_value_of_attributes([book_image_item], "src")
            image_url = rsh.base_url + all_image_urls[0].get("value").strip()

            # book-title
            book_title_item = sh.find_one_by_class_name(
                book_card_item, "h4", "book-title"
            )
            book_title = book_title_item.text.strip()

            # book-author
            book_author_item = sh.find_one_by_class_name(
                book_card_item, "p", "book-author"
            )
            book_author = book_author_item.text.strip()

            # book-status text-capitalize
            book_available_item = sh.find_one_by_class_name(
                book_card_item, "p", "book-status text-capitalize"
            )
            is_book_available = (
                book_available_item.text.strip().upper() == "Product in stock".upper()
            )

            # book-price
            has_discount = False
            book_original_price = 0
            book_current_price = 0

            if book_card_item.text.strip().upper().count("TK.") == 1:
                has_discount = True

                try:
                    # original-price pl-2
                    book_original_price_item = sh.find_one_by_class_name(
                        book_card_item, "strike", "original-price pl-2"
                    )
                    book_original_price_text = (
                        book_original_price_item.text.strip().split()[1]
                    )

                    book_original_price = (float)(
                        book_original_price_text.replace(",", "")
                    )
                    book_current_price = book_original_price
                except:
                    book_original_price = 0

                try:
                    book_current_price_text = (
                        book_card_item.select_one(".book-price").contents[-1].strip()
                    )

                    book_current_price = (float)(
                        book_current_price_text.split()[-1].replace(",", "")
                    )
                except:
                    book_current_price = 0

                book_original_price = book_current_price
            else:
                has_discount = False

                try:
                    # original-price pl-2
                    book_original_price_item = sh.find_one_by_class_name(
                        book_card_item, "strike", "original-price pl-2"
                    )
                    book_original_price_text = (
                        book_original_price_item.text.strip().split()[1]
                    )
                    book_original_price = (float)(
                        book_original_price_text.replace(",", "")
                    )
                except:
                    book_original_price = 0

                try:
                    book_current_price_text = (
                        book_card_item.select_one(".book-price").contents[-1].strip()
                    )

                    book_current_price = (float)(
                        book_current_price_text.split()[-1].replace(",", "")
                    )
                except:
                    book_current_price = 0

            book = Book(
                title=book_title,
                author=book_author,
                isAvailable=is_book_available,
                originalPrice=book_original_price,
                currentPrice=book_current_price,
                imageUrl=image_url,
                bookUrl=list_page_url,
                category=book_category,
            )

            book_list.append(book)

print(f"Books Count: {book_list.__len__()}")
print(f"Books: \n {jh.data_to_json_string(book_list)}")

Books Count: 9960
Books: 
 [
    {
        "title": "কল সেন্টারের অপরাজিতা",
        "author": "রাহিতুল ইসলাম",
        "isAvailable": true,
        "originalPrice": 200.0,
        "currentPrice": 150.0,
        "imageUrl": "https://www.rokomari.comhttps://ds.rokomari.store/rokomari110/ProductNew20190903/130X186/book_av.png",
        "bookUrl": "https://www.rokomari.com/book/category/1983/extra-discount?ref=act_pg0_p0&page=1",
        "category": {
            "name": "অতিরিক্ত ছাড়",
            "url": "https://www.rokomari.com/book/category/1983/extra-discount?ref=act_pg0_p0"
        }
    },
    {
        "title": "গ্রিনিট",
        "author": "যুবায়ের আহমেদ",
        "isAvailable": true,
        "originalPrice": 300.0,
        "currentPrice": 225.0,
        "imageUrl": "https://www.rokomari.comhttps://ds.rokomari.store/rokomari110/ProductNew20190903/130X186/book_av.png",
        "bookUrl": "https://www.rokomari.com/book/category/1983/extra-discount?ref=act_pg0_p0&page=1",
        "ca