In [2]:
import os
import requests
from bs4 import BeautifulSoup, element as el

In [4]:
# Base URL of the site being scraped; relative hrefs are resolved against it.
ROOT_URL = 'https://www.tagoreweb.in/'
# Name of the output directory created under the shared data folder.
ROOT_DIR = 'tagoreweb.in'
# Output root, relative to the notebook's working directory.
ROOT_PATH = os.path.join('../../data', ROOT_DIR)
# makedirs (unlike mkdir) also creates the missing '../../data' parent, and
# exist_ok=True makes re-running this cell a no-op instead of an error.
os.makedirs(ROOT_PATH, exist_ok=True)

In [7]:
import json
import re
from time import sleep
from urllib.parse import urljoin

def _parse_poem_page(poem_html: str, poem_link: str) -> dict:
    """Extract title and body text from the HTML of a single poem/song page.

    Args:
        poem_html: Raw HTML of the page.
        poem_link: Absolute URL the HTML was fetched from (stored in the result).

    Returns:
        A dict with the keys ``title``, ``author``, ``url`` and ``content``.

    Raises:
        AssertionError: If the page does not contain exactly one
            ``div.content-right`` or that div does not contain exactly one ``h2``.
    """
    # Pin the parser so results do not depend on which optional parsers
    # (lxml, html5lib) happen to be installed.
    poem_soup = BeautifulSoup(poem_html, 'html.parser')
    content_divs = poem_soup.find_all('div', attrs={'class': 'content-right'})
    assert len(content_divs) == 1, len(content_divs)
    content_div: el.Tag = content_divs[0]

    # get title (collapse any run of whitespace into a single space)
    headings = content_div.find_all('h2')
    assert len(headings) == 1, len(headings)
    title = re.sub('\\s+', ' ', headings[0].text.strip())

    # remove headers: each tag has to be detached from the tree. Calling
    # .clear() on the find_all() result would only empty that list and leave
    # the headings inside the extracted text.
    for header in content_div.find_all(re.compile('^h[1-6]$')):
        header.decompose()

    content = content_div.get_text(separator='\n').strip()
    return {'title': title, 'author': 'রবীন্দ্রনাথ ঠাকুর', 'url': poem_link, 'content': content}


def scrap_alphabetic(writing_type: str, delay_seconds: float = 30):
    """Scrape every work listed in the alphabetic index of ``writing_type``.

    Walks ``<ROOT_URL>/<writing_type>/alphabetic-index``, follows each index
    letter whose label is a non-Latin character, downloads every work linked
    from that letter's page and writes it as ``<ROOT_PATH>/<writing_type>/<slug>.json``,
    where ``<slug>`` is the last non-empty path segment of the work's URL.

    Args:
        writing_type: Either ``'Verses'`` or ``'Songs'``.
        delay_seconds: Pause after finishing each index letter, in seconds.

    Raises:
        AssertionError: If ``writing_type`` is not supported or a page does not
            have the expected structure.
        requests.HTTPError: If a poem page responds with an error status.
    """
    assert writing_type in ('Verses', 'Songs'), writing_type

    content_path = os.path.join(ROOT_PATH, writing_type)
    os.makedirs(content_path, exist_ok=True)

    url = urljoin(ROOT_URL, f'{writing_type}/alphabetic-index')
    anchors = get_anchors_from_index(url)
    # Keep only anchors whose label is a character with code point > 256.
    # NOTE(review): ord() requires the stripped label to be exactly one
    # character; an empty or multi-character label raises TypeError here.
    bn_anchors = [a for a in anchors if ord(a.text.strip()) > 256]
    bn_links = {
        bn_anchor.text.strip(): urljoin(ROOT_URL, bn_anchor.get('href'))
        for bn_anchor in bn_anchors
    }
    for key, bn_link in bn_links.items():
        poem_anchors = get_anchors_from_index(bn_link)
        poem_links = [urljoin(ROOT_URL, poem_anchor.get('href')) for poem_anchor in poem_anchors]
        for poem_link in poem_links:
            # A timeout keeps one stalled connection from hanging the whole
            # run; raise_for_status stops an error page being parsed as a poem.
            response = requests.get(poem_link, timeout=60)
            response.raise_for_status()
            data_dict = _parse_poem_page(response.text, poem_link)

            # write to file, named after the last non-empty URL path segment
            filename = list(filter(bool, poem_link.split('/')))[-1]
            file_path = os.path.join(content_path, f'{filename}.json')
            with open(file_path, 'w', encoding='utf-8') as content_file:
                json.dump(data_dict, content_file, ensure_ascii=False)

        print('Scrapped:', key)
        # Pause between index letters to avoid hammering the server.
        sleep(delay_seconds)


def get_anchors_from_index(url: str) -> el.ResultSet[el.Tag]:
    """Fetch an index page and return every ``<a>`` tag inside its index area.

    Args:
        url: Absolute URL of the index page.

    Returns:
        All anchor tags found inside the page's single ``div.suchi_patra_area``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        AssertionError: If the page does not contain exactly one
            ``div.suchi_patra_area``.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    # Pin the parser so results do not depend on which optional parsers
    # (lxml, html5lib) happen to be installed.
    index_soup = BeautifulSoup(response.text, 'html.parser')
    index_divs = index_soup.find_all('div', attrs={'class': 'suchi_patra_area'})
    assert len(index_divs) == 1, len(index_divs)
    index_div: el.Tag = index_divs[0]
    return index_div.find_all('a')

In [None]:
# Scrape everything listed in the 'Verses' alphabetic index into ROOT_PATH/Verses.
# Long-running: one HTTP request per work plus a sleep after every index letter.
scrap_alphabetic('Verses')

In [5]:
# Count every file found under the Verses output directory (recursively).
verses_dir = os.path.join(ROOT_PATH, "Verses")
verse_file_count = 0
for _dirpath, _dirnames, filenames in os.walk(verses_dir):
    verse_file_count += len(filenames)
print(f'Total {verse_file_count} entries scrapped')

Total 3233 entries scrapped


In [8]:
# Scrape everything listed in the 'Songs' alphabetic index into ROOT_PATH/Songs.
# Long-running: one HTTP request per work plus a sleep after every index letter.
scrap_alphabetic('Songs')

Scrapped: অ
Scrapped: আ
Scrapped: ই
Scrapped: উ
Scrapped: এ
Scrapped: ও
Scrapped: ক
Scrapped: খ
Scrapped: গ
Scrapped: ঘ
Scrapped: চ
Scrapped: ছ
Scrapped: জ
Scrapped: ঝ
Scrapped: ঠ
Scrapped: ড
Scrapped: ঢ
Scrapped: ত
Scrapped: থ
Scrapped: দ
Scrapped: ধ
Scrapped: ন
Scrapped: প
Scrapped: ফ
Scrapped: ব
Scrapped: ভ
Scrapped: ম
Scrapped: য
Scrapped: র
Scrapped: ল
Scrapped: শ
Scrapped: স
Scrapped: হ


In [9]:
# Count every file found under the Songs output directory (recursively).
songs_dir = os.path.join(ROOT_PATH, "Songs")
song_file_count = 0
for _dirpath, _dirnames, filenames in os.walk(songs_dir):
    song_file_count += len(filenames)
print(f'Total {song_file_count} entries scrapped')

Total 2272 entries scrapped
