In [19]:
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm

In [25]:
def extract_text(element):
    """Return the flattened text of a paragraph element.

    Args:
        element: An object exposing ``name`` and ``text`` attributes
            (the caller passes tags found by BeautifulSoup).

    Returns:
        The element's text with newlines turned into spaces and surrounding
        whitespace removed when ``element.name`` is ``"p"``; ``None`` for any
        other element.
    """
    # Guard clause: anything that is not a paragraph yields no text.
    if element.name != "p":
        return None

    single_line = element.text.replace("\n", " ")
    return single_line.strip()

# Scrape topic listings from tamil.oneindia.com.
#
# Phase 1 drives a real Chrome window to scroll the listing page until enough
# topic cards are present in the DOM, then captures the rendered HTML.
# Phase 2 fetches every linked article over plain HTTP and collects the text
# of its <p> tags into `data` (a list of {"url", "title", "body"} dicts).

base_url = "https://tamil.oneindia.com"
url = "https://tamil.oneindia.com/topic/malaysia?page-no=2"

# total number of topics to scrape
total_topics = 100

# Stop scrolling after this many consecutive scrolls that reveal no new topic,
# so a listing with fewer than `total_topics` entries cannot loop forever.
max_stalled_scrolls = 5

# Seconds to wait for a single article request before giving up on it.
request_timeout = 30

driver = webdriver.Chrome()

session = HTMLSession()

try:
    driver.get(url)

    # handle scroll
    print("Scrolling to load more topics... Please wait")
    loaded_topics = len(driver.find_elements("class name", "cityblock-desc"))
    stalled_scrolls = 0
    while loaded_topics < total_topics and stalled_scrolls < max_stalled_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)  # give the page time to append the next batch of cards

        now_loaded = len(driver.find_elements("class name", "cityblock-desc"))
        stalled_scrolls = stalled_scrolls + 1 if now_loaded == loaded_topics else 0
        loaded_topics = now_loaded

    if loaded_topics < total_topics:
        print(
            f"Only {loaded_topics} of {total_topics} topics could be loaded. "
            "Proceeding with scraping.."
        )
    else:
        print(f"Scrolled to display {total_topics} topics. Proceeding with scraping..")

    page_source = driver.page_source
finally:
    # The browser is only needed for the rendered listing; always release it,
    # even when navigation or scrolling raised.
    driver.quit()

soup = BeautifulSoup(page_source, "html.parser")
entries = soup.find_all("div", attrs={"class": "cityblock-desc"})

data = []
visited_urls = set()

for entry in tqdm(entries, desc="Scraping progress: ", leave=False):
    link = entry.find("a")
    href = link.get("href") if link is not None else None
    if not href:
        tqdm.write("Skipping an entry without a link")
        continue

    # urljoin handles both site-relative and already-absolute hrefs.
    topic_url = urljoin(base_url, href)

    # check if the URL has already been processed
    if topic_url in visited_urls:
        continue

    title_tag = entry.find("div", attrs={"class": "cityblock-title"})
    # Fall back to the URL so log messages stay meaningful without a title.
    topic_title = title_tag.text.strip() if title_tag is not None else topic_url

    try:
        # topic content
        page = session.get(topic_url, timeout=request_timeout)
        page.raise_for_status()
        topic_soup = BeautifulSoup(page.text, "html.parser")
        body = topic_soup.find("div", attrs={"class": "oi-article-lt"})
        if body is None:
            tqdm.write(f"Failed to scrape: {topic_title} - Error: Article body not found")
            continue

        texts = [extract_text(tag) for tag in body.find_all("p")]
        content_text = "\n".join(filter(None, texts))

        data.append({"url": topic_url, "title": topic_title, "body": content_text})
        # Only successful fetches are marked visited, so a URL that failed is
        # retried if it shows up again later in the listing.
        visited_urls.add(topic_url)

    except requests.HTTPError:
        tqdm.write(f"Failed to scrape: {topic_title} - Error: Bad response status")

    except Exception as e:
        # Best effort: one broken article must not abort the whole run.
        tqdm.write(f"Failed to scrape: {topic_title} - Error: {str(e)}")

tqdm.write("Scraping completed.")

Scrolling to load more topics... Please wait
Scrolled to display 100 topics. Proceeding with scraping..


                                                                    

Scraping completed.


In [20]:
# Persist the scraped articles as JSON.
# The file is opened as UTF-8 explicitly (the platform default encoding may
# differ) and ensure_ascii=False keeps the Tamil text human-readable instead
# of writing every character as a \uXXXX escape.
with open("tamil-oneindia.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=2, ensure_ascii=False)