In [None]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
!pip install requests



In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json

In [2]:
# Colab-only step: mount Google Drive at /content/drive. A later cell writes
# the scraped CSV under /content/drive/MyDrive, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def extract_article_content(url, div_class_name, skip_class_name, timeout=30):
    """Download a page and return the text of its matching <div> elements.

    Args:
        url: Address of the page to fetch.
        div_class_name: CSS class of the <div> elements whose text is kept.
        skip_class_name: CSS class of <div> elements that are removed from
            the parsed page before any text is extracted.
        timeout: Seconds to wait for the server before giving up. A timeout
            is reported the same way as any other request failure.

    Returns:
        A single string: the text of every matching <div>, with runs of
        whitespace collapsed to one space, joined by single spaces. The
        string is empty when no <div> matches. If the HTTP request fails,
        the function does not raise; it returns a string that starts with
        "Error fetching HTML content: " followed by the exception text.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # stalled connection, which would hang a long scraping run.
        with requests.get(url, timeout=timeout) as response:
            response.raise_for_status()
            html_content = response.text
    except requests.exceptions.RequestException as e:
        return f"Error fetching HTML content: {e}"

    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop unwanted blocks first so their text cannot leak into the result,
    # even when they are nested inside a <div> that is kept.
    for skip_element in soup.find_all('div', class_=skip_class_name):
        skip_element.decompose()

    div_elements = soup.find_all('div', class_=div_class_name)
    div_texts = [
        re.sub(r'\s+', ' ', div.get_text(separator=' ').strip())
        for div in div_elements
    ]
    return ' '.join(div_texts)

In [16]:
def scrape_articles(offset, limit, div_class_name, skip_class_name,
                    max_articles=2000, timeout=30):
    """Page through the bdnews24 collection API and scrape each listed story.

    Args:
        offset: Index of the first collection item to request.
        limit: Number of items requested per API call; the offset advances
            by this amount after every page.
        div_class_name: Passed to extract_article_content as the CSS class
            of the <div> elements holding the article text.
        skip_class_name: Passed to extract_article_content as the CSS class
            of <div> elements to discard.
        max_articles: Maximum number of articles to collect before stopping.
        timeout: Seconds to wait for each collection API response.

    Returns:
        A list of dicts with the keys 'URL', 'Headline', 'Summary' and
        'Article', one per story that has both a headline and a slug.
        'Article' holds whatever extract_article_content returned for that
        URL. If an API request fails or its body is not valid JSON, the
        error is printed and the articles gathered so far are returned.
    """
    data_list = []
    article_number = 1

    while article_number <= max_articles:
        url = f"https://bangla.bdnews24.com/api/v1/collections/105128?item-type=story&offset={offset}&limit={limit}"

        try:
            # An explicit timeout keeps one stalled request from hanging
            # the whole run.
            with requests.get(url, timeout=timeout) as response:
                response.raise_for_status()
                data = response.json()
        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"Error fetching or decoding data: {e}")
            break

        items = data.get('items', [])

        if not items:
            print("No more items. Breaking the loop.")
            break

        for item in items:
            # `or {}` also covers an item whose 'story' value is JSON null.
            story = item.get('story') or {}
            headline = story.get('headline', '')
            subheadline = story.get('subheadline', '')
            slug = story.get('slug', '')

            # Items without a headline or slug cannot be turned into a
            # usable row, so they are skipped without counting.
            if not (headline and slug):
                continue

            article_url = f"https://bangla.bdnews24.com/{slug}"
            div_content = extract_article_content(article_url, div_class_name, skip_class_name)

            data_list.append({
                'URL': article_url,
                'Headline': headline,
                'Summary': subheadline,
                'Article': div_content
            })

            print(f"Article {article_number}: Successfully scraped content for: {article_url}")
            article_number += 1

            if article_number > max_articles:
                print(f"Reached the limit of {max_articles} articles. Stopping.")
                break

        offset += limit

    return data_list

In [19]:
# Scrape from the start of the collection, ten items per API page, keeping
# the story text blocks and discarding the "also read" blocks.
dataset = scrape_articles(
    offset=0,
    limit=10,
    div_class_name='story-element-text',
    skip_class_name='story-element-text-also-read',
)

# One row per scraped article, written to the mounted Drive as UTF-8 CSV
# without the index column.
df = pd.DataFrame(dataset)
df.to_csv(
    '/content/drive/MyDrive/bdnews24_financial_news_dataset.csv',
    index=False,
    encoding='utf-8',
)

Article 1: Successfully scraped content for: https://bangla.bdnews24.com/business/uhk88mxm1q
Article 2: Successfully scraped content for: https://bangla.bdnews24.com/business/3yyibpn00s
Article 3: Successfully scraped content for: https://bangla.bdnews24.com/business/eaxq7med9p
Article 4: Successfully scraped content for: https://bangla.bdnews24.com/business/u02ataizh8
Article 5: Successfully scraped content for: https://bangla.bdnews24.com/business/p439wot1vn
Article 6: Successfully scraped content for: https://bangla.bdnews24.com/business/llo8w95nf1
Article 7: Successfully scraped content for: https://bangla.bdnews24.com/business/pums2rvhqt
Article 8: Successfully scraped content for: https://bangla.bdnews24.com/business/69pt7bfix2
Article 9: Successfully scraped content for: https://bangla.bdnews24.com/business/kpfy2m6dvo
Article 10: Successfully scraped content for: https://bangla.bdnews24.com/business/glhjuluzvl
Article 11: Successfully scraped content for: https://bangla.bdnews24