In [1]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [2]:
!pip install requests



In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import json

In [4]:
# Colab-only: mount the user's Google Drive at /content/drive. This prompts for
# authorisation on first run and prints "Mounted at /content/drive" when done.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def extract_article_content(url, div_class_name, skip_class_name, timeout=30):
    """Download a page and return the text of its matching ``div`` elements.

    Every ``div`` whose class matches ``skip_class_name`` is removed from the
    parsed document first, so its text never appears in the result. The text
    of each remaining ``div`` matching ``div_class_name`` is then extracted,
    runs of whitespace are collapsed to single spaces, and the pieces are
    joined with a single space.

    Args:
        url: Address of the page to download.
        div_class_name: CSS class of the ``div`` elements whose text is kept.
        skip_class_name: CSS class of the ``div`` elements to discard before
            extracting text.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without one, ``requests`` waits indefinitely on a
            stalled connection.

    Returns:
        The extracted text as one string (empty if nothing matched). If the
        request fails, the string ``"Error fetching HTML content: <error>"``
        is returned instead of raising, so callers that need to distinguish
        failures must check for that prefix.
    """
    try:
        with requests.get(url, timeout=timeout) as response:
            response.raise_for_status()
            html_content = response.text
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here as well.
        return f"Error fetching HTML content: {e}"

    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop unwanted blocks before collecting text so they cannot leak into
    # the result through an enclosing or identically-classed div.
    for skip_element in soup.find_all('div', class_=skip_class_name):
        skip_element.decompose()

    div_texts = [
        re.sub(r'\s+', ' ', div.get_text(separator=' ').strip())
        for div in soup.find_all('div', class_=div_class_name)
    ]
    return ' '.join(div_texts)

In [9]:
def scrape_articles(collection_id, start_offset, limit, div_class_name, skip_class_name,
                    max_articles=10000, timeout=30):
    """Page through a bdnews24 collection and scrape every listed story.

    The collection API is queried ``limit`` items at a time, starting at
    ``start_offset``. For each item that has both a headline and a slug, the
    article page is fetched with ``extract_article_content`` and a record is
    appended to the result. Paging stops when the API returns no items, when
    ``max_articles`` records have been collected, or when an API request
    fails or returns invalid JSON; whatever was collected so far is returned.

    Args:
        collection_id: Identifier interpolated into the collection API URL.
        start_offset: Offset of the first item to request.
        limit: Number of items requested per API call; also the amount the
            offset advances after each page.
        div_class_name: Forwarded to ``extract_article_content``.
        skip_class_name: Forwarded to ``extract_article_content``.
        max_articles: Maximum number of records to collect.
        timeout: Seconds passed to ``requests.get`` for each API request.
            Without one, ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        A list of dicts with the keys ``'URL'``, ``'Headline'``,
        ``'Summary'`` and ``'Article'``.
    """
    data_list = []
    article_number = 1

    while article_number <= max_articles:
        url = f"https://bdnews24.com/api/v1/collections/{collection_id}?item-type=story&offset={start_offset}&limit={limit}"

        try:
            with requests.get(url, timeout=timeout) as response:
                response.raise_for_status()
                data = response.json()
            items = data.get('items') or []

            if not items:
                print("No more items. Breaking the loop.")
                break

            for item in items:
                # `or {}` also covers an explicit `"story": null`, which a
                # plain `.get('story', {})` would pass through as None.
                story = item.get('story') or {}
                headline = story.get('headline', '')
                subheadline = story.get('subheadline', '')
                slug = story.get('slug', '')

                if not (headline and slug):
                    continue

                article_url = f"https://bdnews24.com/{slug}"
                # NOTE(review): the content returned here is stored and
                # reported as a success without inspection - confirm how the
                # extractor signals a failed download.
                div_content = extract_article_content(article_url, div_class_name, skip_class_name)

                data_list.append({
                    'URL': article_url,
                    'Headline': headline,
                    'Summary': subheadline,
                    'Article': div_content
                })

                print(f"Article {article_number}: Successfully scraped content for: {article_url}")
                article_number += 1

                if article_number > max_articles:
                    print(f"Reached the limit of {max_articles} articles. Stopping.")
                    break

            start_offset += limit

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"Error fetching or decoding data: {e}")
            break

    return data_list

In [10]:
# Scrape collection 104588 from offset 0 in pages of 10, keeping the text of
# 'story-element-text' divs and discarding the 'story-element-text-also-read'
# ones, then write the records to a CSV on the mounted Drive.
dataset = scrape_articles(
    collection_id=104588,
    start_offset=0,
    limit=10,
    div_class_name='story-element-text',
    skip_class_name='story-element-text-also-read',
)
df = pd.DataFrame(dataset)
df.to_csv(
    '/content/drive/MyDrive/bdnews24_english_financial_news_dataset.csv',
    index=False,
    encoding='utf-8',
)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Article 530: Successfully scraped content for: https://bdnews24.com/economy/b7i65e8y28
Article 531: Successfully scraped content for: https://bdnews24.com/economy/c32po6zaez
Article 532: Successfully scraped content for: https://bdnews24.com/economy/l4hu4crgcz
Article 533: Successfully scraped content for: https://bdnews24.com/economy/7niawo5ip5
Article 534: Successfully scraped content for: https://bdnews24.com/economy/9larckx2sr
Article 535: Successfully scraped content for: https://bdnews24.com/economy/vo459d1so9
Article 536: Successfully scraped content for: https://bdnews24.com/economy/5hjo05sr38
Article 537: Successfully scraped content for: https://bdnews24.com/economy/r8qxaj9zyk
Article 538: Successfully scraped content for: https://bdnews24.com/economy/x4iao9lhsm
Article 539: Successfully scraped content for: https://bdnews24.com/economy/s0axyn4glv
Article 540: Successfully scraped content for: https://bdnews24.c