In [1]:
!pip install requests



In [2]:
!pip install beautifulsoup4



In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import csv

In [4]:
# Colab-only setup: mount the user's Google Drive at /content/drive so that a
# later cell can write the output CSV under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def scrape_links(api_url, base_url, timeout=30):
    """Collect absolute article URLs from one listing page.

    Fetches ``api_url``, finds every ``<div class="col-8">`` in the returned
    HTML and resolves the ``href`` of the first link inside each one against
    ``base_url``.

    Args:
        api_url: URL of the listing / pagination endpoint to fetch.
        base_url: Base passed to ``urljoin`` to make relative hrefs absolute.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of absolute URLs in document order. The list is empty when the
        request fails, when the status code is not 200, or when no matching
        container holds a link.
    """
    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network errors like a bad status: report and carry on, so one
        # unreachable page does not abort the whole crawl.
        print(f"Failed to fetch data from {api_url}: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data. Status Code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    links = []
    for container in soup.find_all('div', class_='col-8'):
        # href=True restricts the match to anchors that actually carry an
        # href; containers without one are skipped instead of raising.
        anchor = container.find('a', href=True)
        if anchor is None:
            continue
        links.append(urljoin(base_url, anchor['href']))
    return links

In [6]:
def newsbangla_headline(soup):
    """Return the stripped text of the first ``<h1 class="mb-0">``, or None.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        The headline text with surrounding whitespace stripped, or ``None``
        when the page has no such heading.
    """
    heading = soup.find('h1', class_='mb-0')
    if not heading:
        return None
    return heading.get_text(strip=True)

In [7]:
def newsbangla_summary(soup):
    """Return the stripped text of the first ``<div class="details-highlight">``.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        The summary text with surrounding whitespace stripped, or ``None``
        when the page has no such element.
    """
    highlight = soup.find('div', class_='details-highlight')
    return highlight.get_text(strip=True) if highlight else None

In [8]:
def newsbangla_article(soup):
    """Return the article body as one space-joined string, or None.

    Looks up the article container ``<div>`` by its class string, then joins
    the stripped text of every ``<p>`` inside it with single spaces.

    Args:
        soup: Parsed article page exposing a BeautifulSoup-style ``find``.

    Returns:
        The joined paragraph text, or ``None`` when the container is missing
        or holds no ``<p>`` elements.
    """
    body = soup.find('div', class_='article-content article-body-padding newsonlydetails')
    if not body:
        return None

    paragraphs = body.find_all('p')
    if not paragraphs:
        return None

    texts = [p.get_text(strip=True) for p in paragraphs]
    return ' '.join(texts)

In [9]:
def newsbangla(urls, timeout=30):
    """Scrape headline, summary and body text for each article URL.

    Every URL is fetched and parsed, and the three fields are extracted with
    ``newsbangla_headline``, ``newsbangla_summary`` and ``newsbangla_article``.
    An article is kept only when all three fields are present. A one-line
    status message is printed per URL.

    Args:
        urls: Iterable of article URLs.
        timeout: Seconds to wait for each response before giving up.

    Returns:
        A list of dicts with the keys ``'URL'``, ``'Headline'``, ``'Summary'``
        and ``'Article'``, one per successfully scraped article, in input
        order.
    """
    data = []

    for i, article_url in enumerate(urls, start=1):
        try:
            response = requests.get(article_url, timeout=timeout)
            # Do not parse HTTP error pages as if they were articles.
            response.raise_for_status()
        except requests.RequestException as exc:
            # A single unreachable article must not discard everything that
            # was already scraped; report it and move on to the next URL.
            print(f'Failed to retrieve content from {article_url} ({exc})')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        headline = newsbangla_headline(soup)
        summary = newsbangla_summary(soup)
        article = newsbangla_article(soup)

        if any(value is None for value in (headline, summary, article)):
            print(f'Failed to retrieve content from {article_url}')
            continue

        data.append({'URL': article_url, 'Headline': headline, 'Summary': summary, 'Article': article})
        print(f'Successfully scraped content from Article {i} - {article_url}')

    return data

In [10]:
def write_to_csv(data, filename, fields):
    """Write a sequence of row dicts to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of dicts whose keys are drawn from ``fields``.
        filename: Path of the CSV file to create or overwrite.
        fields: Column names, in the order they should appear in the file.
    """
    # newline='' hands line-ending control to the csv module.
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fields)
        dict_writer.writeheader()
        dict_writer.writerows(data)

    print(f'Data from all pages has been saved to {filename}')

In [11]:
base_url = "https://www.newsbangla24.com/"

# Path of the paginated listing endpoint, relative to base_url. Only the
# {start} offset changes between requests. What the fixed 14 and 239079
# segments mean is not visible in this notebook (presumably a category id and
# an anchor article id; TODO confirm against the site).
LISTING_PATH = "online/type_pageinationAjax/14/{start}/239079"

# Offsets requested from the endpoint: 0, 10, 20, ... up to but excluding 1000.
FIRST_OFFSET = 0
END_OFFSET = 1000
OFFSET_STEP = 10

all_article_links = []
for start in range(FIRST_OFFSET, END_OFFSET, OFFSET_STEP):
    # base_url ends with '/', so plain concatenation yields the listing URL.
    url = base_url + LISTING_PATH.format(start=start)
    print(f"Scraping links from: {url}")
    all_article_links.extend(scrape_links(url, base_url))

Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/0/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/10/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/20/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/30/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/40/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/50/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/60/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/70/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/80/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/90/239079
Scraping links from: https://www.newsbangla24.com/online/type_pageinationAjax/14/100/239079

In [12]:
# Output location (inside the Drive mount at /content/drive) and column order.
fields = ['URL', 'Headline', 'Summary', 'Article']
filename = '/content/drive/MyDrive/newsbangla24_economic_news_dataset.csv'

# Fetch and parse every collected article link, then persist the rows that
# were scraped successfully.
data = newsbangla(all_article_links)
write_to_csv(data, filename=filename, fields=fields)

Successfully scraped content from Article 1 - https://www.newsbangla24.com/economy/242202/While-vegetables-remained-stable-prices-of-all-types-of-meat-rose
Successfully scraped content from Article 2 - https://www.newsbangla24.com/economy/242173/Reserves-fell-by-and534-million-over-the-week
Successfully scraped content from Article 3 - https://www.newsbangla24.com/economy/242133/Customers-of-Janata-Bank-Tamai-branch-also-have-money
Successfully scraped content from Article 4 - https://www.newsbangla24.com/economy/242131/The-salary-and-bonus-should-be-paid-to-the-workers-before-the-Eid-holiday
Successfully scraped content from Article 5 - https://www.newsbangla24.com/economy/242127/TCB-will-import-50-thousand-tons-of-onions-from-India
Successfully scraped content from Article 6 - https://www.newsbangla24.com/economy/242118/Import-of-1000-tons-of-potatoes-from-India-in-four-shipments
Successfully scraped content from Article 7 - https://www.newsbangla24.com/economy/242090/Cash-dollars-in