In [None]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
!pip install requests



In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import re
import json

In [None]:
# Colab only: mount Google Drive at /content/drive so the notebook can write
# its CSV output to a path under that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def bbc_bangla_links(url, timeout=30):
  """Collect the article links shown on one BBC Bangla topic listing page.

  Args:
    url: Address of the listing page to fetch.
    timeout: Seconds to wait for the server before giving up, so a stalled
      connection cannot hang the crawl indefinitely.

  Returns:
    A list of absolute article URLs in page order, or None when the page
    contains no matching anchors.

  Raises:
    requests.RequestException: If the page cannot be fetched (including a
      timeout).
  """
  response = requests.get(url, timeout=timeout)
  soup = BeautifulSoup(response.content, 'html.parser')
  anchors = soup.find_all('a', class_='focusIndicatorDisplayBlock bbc-uk8dsi e1d658bg0')

  # Anchors without an href are skipped instead of raising KeyError, and
  # relative hrefs are resolved against the listing URL so every returned
  # link can be requested directly. Absolute hrefs pass through unchanged.
  article_links = [urljoin(url, a.get('href')) for a in anchors if a.get('href')]

  if article_links:
    return article_links

  return None

In [None]:
def bbc_bangla_headline(soup):
  """Return the headline text of a parsed BBC Bangla article page.

  The page is probed for a ``<strong>`` headline and an ``<h1>`` headline;
  the ``<strong>`` one wins when both exist. The text is returned with
  surrounding whitespace stripped, or None when neither element is found.
  """
  candidates = (
      soup.find('strong', class_='ewk8wmc0 bbc-fl6ui8 eglt09e1'),
      soup.find('h1', class_='bbc-qa2lun e1p3vdyi0'),
  )

  # First candidate that was actually found decides the headline.
  for node in candidates:
    if node:
      return node.get_text(strip=True)

  return None

In [None]:
def bbc_bangla_summary(soup):
  """Extract the bold lead paragraphs of a parsed BBC Bangla article.

  Content blocks are walked in document order. Blocks without a paragraph are
  ignored; paragraphs containing a ``<b>`` element are collected until the
  first paragraph without one is reached.

  Args:
    soup: Parsed article page.

  Returns:
    The collected paragraphs joined into one string with every run of
    whitespace collapsed to a single space, or None when the article has no
    leading bold paragraph.
  """
  div_elements = soup.find_all('div', class_='bbc-19j92fr ebmt73l0')
  summary = []

  for div_element in div_elements:
    p_element = div_element.find('p', class_='bbc-12k5sdr e17g058b0')

    if not p_element:
      continue
    if not p_element.find('b'):
      # First non-bold paragraph marks the end of the summary.
      break
    summary.append(p_element.get_text(separator=' '))

  if summary:
    # Collapse whitespace the same way bbc_bangla_article does. Deleting
    # newlines outright would glue together the words on either side.
    return ' '.join(' '.join(summary).split())

  return None

In [None]:
def bbc_bangla_article(soup):
  """Assemble the body text of a parsed BBC Bangla article.

  For every content block, the text of its ``<h2>`` sub-heading (if any) is
  taken first, followed by the text of its paragraph, unless that paragraph
  contains a ``<b>`` element. The pieces are joined with spaces and all
  whitespace runs are collapsed. Returns None when nothing was collected.
  """
  pieces = []

  for block in soup.find_all('div', class_='bbc-19j92fr ebmt73l0'):
    heading = block.find('h2', class_='bbc-8zh0ns eglt09e0')
    paragraph = block.find('p', class_='bbc-12k5sdr e17g058b0')

    if heading:
      pieces.append(heading.get_text(separator=' ').rstrip())
    if paragraph and not paragraph.find('b'):
      pieces.append(paragraph.get_text(separator=' ').rstrip().replace('\n', ' '))

  if not pieces:
    return None

  # split()/join() squeezes every run of whitespace down to one space.
  return ' '.join(' '.join(pieces).split())

In [None]:
def bbc_bangla(urls, timeout=30):
  """Scrape headline, summary and body text from BBC Bangla article pages.

  Args:
    urls: Iterable of article URLs. None is accepted and treated as empty,
      because bbc_bangla_links returns None for a page without links.
    timeout: Seconds to wait for each article request before giving up.

  Returns:
    A list of dicts with the keys 'URL', 'Headline', 'Summary' and 'Article',
    one per article for which all parts were found. Articles that could not
    be fetched or parsed completely are reported on stdout and left out.
  """
  data = []

  for article_url in urls or []:
    try:
      response = requests.get(article_url, timeout=timeout)
    except requests.RequestException as error:
      # One unreachable article must not abort the whole crawl and discard
      # the rows collected so far.
      print(f'Failed to retrieve content from {article_url}: {error}')
      continue

    soup = BeautifulSoup(response.content, 'html.parser')
    headline = bbc_bangla_headline(soup)
    summary = bbc_bangla_summary(soup)
    article = bbc_bangla_article(soup)

    if any(part is None for part in (article_url, headline, summary, article)):
      print(f'Failed to retrieve content from {article_url}')
    else:
      data.append({'URL': article_url, 'Headline': headline, 'Summary': summary, 'Article': article})
      print(f'Successfully scraped content from {article_url}')

  return data

In [None]:
def write_to_csv(data, filename, fields):
  """Save row dictionaries to a UTF-8 CSV file and report the destination.

  Args:
    data: Iterable of dicts whose keys are drawn from ``fields``.
    filename: Path of the CSV file to create or overwrite.
    fields: Column names, written as the header row in this order.
  """
  with open(filename, 'w', encoding='utf-8', newline='') as handle:
    sheet = csv.DictWriter(handle, fieldnames=fields)
    sheet.writeheader()
    for row in data:
      sheet.writerow(row)

  print(f'Data from all pages has been saved to {filename}')

In [None]:
# Crawl configuration for the BBC Bangla topic listing.
bbc_bangla_url = 'https://www.bbc.com/bengali/topics/cjgn7233zk5t'  # base listing URL; '?page=N' is appended per page
num_of_pages = 29  # listing pages 1..num_of_pages are requested
data = []  # accumulates one dict per successfully scraped article
filename = '/content/drive/MyDrive/bbc_bangla_economic_news_dataset.csv'  # output CSV under the Drive mount point
fields = ['URL', 'Headline', 'Summary', 'Article']  # CSV columns; match the keys of the dicts built by bbc_bangla

In [None]:
# Start from an empty list so that re-running this cell does not append the
# same articles a second time.
data = []

for page_num in range(1, num_of_pages + 1):
  url = f'{bbc_bangla_url}?page={page_num}'
  links = bbc_bangla_links(url)

  # bbc_bangla_links returns None when a listing page yields no article
  # links; skip such pages instead of handing None to bbc_bangla.
  if not links:
    print(f'No article links found on {url}')
    continue

  data.extend(bbc_bangla(links))

write_to_csv(data, filename, fields)

Successfully scraped content from https://www.bbc.com/bengali/articles/cv2x4mxlnmxo
Successfully scraped content from https://www.bbc.com/bengali/articles/cq51xlgq9yyo
Successfully scraped content from https://www.bbc.com/bengali/articles/cv2m29gj114o
Successfully scraped content from https://www.bbc.com/bengali/articles/cner9kjepd4o
Successfully scraped content from https://www.bbc.com/bengali/articles/cx0j3j44p0go
Failed to retrieve content from https://www.bbc.com/bengali/articles/cgxj594y8g0o
Successfully scraped content from https://www.bbc.com/bengali/articles/c51z8pmlnnro
Successfully scraped content from https://www.bbc.com/bengali/articles/cv2lvek1rero
Successfully scraped content from https://www.bbc.com/bengali/articles/c28ynvrxmp0o
Successfully scraped content from https://www.bbc.com/bengali/articles/cpdl2dlp3pyo
Successfully scraped content from https://www.bbc.com/bengali/articles/c8827n9ee9jo
Failed to retrieve content from https://www.bbc.com/bengali/articles/c8827xzpe

In [None]:
def scrape_news(api_url, timeout=30):
    """Print headline, summary and body text of business stories from a BBC feed.

    The JSON feed at ``api_url`` is fetched and every listed item whose link
    contains 'business' is downloaded and parsed. Results are printed to
    stdout; nothing is returned.

    Args:
        api_url: URL of the JSON feed; items are read from
            ``payload[0].body.results``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: If the feed itself cannot be fetched.
    """
    response = requests.get(api_url, timeout=timeout)
    data = response.json()

    try:
        news_items = data['payload'][0]['body']['results']
    except (KeyError, IndexError, TypeError):
        # Feed answered, but not in the expected shape (e.g. empty payload).
        print(f"No news items found in response from {api_url}")
        return

    for news_item in news_items:
        # Items without a 'url' entry cannot be followed; skip them rather
        # than aborting the whole run with a KeyError.
        relative_url = news_item.get('url')
        if not relative_url:
            continue

        link = f"https://www.bbc.co.uk{relative_url}"

        if 'business' not in link.lower():
            continue

        try:
            html_response = requests.get(link, timeout=timeout)
        except requests.RequestException as error:
            # One unreachable story should not stop the remaining ones.
            print(f"Failed to retrieve content from {link}: {error}")
            continue
        html_content = html_response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        bold_text = soup.find('b', class_='ssrcss-hmf8ql-BoldText')
        if bold_text:
            bold_text_summary = bold_text.get_text(strip=True)

            h1_heading = soup.find('h1', class_='ssrcss-15xko80-StyledHeading e10rt3ze0')
            headline = h1_heading.text if h1_heading else None

            print(f"\nLink: {link}")
            if headline:
                print(f"Headline: {headline}")
            print(f"Bold Text Summary: {bold_text_summary}")

        div_paragraphs = soup.find_all('div', class_='ssrcss-11r1m41-RichTextComponentWrapper ep2nwvo0')
        # The first paragraph of the story is not printed as article text.
        skip_first_paragraph = True

        for div in div_paragraphs:
            if div.get('data-component') != 'text-block':
                continue

            # The sub-heading belongs to the block, not to a single paragraph,
            # so look it up once per block instead of once per paragraph.
            h2_heading = div.find('h2', class_='ssrcss-y2fd7s-StyledHeading e10rt3ze0')
            h2_content = h2_heading.text if h2_heading else None

            paragraphs = div.find_all('p', class_='ssrcss-1q0x1qg-Paragraph e1jhz7w10')
            for paragraph in paragraphs:
                if skip_first_paragraph:
                    skip_first_paragraph = False
                    continue

                # Drop italic fragments before extracting the text.
                for tag in paragraph.find_all('i'):
                    tag.decompose()

                news_article = paragraph.get_text()
                print(f"News Article: {news_article}")
                if h2_content:
                    print(f"h2 Content: {h2_content}")

In [None]:
# Feed endpoints to scrape. Each URL is passed to scrape_news, which expects a
# JSON response. NOTE(review): the path segments 'limit/20' and 'pageNumber/2'
# look like paging controls of the feed — confirm before adding more pages.
api_urls = [
    "https://push.api.bbci.co.uk/batch?t=/data/bbc-morph-lx-commentary-data-paged/about/19a1d11b-1755-4f97-8747-0c9534336a47/isUk/false/limit/20/nitroKey/lx-nitro/pageNumber/2/version/1.5.6?timeout=5"
]

# Scrape every configured feed in turn; results are printed by scrape_news.
for api_url in api_urls:
    scrape_news(api_url)