In [1]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [2]:
!pip install requests



In [3]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin

In [4]:
# Colab-only step: mount Google Drive at /content/drive so the CSV paths under
# /content/drive/MyDrive used by the scraping cells can be written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Bangla

In [5]:
def tbsnews_bangla_links(url):
  """Collect absolute article URLs from one TBS News (Bangla) listing page.

  Args:
    url: Address of the listing page to download.

  Returns:
    A list of absolute URLs in page order, one for every ``h2``/``h3``
    element with class ``card-title`` that wraps a link. Headings that have
    no ``<a>`` child or no ``href`` are skipped instead of raising.
  """
  base_url = 'https://www.tbsnews.net'
  # Without a timeout a stalled connection would block the crawl forever.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.content, 'html.parser')

  links = []
  for heading in soup.find_all(['h3', 'h2'], class_='card-title'):
    anchor = heading.a
    href = anchor.get('href') if anchor is not None else None
    if href:
      # urljoin leaves absolute hrefs alone and resolves relative ones.
      links.append(urljoin(base_url, href))
  return links

In [6]:
def tbsnews_bangla_headline(soup):
  """Return the stripped headline text of an article page, or None.

  The headline is looked up as the first ``h1`` with class
  ``hide-for-small-only`` and ``itemprop="headline"``.
  """
  headline_tag = soup.find('h1', class_='hide-for-small-only', itemprop='headline')
  return headline_tag.get_text(strip=True) if headline_tag else None

In [7]:
def tbsnews_bangla_summary(soup):
  """Extract the intro/summary text of a Bangla article page.

  Args:
    soup: Parsed article page; only its ``find`` method is used.

  Returns:
    The text of the first ``div`` with class ``intro`` with every run of
    whitespace collapsed to a single space, or None when the element is
    missing or contains no text.
  """
  summary_div = soup.find('div', class_='intro')
  if summary_div is None:
    return None

  # split() + join turns line breaks into spaces instead of deleting them, so
  # words on neighbouring lines are not glued together, and trims both ends.
  summary_text = ' '.join(summary_div.get_text(separator=' ').split())
  # A blank summary is reported as missing so callers can skip the article.
  return summary_text or None

In [8]:
def tbsnews_bangla_article(soup):
  """Return the article body as one whitespace-normalised string, or None.

  Text is gathered first from every ``p`` with class ``rtejustify`` and then
  from every ``div`` whose class is ``section-content margin-bottom-2``.
  None is returned when neither lookup matches anything.
  """
  # NOTE(review): if the justified paragraphs sit inside a matched section div
  # their text would be collected twice -- confirm against the page markup.
  selectors = (
      ('p', 'rtejustify'),
      ('div', 'section-content margin-bottom-2'),
  )
  chunks = [
      node.get_text(separator='\n').strip()
      for tag_name, css_class in selectors
      for node in soup.find_all(tag_name, class_=css_class)
  ]
  if not chunks:
    return None

  # Joining and re-splitting collapses newlines and repeated spaces.
  return ' '.join(' '.join(chunks).split())

In [9]:
def tbsnews_bangla(urls):
  """Download Bangla article pages and extract headline, summary and body.

  Args:
    urls: Iterable of article URLs.

  Returns:
    A list of dicts with the keys 'URL', 'Headline', 'Summary' and 'Article',
    one per article for which all three fields were found. Articles that
    cannot be downloaded or that miss a field are reported with print and
    skipped; they never abort the run.
  """
  data = []

  for article_url in urls:
    try:
      # The timeout keeps a single stalled request from hanging the crawl.
      response = requests.get(article_url, timeout=30)
    except requests.RequestException as error:
      # One unreachable article must not throw away everything scraped so far.
      print(f'Failed to retrieve content from {article_url}')
      print(error)
      continue

    soup = BeautifulSoup(response.content, 'html.parser')
    headline = tbsnews_bangla_headline(soup)
    summary = tbsnews_bangla_summary(soup)
    article = tbsnews_bangla_article(soup)

    if any(value is None for value in (article_url, headline, summary, article)):
      print(f'Failed to retrieve content from {article_url}')
      # Show which of the fields was missing.
      print([headline, summary, article])
    else:
      data.append({'URL': article_url, 'Headline': headline, 'Summary': summary, 'Article': article})
      print(f'Successfully scraped content from {article_url}')

  return data

In [10]:
def scrape_articles(base_url, max_pages=None):
  """Walk the paginated Bangla listing and scrape every linked article.

  Args:
    base_url: Listing URL; ``?page=<n>`` is appended, starting at page 0.
    max_pages: Maximum number of listing pages to read. None means no limit:
      paging continues until a page contributes no new article link.

  Returns:
    Whatever ``tbsnews_bangla`` returns for the collected, de-duplicated
    article links (kept in first-seen order).
  """
  all_article_links = []
  seen_links = set()
  page = 0

  # `page` is zero-based, so `<` (not `<=`) reads exactly `max_pages` pages.
  while max_pages is None or page < max_pages:
    url = f'{base_url}?page={page}'

    new_links = []
    for link in tbsnews_bangla_links(url):
      if link not in seen_links:
        seen_links.add(link)
        new_links.append(link)

    # Stop on an empty page, and also when a page only repeats known links;
    # otherwise a site that keeps serving its last page would loop forever.
    if not new_links:
      break

    all_article_links.extend(new_links)
    page += 1

  return tbsnews_bangla(all_article_links)

In [11]:
def write_to_csv(data, filename, fields):
  """Write row dicts to a UTF-8 CSV file with a header line.

  Args:
    data: Iterable of dicts keyed by the names in ``fields``.
    filename: Path of the CSV file; it is created or overwritten.
    fields: Column names, in output order.
  """
  # newline='' lets the csv module control line endings itself.
  with open(filename, 'w', encoding='utf-8', newline='') as out_file:
    csv_out = csv.DictWriter(out_file, fieldnames=fields)
    csv_out.writeheader()
    for row in data:
      csv_out.writerow(row)

  print(f'Data from all pages has been saved to {filename}')

In [12]:
# Listing page of the Bangla economy section (the path segment is percent-encoded).
tbnews_base_url = 'https://www.tbsnews.net/bangla/%E0%A6%85%E0%A6%B0%E0%A7%8D%E0%A6%A5%E0%A6%A8%E0%A7%80%E0%A6%A4%E0%A6%BF'
# Output CSV on the mounted Google Drive.
filename = '/content/drive/MyDrive/tbsnews_bangla_economic_news_dataset.csv'
# CSV columns; they match the keys of the row dicts built by tbsnews_bangla.
fields = ['URL', 'Headline', 'Summary', 'Article']

# max_pages=None sets no page limit: scrape_articles pages until it stops finding links.
data = scrape_articles(tbnews_base_url, max_pages=None)
write_to_csv(data, filename, fields)

Failed to retrieve content from https://www.tbsnews.net/bangla/%E0%A6%85%E0%A6%B0%E0%A7%8D%E0%A6%A5%E0%A6%A8%E0%A7%80%E0%A6%A4%E0%A6%BF/news-details-206611
['শীঘ্রই এফটিএ স্বাক্ষরের জন্য সম্ভাব্যতা সমীক্ষা সম্পন্ন করবে বাংলাদেশ-চীন', None, "চীনের সঙ্গে মুক্ত বাণিজ্য চুক্তি (এফটিএ) স্বাক্ষরের লক্ষ্যে খসড়া সম্ভাব্যতা সমীক্ষা প্রতিবেদন বিনিময় করেছে বাংলাদেশ। অতিদ্রুত সম্ভাব্যতা সমীক্ষা চূড়ান্ত করতে জয়েন্ট ওয়ার্কিং গ্রুপ ঢাকায় মিলিত হবে বলে জানিয়েছেন সংশ্লিষ্ট কর্মকর্তারা। বৃহস্পতিবার (২৮ মার্চ) বাণিজ্য মন্ত্রণালয়ের সম্মেলন কক্ষে এক অনুষ্ঠানে এ সমীক্ষা প্রতিবেদন বিনিময় করে দুই দেশ। এ সময় বাণিজ্য মন্ত্রণালয়ের সিনিয়র সচিব তপন কান্তি ঘোষ ও বাংলাদেশে নিযুক্ত চীনের রাষ্ট্রদূত ইয়াও ওয়েনসহ উভয়পক্ষের সংশ্লিষ্ট কর্মকর্তারা উপস্থিত ছিলেন। সভায় বাণিজ্য সচিব বলেন, চীন বিশ্বের দ্বিতীয় বৃহৎ অর্থনীতির দেশ এবং বাংলাদেশের গুরুত্বপূর্ণ বাণিজ্যিক অংশীদার। বাংলাদেশের বাণিজ্য সম্প্রসারণের জন্য চীন একটি বাণিজ্য সম্ভাবনাময় এলাকা। তিনি বলেন, পণ্য ছাড়াও সেবা ও বিনিয়োগ খাতেও চীনের সঙ্গে বাংলাদেশের সম্পর্ক উন্নয়নে

# English

In [None]:
def tbsnews_english_links(url):
  """Collect absolute article URLs from one TBS News (English) listing page.

  Args:
    url: Address of the listing page to download.

  Returns:
    A list of absolute URLs in page order, one for every ``h2``/``h3``
    element with class ``card-title`` that wraps a link. Headings that have
    no ``<a>`` child or no ``href`` are skipped instead of raising.
  """
  base_url = 'https://www.tbsnews.net'
  # Without a timeout a stalled connection would block the crawl forever.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.content, 'html.parser')

  links = []
  for heading in soup.find_all(['h3', 'h2'], class_='card-title'):
    anchor = heading.a
    href = anchor.get('href') if anchor is not None else None
    if href:
      # urljoin leaves absolute hrefs alone and resolves relative ones.
      links.append(urljoin(base_url, href))
  return links

In [None]:
def tbsnews_english_headline(soup):
  """Return the stripped headline text of an article page, or None.

  The headline is the first ``h1`` carrying class ``hide-for-small-only``
  and ``itemprop="headline"``.
  """
  headline_tag = soup.find('h1', class_='hide-for-small-only', itemprop='headline')

  # Guard clause: no matching tag means no headline.
  if not headline_tag:
    return None

  return headline_tag.get_text(strip=True)

In [None]:
def tbsnews_english_summary(soup):
  """Extract the intro/summary text of an English article page.

  Args:
    soup: Parsed article page; only its ``find`` method is used.

  Returns:
    The text of the first ``h2`` with class ``intro`` with every run of
    whitespace collapsed to a single space, or None when the element is
    missing or contains no text.
  """
  summary_tag = soup.find('h2', class_='intro')
  if summary_tag is None:
    return None

  # split() + join turns line breaks into spaces instead of deleting them, so
  # words on neighbouring lines are not glued together, and trims both ends.
  summary_text = ' '.join(summary_tag.get_text(separator=' ').split())
  # A blank summary is reported as missing so callers can skip the article.
  return summary_text or None

In [None]:
def tbsnews_english_article(soup):
    """Return the article body as one whitespace-normalised string, or None.

    The body is taken from the ``p`` elements inside the first ``div`` whose
    class is ``section-content clearfix margin-bottom-2``. None is returned
    when that div is missing or holds no paragraphs.
    """
    body = soup.find('div', class_='section-content clearfix margin-bottom-2')
    if not body:
        return None

    pieces = []
    for paragraph in body.find_all('p'):
        pieces.append(paragraph.get_text(separator='\n').strip())

    if not pieces:
        return None

    # Joining and re-splitting collapses newlines and repeated spaces.
    return ' '.join(' '.join(pieces).split())

In [None]:
def tbsnews_english(urls):
  """Download English article pages and extract headline, summary and body.

  Args:
    urls: Iterable of article URLs.

  Returns:
    A list of dicts with the keys 'URL', 'Headline', 'Summary' and 'Article',
    one per article for which all three fields were found. Articles that
    cannot be downloaded or that miss a field are reported with print and
    skipped; they never abort the run.
  """
  data = []

  for article_url in urls:
    try:
      # The timeout keeps a single stalled request from hanging the crawl.
      response = requests.get(article_url, timeout=30)
    except requests.RequestException as error:
      # One unreachable article must not throw away everything scraped so far.
      print(f'Failed to retrieve content from {article_url}')
      print(error)
      continue

    soup = BeautifulSoup(response.content, 'html.parser')
    headline = tbsnews_english_headline(soup)
    summary = tbsnews_english_summary(soup)
    article = tbsnews_english_article(soup)

    if any(value is None for value in (article_url, headline, summary, article)):
      print(f'Failed to retrieve content from {article_url}')
      # Show which of the fields was missing.
      print([headline, summary, article])
    else:
      data.append({'URL': article_url, 'Headline': headline, 'Summary': summary, 'Article': article})
      print(f'Successfully scraped content from {article_url}')

  return data

In [None]:
def scrape_english_articles(base_url, max_pages=None):
  """Walk the paginated English listing and scrape every linked article.

  Args:
    base_url: Listing URL; ``?page=<n>`` is appended, starting at page 0.
    max_pages: Maximum number of listing pages to read. None means no limit:
      paging continues until a page contributes no new article link.

  Returns:
    Whatever ``tbsnews_english`` returns for the collected, de-duplicated
    article links (kept in first-seen order).
  """
  all_article_links = []
  seen_links = set()
  page = 0

  # `page` is zero-based, so `<` (not `<=`) reads exactly `max_pages` pages.
  while max_pages is None or page < max_pages:
    url = f'{base_url}?page={page}'

    new_links = []
    for link in tbsnews_english_links(url):
      if link not in seen_links:
        seen_links.add(link)
        new_links.append(link)

    # Stop on an empty page, and also when a page only repeats known links;
    # otherwise a site that keeps serving its last page would loop forever.
    if not new_links:
      break

    all_article_links.extend(new_links)
    page += 1

  return tbsnews_english(all_article_links)

In [None]:
# Listing page of the English economy section.
tbnews_base_english_url = "https://www.tbsnews.net/economy"
# Output CSV on the mounted Google Drive; rebinds `filename` from the Bangla run.
filename = '/content/drive/MyDrive/tbnews_english_economic_news_dataset.csv'
# CSV columns; they match the keys of the row dicts built by tbsnews_english.
fields = ['URL', 'Headline', 'Summary', 'Article']

# max_pages=None sets no page limit: scrape_english_articles pages until it stops finding links.
data = scrape_english_articles(tbnews_base_english_url, max_pages=None)
write_to_csv(data, filename, fields)

Failed to retrieve content from https://www.tbsnews.net/economy/banking/independent-directors-family-barred-holding-shares-bank-793150
["Independent director's family barred from holding shares in the bank", None, 'To ensure good governance, the Bangladesh Bank has made it mandatory for an independent director to be between 45 and 75 years old, and none of their family members can hold shares in the bank. "If any complaint is proved against an independent director, the Bangladesh Bank will take action against him as per the Bank Company Act," according to a circular issued by the central bank today. In addition to having 10 years of management or professional experience, independent bank directors must hold a master\'s or post-master\'s degree in economics, banking, finance, business administration, law, or accounting from a recognised university. Additionally, independent directors cannot be tax defaulters or declared bankrupt by a court. A person listed as a willful defaulter is inel