In [None]:
!pip install bs4

Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [None]:
!pip install requests



In [None]:
import requests
from bs4 import BeautifulSoup
import csv
from urllib.parse import urljoin
import re
import json

In [None]:
# Mount Google Drive at /content/drive; the CSV files written further down are
# saved under /content/drive/MyDrive. This import only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Bangla

In [None]:
# Listing endpoint of the Bangla site. The `{}` after `start=` is a str.format
# placeholder that get_news_data fills with the offset of the first item to return;
# `count=20` is the number of items requested per call.
api_url = "https://www.ittefaq.com.bd/api/theme_engine/get_ajax_contents?widget=459&start={}&count=20&page_id=1094&subpage_id=0&author=0&tags=&archive_time=&filter="

In [None]:
def extract_news_data(html_content, base_url='https://www.ittefaq.com.bd/'):
    """Extract headline/URL pairs from a listing HTML fragment.

    Every ``<div class="title_holder">`` that contains both an
    ``<h2 class="title">`` and an ``<a class="link_overlay">`` with an ``href``
    yields one entry.

    Args:
        html_content: HTML markup of the listing (the ``html`` field of the API reply).
        base_url: URL against which each ``href`` is resolved. Protocol-relative
            links (``//host/path``) get the base's ``https`` scheme, absolute
            links are kept unchanged and path-relative links are attached to
            the base host.

    Returns:
        A list of ``{'headline': str, 'url': str}`` dicts in document order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    news_data = []

    for title_holder in soup.find_all('div', class_='title_holder'):
        title_tag = title_holder.find('h2', class_='title')
        link_tag = title_holder.find('a', class_='link_overlay')
        if not (title_tag and link_tag):
            continue

        href = link_tag.get('href')
        if not href:
            # An anchor without a target cannot be scraped later; skip it instead
            # of failing on string concatenation with None.
            continue

        # urljoin instead of a blind 'https:' prefix, so already-absolute or
        # path-relative hrefs do not turn into malformed URLs.
        news_data.append({'headline': title_tag.text.strip(), 'url': urljoin(base_url, href)})

    return news_data

In [None]:
def get_news_data(api_url, max_items=4400, page_size=20, timeout=30):
    """Collect headline/URL pairs by paging through the listing API.

    Args:
        api_url: URL template containing one ``{}`` placeholder, which is filled
            with the start offset of each page.
        max_items: Exclusive upper bound for the start offset (default 4400).
        page_size: Step between consecutive start offsets (default 20). It should
            match the ``count=`` value baked into ``api_url``.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The concatenated results of ``extract_news_data`` for every page that
        could be fetched and decoded. Pages that fail are reported with ``print``
        and skipped, so one bad page does not discard the results gathered so far.
    """
    news_data = []

    for start in range(0, max_items, page_size):
        url = api_url.format(start)

        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Error fetching data from {url}. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Error fetching data from {url}. Status code: {response.status_code}")
            continue

        try:
            # A reply without an 'html' field is treated as an empty page.
            html_content = json.loads(response.text).get('html') or ''
        except (ValueError, AttributeError) as exc:
            # ValueError: body is not JSON; AttributeError: JSON is not an object.
            print(f"Error decoding data from {url}. Error: {exc}")
            continue

        news_data.extend(extract_news_data(html_content))

    return news_data

In [None]:
def ittefaq_summary(soup):
    """Return the bold lead paragraph(s) of an article page, or None.

    Each ``<div class="viewport jw_article_body" itemprop="articleBody">`` is
    inspected in order. A div without any ``<p>`` is skipped. If the first
    ``<p>`` of a div contains a ``<strong>``, its text is collected; the first
    div whose leading ``<p>`` has no ``<strong>`` ends the scan.

    Returns:
        The collected texts (trailing whitespace removed, newlines replaced by
        spaces) joined with single spaces, or None when nothing was collected.
    """
    bodies = soup.find_all('div', class_='viewport jw_article_body', itemprop='articleBody')
    lead_texts = []

    for body in bodies:
        first_para = body.find('p')
        if not first_para:
            continue
        if not first_para.find('strong'):
            break
        raw_text = first_para.get_text(separator=' ')
        lead_texts.append(raw_text.rstrip().replace('\n', ' '))

    return ' '.join(lead_texts) if lead_texts else None

In [None]:
def ittefaq_article(soup):
    """Return the body text of an article page, or None.

    Collects every ``<p>`` that does not contain a ``<strong>`` from all
    ``<div class="viewport jw_article_body" itemprop="articleBody">`` elements.
    Each paragraph has trailing whitespace removed and newlines replaced by
    spaces before the pieces are joined with single spaces.

    Returns:
        The joined text with surrounding whitespace stripped, or None when no
        qualifying paragraph exists.
    """
    bodies = soup.find_all('div', class_='viewport jw_article_body', itemprop='articleBody')

    pieces = [
        para.get_text(separator=' ').rstrip().replace('\n', ' ')
        for body in bodies
        for para in body.find_all('p')
        if not para.find('strong')
    ]

    if not pieces:
        return None
    return ' '.join(pieces).strip()

In [None]:
def ittefaq(urls, timeout=30):
    """Download each listed article and extract its summary and body.

    Args:
        urls: Iterable of dicts with the keys ``'url'`` and ``'headline'``
            (the shape produced by ``extract_news_data``).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``'URL'``, ``'Headline'``, ``'Summary'``
        and ``'Article'`` for every article where all four values were found.
        Articles that cannot be downloaded or parsed are reported with
        ``print`` and left out, so one failure does not abort the whole run.
    """
    data = []

    for article_info in urls:
        article_url = article_info['url']
        headline = article_info['headline']

        try:
            response = requests.get(article_url, timeout=timeout)
        except requests.RequestException as exc:
            print(f'Failed to retrieve content from {article_url}')
            print(exc)
            continue

        if response.status_code != 200:
            # Do not parse an error page as if it were the article.
            print(f'Failed to retrieve content from {article_url}')
            print(f'Status code: {response.status_code}')
            continue

        soup = BeautifulSoup(response.content, 'html.parser')
        summary = ittefaq_summary(soup)
        article = ittefaq_article(soup)

        if None in [article_url, headline, summary, article]:
            print(f'Failed to retrieve content from {article_url}')
            print([headline, summary, article])
        else:
            data.append({'URL': article_url, 'Headline': headline, 'Summary': summary, 'Article': article})
            print(f'Successfully scraped content from {article_url}')

    return data

In [None]:
def write_to_csv(data, filename, fields):
    """Write a list of row dicts to a UTF-8 CSV file with a header row.

    Args:
        data: Iterable of dicts whose keys are the names in ``fields``.
        filename: Path of the CSV file; an existing file is overwritten.
        fields: Column names, in the order they are written.
    """
    with open(filename, 'w', encoding='utf-8', newline='') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fields)
        dict_writer.writeheader()
        for row in data:
            dict_writer.writerow(row)

    print(f'Data from all pages has been saved to {filename}')

In [None]:
# Bangla pipeline: gather headline/URL pairs from the listing API, download each
# article, then store the rows on the mounted Google Drive.
news_data_links = get_news_data(api_url)
data = ittefaq(news_data_links)
filename = '/content/drive/MyDrive/ittefaq_bangla.csv'
# Column order of the CSV; these are the keys of the dicts returned by ittefaq().
fields = ['URL', 'Headline', 'Summary', 'Article']
write_to_csv(data, filename, fields)

Output hidden; open in https://colab.research.google.com to view.

# English

In [None]:
def ittefaq_headline(soup):
    """Return the stripped text of the first ``<h1 class="title mb10">``, or None if absent."""
    heading = soup.find('h1', class_='title mb10')
    if not heading:
        return None
    return heading.get_text(strip=True)

In [None]:
def ittefaq_en_summary(soup):
    """Return the bold lead paragraph(s) of an English article page, or None.

    Each ``<div class="viewport jw_article_body" itemprop="articleBody">`` is
    inspected in order. A div without any ``<p>`` is skipped. If the first
    ``<p>`` of a div contains a ``<strong>``, its text is collected; the first
    div whose leading ``<p>`` has no ``<strong>`` ends the scan.

    Returns:
        The collected texts joined with single spaces and with every run of
        whitespace (including newlines) collapsed to one space, or None when
        nothing was collected.
    """
    div_elements = soup.find_all('div', class_='viewport jw_article_body', itemprop='articleBody')
    summary = []

    for div_element in div_elements:
        p_element = div_element.find('p')

        if p_element:
            if p_element.find('strong'):
                # Newlines must become spaces, not be deleted: removing them
                # fuses words that are separated only by a line break
                # ("foo\nbar" -> "foobar"). ittefaq_en_article does the same.
                summary.append(p_element.get_text(separator=' ').rstrip().replace('\n', ' '))
            else:
                break

    if summary:
        # split()/join collapses all remaining whitespace runs to single spaces.
        return ' '.join(' '.join(summary).split())

    return None

In [None]:
def ittefaq_en_article(soup):
    """Return the body text of an English article page, or None.

    Collects every ``<p>`` that does not contain a ``<strong>`` from all
    ``<div class="viewport jw_article_body" itemprop="articleBody">`` elements,
    joins them with spaces and collapses every run of whitespace to one space.

    Returns:
        The normalised text, or None when no qualifying paragraph exists.
    """
    bodies = soup.find_all('div', class_='viewport jw_article_body', itemprop='articleBody')

    pieces = []
    for body in bodies:
        pieces.extend(
            para.get_text(separator=' ').rstrip().replace('\n', ' ')
            for para in body.find_all('p')
            if not para.find('strong')
        )

    if not pieces:
        return None

    joined = ' '.join(pieces).strip()
    return ' '.join(joined.split())

In [None]:
def scrape_news(api_url, timeout=30):
    """Return the absolute article URLs found on one listing page.

    Args:
        api_url: Full URL of the listing API call.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list with one URL per ``<a class="link_overlay">`` that has an
        ``href``, resolved against the final response URL. An empty list is
        returned (after printing a message) when the request fails, the status
        is not 200 or the body is not JSON, so callers can always iterate over
        the result.
    """
    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data from {api_url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data from {api_url}. Status code: {response.status_code}")
        return []

    try:
        data = response.json().get('html', '')
    except (ValueError, AttributeError) as exc:
        # ValueError: body is not JSON; AttributeError: JSON is not an object.
        print(f"Failed to decode data from {api_url}. Error: {exc}")
        return []

    soup = BeautifulSoup(data, 'html.parser')
    base_url = response.url
    # href=True skips anchors without a target, which would raise KeyError below.
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', class_='link_overlay', href=True)]

In [None]:
def scrape_news(api_url, timeout=30):
    """Return the absolute article URLs found on one listing page.

    NOTE(review): the preceding cell also defines ``scrape_news``. This later
    definition is the one in effect after Run All; one of the two cells can be
    deleted.

    Args:
        api_url: Full URL of the listing API call.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list with one URL per ``<a class="link_overlay">`` that has an
        ``href``, resolved against the final response URL. An empty list is
        returned (after printing a message) when the request fails, the status
        is not 200 or the body is not JSON, so callers can always iterate over
        the result.
    """
    try:
        response = requests.get(api_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch data from {api_url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch data from {api_url}. Status code: {response.status_code}")
        return []

    try:
        data = response.json().get('html', '')
    except (ValueError, AttributeError) as exc:
        # ValueError: body is not JSON; AttributeError: JSON is not an object.
        print(f"Failed to decode data from {api_url}. Error: {exc}")
        return []

    soup = BeautifulSoup(data, 'html.parser')
    base_url = response.url
    # href=True skips anchors without a target, which would raise KeyError below.
    return [urljoin(base_url, a['href']) for a in soup.find_all('a', class_='link_overlay', href=True)]

In [None]:
def scrape_articles(api_url_pattern, num_pages, page_size=9, timeout=30):
    """Scrape headline, summary and body for the articles on the first listing pages.

    Args:
        api_url_pattern: Listing API URL ending in ``start=``; the offset and the
            remaining query parameters are appended here.
        num_pages: Number of listing pages to fetch. Exactly this many pages are
            requested; the previous ``range(0, num_pages * 9 + 1, 9)`` fetched
            one page more than asked for.
        page_size: Items per listing page, used both as the ``count=`` query
            value and as the offset step (default 9).
        timeout: Seconds to wait for each article request before giving up.

    Returns:
        A list of dicts with the keys ``'URL'``, ``'Headline'``, ``'Summary'``
        and ``'Article'`` for every article where all values were found.
        Articles that cannot be downloaded or parsed are reported with
        ``print`` and left out.
    """
    data = []

    for page_index in range(num_pages):
        start = page_index * page_size
        api_url = f"{api_url_pattern}{start}&count={page_size}&page_id=1094&subpage_id=0&author=0&tags=&archive_time=&filter="
        # scrape_news may yield nothing for a failed page; treat that as "no links".
        news_urls = scrape_news(api_url) or []

        for article_url in news_urls:
            try:
                response = requests.get(article_url, timeout=timeout)
            except requests.RequestException as exc:
                print(f'Failed to retrieve content from {article_url}')
                print(exc)
                continue

            if response.status_code != 200:
                # Do not parse an error page as if it were the article.
                print(f'Failed to retrieve content from {article_url}')
                print(f'Status code: {response.status_code}')
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            headline = ittefaq_headline(soup)
            summary = ittefaq_en_summary(soup)
            article = ittefaq_en_article(soup)

            if None in [article_url, headline, summary, article]:
                print(f'Failed to retrieve content from {article_url}')
                print([headline, summary, article])
            else:
                data.append({'URL': article_url, 'Headline': headline, 'Summary': summary, 'Article': article})
                print(f'Successfully scraped content from {article_url}')

    return data

In [None]:
def write_to_csv(data, filename, fields):
    """Save row dicts to ``filename`` as UTF-8 CSV, header first.

    Rebinds the helper of the same name defined in the Bangla section.

    Args:
        data: Iterable of dicts whose keys are the names in ``fields``.
        filename: Destination path; an existing file is overwritten.
        fields: Column names, in output order.
    """
    out_file = open(filename, mode='w', newline='', encoding='utf-8')
    with out_file:
        table = csv.DictWriter(out_file, fieldnames=fields)
        table.writeheader()
        for record in data:
            table.writerow(record)

    print(f'Data from all pages has been saved to {filename}')

In [None]:
# English pipeline. The pattern ends in `start=`; scrape_articles appends the
# offset and the remaining query parameters for each listing page.
api_url_pattern = "https://en.ittefaq.com.bd/api/theme_engine/get_ajax_contents?widget=28&start="
# Number of listing pages requested from scrape_articles.
num_pages = 8

data = scrape_articles(api_url_pattern, num_pages)

filename = '/content/drive/MyDrive/ittefaq_english_business_news_dataset.csv'
# Column order of the CSV; these are the keys of the dicts returned by scrape_articles().
fields = ['URL', 'Headline', 'Summary', 'Article']
write_to_csv(data, filename, fields)