In [5]:
from google.colab import drive
import requests
from bs4 import BeautifulSoup
import csv

In [6]:
# Mount Google Drive at /content/drive; the scraper below writes its CSV
# output to a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
def scrape_news_data(api_url_template, last_page,
                     output_path='/content/drive/My Drive/protidinerbangladesh.csv',
                     timeout=30):
    """Scrape paginated news listings and save the usable articles to a CSV file.

    For every page index from 0 through ``last_page`` (inclusive) the listing
    endpoint ``{api_url_template}/{page}/20`` is requested and its body decoded
    as JSON. Each listed item's ``url`` is handed to ``get_article_content``;
    items that yield both a non-empty summary and non-empty body are collected
    as ``[url, headline, summary, content]`` rows. After all pages have been
    processed the rows are written with ``save_to_csv``.

    A page or article that cannot be fetched or decoded is reported and
    skipped, so a single transient failure does not discard the rows already
    collected.

    Args:
        api_url_template: Listing endpoint prefix (no trailing slash).
        last_page: Highest page index to request.
        output_path: Destination CSV path passed to ``save_to_csv``.
        timeout: Seconds to wait for each listing request before giving up.

    Returns:
        None. Progress (saved article URLs and finished page indexes) is
        printed, and the result is written to ``output_path``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}
    rows = []

    for page in range(last_page + 1):
        # NOTE(review): the trailing "/20" is presumably the page size -- confirm against the API.
        api_url = f"{api_url_template}/{page}/20"

        try:
            # Without a timeout a stalled connection would block the whole run.
            response = requests.get(api_url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch data for page {page}. Error: {exc}")
            continue

        if response.status_code != 200:
            print(f"Failed to fetch data for page {page}. Status Code: {response.status_code}")
            continue

        try:
            news_data = response.json()
        except ValueError as exc:
            # Response.json() signals a non-JSON body with a ValueError subclass.
            print(f"Failed to decode JSON for page {page}. Error: {exc}")
            continue

        if not isinstance(news_data, list):
            print(f"Unexpected payload for page {page}: expected a list of news items.")
            continue

        for news_item in news_data:
            if not isinstance(news_item, dict):
                continue

            url = news_item.get('url', '')
            if not url:
                # Nothing to download; requesting an empty URL would raise.
                continue
            headline = news_item.get('headline', '')

            try:
                article_summary, article_content = get_article_content(url, headers)
            except requests.RequestException as exc:
                print(f"Failed to fetch article {url}. Error: {exc}")
                continue

            if article_summary and article_content:
                # Keep every article on a single physical line in the CSV.
                article_content = article_content.replace('\n', ' ').replace('\r', '')
                rows.append([url, headline, article_summary, article_content])
                print(url)

        print(page)

    save_to_csv(rows, output_path)

In [12]:
def get_article_content(article_url, headers, timeout=30):
    """Download one article page and split its text into summary and body.

    The page is parsed with BeautifulSoup and the first ``<div class="newsBody">``
    is located. Every ``<p>`` inside it that contains a ``<b>`` or ``<strong>``
    element is treated as summary text; all other paragraphs form the body.
    Runs of whitespace in both results are collapsed to single spaces.

    Args:
        article_url: Address of the article page. Empty or ``None`` yields
            ``("", "")`` without issuing a request.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(summary, content)`` tuple of strings. ``("", "")`` is returned
        when the URL is missing, the request fails, the status is not 200, or
        the page has no ``newsBody`` container.
    """
    if not article_url:
        return "", ""

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(article_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report and fall back to the same "nothing found" result as other failures.
        print(f"Failed to fetch article {article_url}. Error: {exc}")
        return "", ""

    if response.status_code != 200:
        return "", ""

    # NOTE(review): response.text relies on the encoding requests infers from
    # the HTTP headers -- confirm the site declares its charset.
    soup = BeautifulSoup(response.text, 'html.parser')

    article_content_div = soup.find('div', class_='newsBody')
    if article_content_div is None:
        return "", ""

    summary_parts = []
    body_parts = []
    for paragraph in article_content_div.find_all('p'):
        text = paragraph.get_text(strip=True)
        # A paragraph with any bold markup counts, in full, as summary text.
        if paragraph.find('b') or paragraph.find('strong'):
            summary_parts.append(text)
        else:
            body_parts.append(text)

    # split()/join collapses all whitespace runs (including newlines) to one space.
    article_summary = ' '.join(' '.join(summary_parts).split())
    article_content = ' '.join(' '.join(body_parts).split())
    return article_summary, article_content

In [13]:
def save_to_csv(rows, file_path):
    """Write the scraped rows to a UTF-8 CSV file, preceded by a header row.

    Args:
        rows: Iterable of row sequences, written in order after the header
            ``URL, Headline, Summary, Article``.
        file_path: Destination path; an existing file is overwritten.

    Returns:
        None. A confirmation line naming ``file_path`` is printed.
    """
    header = ['URL', 'Headline', 'Summary', 'Article']

    # newline='' leaves line-ending handling to the csv module.
    with open(file_path, mode='w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(header)
        for record in rows:
            writer.writerow(record)

    print(f"Data saved to {file_path}")

In [14]:
# Listing endpoint prefix; scrape_news_data appends "/{page}/20" to it for each page.
api_url_template = "https://protidinerbangladesh.com/ajax/load/categorynews/12/20"
# Highest page index to request; page indexes 0 through last_page inclusive are fetched.
last_page = 277

# Run the full scrape: prints each saved article URL and each finished page index,
# then writes the collected rows to the CSV file.
scrape_news_data(api_url_template, last_page)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
https://protidinerbangladesh.com/business/72678/পেনিনসুলা-হোটেলে-ইন্ডিয়ান-স্পাইস-ভয়েজ-ফুড-ফ্যাস্টিভ্যাল-শুরু
https://protidinerbangladesh.com/business/72647/জাতীয়-রপ্তানি-ট্রফি-অর্জন-করল-স্নোটেক্স-গ্রুপ
https://protidinerbangladesh.com/business/72576/কার-লাভ-কার-ক্ষতি
https://protidinerbangladesh.com/business/72545/আড়াই-শতাংশের-বেশি-রেমিট্যান্স-প্রণোদনা-না-দেওয়ার-দাবি
31
https://protidinerbangladesh.com/business/72542/শেষ-একনেকে-রেকর্ড-৪৪-প্রকল্প-অনুমোদন
https://protidinerbangladesh.com/business/72540/রপ্তানি-বাড়াতে-লালফিতার-দৌরাত্ম্য-দূর-করতে-হবে
https://protidinerbangladesh.com/national/72535/ধীরে-ধীরে-ভর্তুকি-প্রবণতা-থেকে-বেরিয়ে-আসার-নির্দেশনা-প্রধানমন্ত্রীর
https://protidinerbangladesh.com/country/72531/এক-সপ্তায়-ডিমের-দাম-কমলো-তিন-টাকা
https://protidinerbangladesh.com/business/72525/ন্যাশনাল-ব্যাংকের-তালতলা-উপশাখার-উদ্বোধন
https://protidinerbangladesh.com/business/72524/প্রাইম-ব্যাংকের-নীরা-এবং-কন্যা-ওয়েলবিং-লিমি