In [30]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
import json
import time

# Base URL of the blog page to scrape, with pagination handled by adding '?page='
base_url = "https://www.kaggle.com/datasets/sahilkirpekar/bbcnews-dataset"

# Function to scrape articles without Selenium
def scrape_articles():
    articles = []
    page = 1  # Start from the first page

    while True:
        # Fetch page content
        response = requests.get(base_url + str(page))
        if response.status_code != 200:
            break  # Stop if no more pages

        # Parsing the page content
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract articles from the page
        article_elements = soup.select('.article')  # Adjust selector based on website structure
        if not article_elements:
            break  # Exit if no articles found (end of pagination)

        for article in article_elements:
            title = article.select_one('.title').get_text(strip=True)
            date = article.select_one('.date').get_text(strip=True)
            author = article.select_one('.author').get_text(strip=True)
            content = article.select_one('.content').get_text(strip=True)
            
            articles.append({
                "title": title,
                "date": date,
                "author": author,
                "content": content
            })

        page += 1  # Move to the next page
        time.sleep(1)  # Add delay to avoid overloading the server

    return articles

# Running the scraper function and saving data
articles = scrape_articles()

# Save the scraped articles to a JSON file
with open("scraped_articles.json", "w") as f:
    json.dump(articles, f, indent=2)


In [27]:
# Importing necessary libraries
import re
import pandas as pd

# Loading scraped data
with open("scraped_articles.json", "r") as f:
    articles = json.load(f)

# Function to clean data
def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.strip()

# Cleaning each article's content
cleaned_articles = []
for article in articles:
    article['content'] = clean_text(article['content'])
    cleaned_articles.append(article)

# Saving cleaned data to a DataFrame and exporting as CSV
df = pd.DataFrame(cleaned_articles)
df.to_csv("cleaned_articles.csv", index=False)


In [21]:
# Importing libraries for topic classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Preparing the data for classification
X = df['content']
y = df['category']  # Replace with actual categories if available

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorizing text data
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model training with Naive Bayes
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Making predictions and evaluating
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print(f"Classification Accuracy: {accuracy * 100:.2f}%")


KeyError: 'content'