In [5]:
import requests
import re
import os
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# Download the stopwords corpus only when it is not already installed, so
# re-running this cell needs no network access once the data is cached.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stored as a set for fast membership tests while filtering words.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\onkar\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [19]:
# Scrape targets: maps each of the 20 category names to the three landing-page
# URLs fetched for it. The category name is also used verbatim (spaces
# included) as the stem of the output file names built in the scraping loop.
categories = {
    "Technology News": [
        "https://techcrunch.com/",
        "https://www.theverge.com/",
        "https://www.wired.com/"
    ],
    "World News": [
        "https://www.bbc.com/news/world",
        "https://www.aljazeera.com/",
        "https://apnews.com/hub/world-news"
    ],
    "Sports": [
        "https://www.espn.com/",
        "https://www.skysports.com/",
        "https://www.bbc.com/sport"
    ],
    "Entertainment": [
        "https://variety.com/",
        "https://www.hollywoodreporter.com/",
        "https://ew.com/"
    ],
    "Science": [
        "https://www.scientificamerican.com/",
        "https://phys.org/",
        "https://www.livescience.com/"
    ],
    "Health": [
        "https://www.webmd.com/",
        "https://www.mayoclinic.org/",
        "https://www.health.com/"
    ],
    "Business": [
        "https://www.forbes.com/",
        "https://www.cnbc.com/",
        "https://www.ft.com/"
    ],
    "Politics": [
        "https://www.npr.org/sections/politics/",
        "https://www.bbc.com/news/politics",
        "https://fivethirtyeight.com/"
    ],
    "Environment": [
        "https://www.nationalgeographic.com/environment/",
        "https://www.ecowatch.com/",
        "https://www.treehugger.com/"
    ],
    "Education": [
        "https://www.edweek.org/",
        "https://www.insidehighered.com/",
        "https://www.chronicle.com/"
    ],
    "Travel": [
        "https://www.lonelyplanet.com/",
        "https://www.roughguides.com/",
        "https://www.cntraveler.com/"
    ],
    "Food": [
        "https://www.epicurious.com/",
        "https://www.allrecipes.com/",
        "https://www.bonappetit.com/"
    ],
    "Fashion": [
        "https://www.vogue.com/",
        "https://www.elle.com/",
        "https://www.harpersbazaar.com/"
    ],
    "Automotive": [
        "https://www.caranddriver.com/",
        "https://www.motortrend.com/",
        "https://www.autocar.co.uk/"
    ],
    "Real Estate": [
        "https://www.realtor.com/news/",
        "https://www.redfin.com/blog/",
        "https://www.trulia.com/blog/"
    ],
    "Personal Finance": [
        "https://www.nerdwallet.com/",
        "https://www.bankrate.com/",
        "https://www.investopedia.com/"
    ],
    "Gaming": [
        "https://www.ign.com/",
        "https://www.gamespot.com/",
        "https://kotaku.com/"
    ],
    "Parenting": [
        "https://www.parents.com/",
        "https://www.babycenter.com/",
        "https://www.whattoexpect.com/"
    ],
    "DIY and Crafts": [
        "https://www.instructables.com/",
        "https://www.apartmenttherapy.com/",
        "https://www.diyncrafts.com/"
    ],
    "Fitness": [
        "https://www.bodybuilding.com/",
        "https://www.menshealth.com/fitness/",
        "https://www.shape.com/fitness"
    ]
}


In [None]:
def scrape_website(url):
    """Fetch a page and return its title and concatenated paragraph text.

    Parameters
    ----------
    url : str
        Address of the page to download.

    Returns
    -------
    tuple
        ``(title, content)``. ``title`` is the text of the ``<title>`` tag, or
        ``"No Title"`` when the page has none. ``content`` is the text of every
        ``<p>`` element joined by single spaces. ``(None, None)`` is returned
        when the request fails or the server answers with an HTTP error
        status; the error is printed rather than raised.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None, None

    # Parse the raw bytes instead of response.text: when the server sends a
    # text/* Content-Type without a charset, Requests decodes as ISO-8859-1,
    # which garbles UTF-8 pages. Given bytes, BeautifulSoup detects the
    # encoding itself (including <meta charset>). The HTTP charset is passed
    # along only when the server actually declared one.
    content_type = response.headers.get('Content-Type', '')
    declared_encoding = response.encoding if 'charset' in content_type.lower() else None
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

    title = soup.title.text if soup.title else "No Title"
    content = ' '.join(p.text for p in soup.find_all('p'))

    return title, content


In [None]:
def clean_text(text):
    """Normalise scraped text into lowercase, stopword-free words.

    Steps, in order: drop anything that looks like an HTML tag, delete every
    character that is neither an ASCII letter nor whitespace, lowercase the
    result, then discard words found in the module-level ``stop_words`` set.

    Returns the remaining words joined by single spaces.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_tags)

    kept_words = []
    for token in letters_only.lower().split():
        if token not in stop_words:
            kept_words.append(token)

    return ' '.join(kept_words)


In [12]:
# Function to save data to a text file
def save_to_file(category, text):
    """Write ``text`` to ``<category>.txt`` as UTF-8, replacing any existing file.

    Parameters
    ----------
    category : str
        Output path without the extension; ``".txt"`` is appended. Despite the
        name it may include directory components, which are created when
        missing.
    text : str
        Content to write.
    """
    filename = f"{category}.txt"

    # Create the target directory up front so that writing into a folder that
    # does not exist yet no longer fails with FileNotFoundError.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as file:
        file.write(text)

In [25]:
# Output locations: cleaned files go in OUTPUT_DIR, raw scrapes in RAW_DIR.
OUTPUT_DIR = "WebText-20"
RAW_DIR = "WebText-20/raw_files"

# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists test.
os.makedirs(RAW_DIR, exist_ok=True)

for category, urls in categories.items():
    print(f"\nScraping category: {category}")

    # Collect one section per successfully scraped page and join them once,
    # instead of growing a string inside the loop.
    sections = []
    for url in urls:
        title, content = scrape_website(url)
        # Failed requests return None and pages without <p> text return "";
        # both are skipped.
        if content:
            sections.append(f"\nTitle: {title}\nContent: {content}\n")
    all_text = "".join(sections)

    # save_to_file appends ".txt" to the path it is given.
    raw_file_path = os.path.join(RAW_DIR, f"{category}_raw")
    save_to_file(raw_file_path, all_text)

    cleaned_text = clean_text(all_text)

    cleaned_file_path = os.path.join(OUTPUT_DIR, f"{category}_cleaned")
    save_to_file(cleaned_file_path, cleaned_text)

    print(f" Saved {category} data ({len(cleaned_text)} characters)")

print("\n Web scraping and text processing complete! Check the 'WebText-20' folder.")



Scraping category: Technology News
 Saved Technology News data (6375 characters)

Scraping category: World News
 Saved World News data (4030 characters)

Scraping category: Sports
 Saved Sports data (6330 characters)

Scraping category: Entertainment
 Saved Entertainment data (4772 characters)

Scraping category: Science
 Saved Science data (16062 characters)

Scraping category: Health
 Saved Health data (1636 characters)

Scraping category: Business
 Saved Business data (4981 characters)

Scraping category: Politics
 Saved Politics data (8550 characters)

Scraping category: Environment
 Saved Environment data (7829 characters)

Scraping category: Education
 Saved Education data (3451 characters)

Scraping category: Travel
 Saved Travel data (3415 characters)

Scraping category: Food
 Saved Food data (1423 characters)

Scraping category: Fashion
 Saved Fashion data (1571 characters)

Scraping category: Automotive
 Saved Automotive data (3710 characters)

Scraping category: Real Estate