# Web Scraping Project

This notebook contains the web scraping process for extracting content from specified Northeastern University catalog pages, including their subpages. The content will be converted from HTML to Markdown format.

## Task 1: Scrape Content from Specified URLs and Subpages

This task involves:
- Downloading the main pages at the provided URLs.
- Identifying and downloading the subpages for each main URL.
- Converting each page's HTML content to Markdown format.

Each page is represented as a dictionary with two keys:
- `url`: URL of the page.
- `content`: Content of the page in Markdown format as a string.

In [22]:

import re
import time
from urllib.parse import urldefrag, urljoin

import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md


In [None]:

def get_page_content(url, timeout=10):
    """Fetch the HTML content of the given URL and return it as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole crawl.

    Returns:
        The response body as a string, or ``None`` if the request failed
        (connection error, timeout, or a non-2xx status code).
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they are reported below
        response.raise_for_status()
        return response.text
    except requests.RequestException as e:
        # Best-effort crawl: report the failure and let the caller skip the page
        print(f"Failed to retrieve {url}: {e}")
        return None

# Function to find subpage URLs within the page content
def find_subpages(main_url, html_content):
    """Extract subpage URLs that start with the main URL from HTML content.

    Every ``<a href=...>`` is resolved against ``main_url`` (so relative,
    root-relative and absolute links are all handled), its ``#fragment`` is
    dropped, and the result is kept only if it lies under ``main_url``.

    Args:
        main_url: URL of the page that ``html_content`` was downloaded from;
            also the prefix a link must have to count as a subpage.
        html_content: Raw HTML of that page.

    Returns:
        A sorted list of unique absolute subpage URLs. PDF links and links
        back to ``main_url`` itself are excluded.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    subpage_links = set()
    for link in soup.find_all('a', href=True):
        # urljoin leaves absolute hrefs untouched and resolves relative ones;
        # urldefrag strips "#section" so anchors do not look like new pages.
        absolute_url, _fragment = urldefrag(urljoin(main_url, link['href'].strip()))
        if absolute_url == main_url:
            continue  # The page itself is not one of its own subpages
        # Only add links that start with the main URL
        if absolute_url.startswith(main_url) and not absolute_url.lower().endswith('.pdf'):
            subpage_links.add(absolute_url)
    # Sorted so the crawl order is reproducible between runs
    return sorted(subpage_links)

def convert_to_markdown(html_content):
    """Return the Markdown rendering of the given HTML string."""
    markdown_text = md(html_content)
    return markdown_text


# Recursive function for scraping main page and subpages
def scrape_recursive(url, visited_pages, scraped_pages):
    """Recursively scrape a page and its subpages.

    Args:
        url: Page to scrape. Subpages are looked up relative to this URL, so
            each level of recursion only follows links beneath the current page.
        visited_pages: Set of URLs already attempted; updated in place.
        scraped_pages: List collecting one ``{"url", "content"}`` dictionary
            per successfully downloaded page; updated in place.

    Returns:
        None. Results are accumulated in ``scraped_pages``.
    """
    if url in visited_pages:
        return  # Skip already visited pages

    # Mark the page before fetching so a URL that fails to download is not
    # requested again every time another page links to it.
    visited_pages.add(url)

    print(f"Scraping: {url}")
    content = get_page_content(url)
    if not content:
        return  # Download failed or page was empty; nothing to record

    scraped_pages.append({
        "url": url,
        "content": convert_to_markdown(content)
    })

    # Find subpages and recurse into each one
    subpages = find_subpages(url, content)
    print(f"Found {len(subpages)} subpages for {url}")

    for subpage_url in subpages:
        if subpage_url in visited_pages:
            continue  # No request will be made, so no delay is needed
        time.sleep(1)  # Delay before each new request to avoid server overload
        scrape_recursive(subpage_url, visited_pages, scraped_pages)

# Initial URLs from which the recursive scraping starts
# (defined once; the list was previously repeated verbatim)
main_urls = [
    "https://catalog.northeastern.edu/undergraduate/computer-information-science/",
    "https://catalog.northeastern.edu/graduate/computer-information-science/"
]


In [24]:

# Task 1: Download main pages, find subpages, and convert content to markdown

# Crawl state shared across all starting URLs
visited_pages = set()
scraped_pages = []

# Crawl outward from every main URL in turn
for url in main_urls:
    scrape_recursive(url, visited_pages, scraped_pages)

print("Recursive scraping completed.")


Scraping: https://catalog.northeastern.edu/undergraduate/computer-information-science/
Found 0 subpages for https://catalog.northeastern.edu/undergraduate/computer-information-science/
Scraping: https://catalog.northeastern.edu/graduate/computer-information-science/
Found 0 subpages for https://catalog.northeastern.edu/graduate/computer-information-science/
Recursive scraping completed.


In [21]:

# Display the first two results to verify; slicing avoids dumping the
# full Markdown of every scraped page into the notebook output
scraped_pages[:2]


[{'url': 'https://catalog.northeastern.edu/undergraduate/computer-information-science/',
 {'url': 'https://catalog.northeastern.edu/graduate/computer-information-science/',