# Clark University Catalog Link Extractor

This notebook extracts all links from the Clark University catalog page.

In [3]:
# Import required libraries
import re
from urllib.parse import urljoin, urlsplit

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:
# Catalog page to scrape: site origin + path first, query string second
url = (
    "https://catalog.clarku.edu/content.php"
    "?catoid=34&navoid=2847&print"
)

# Function to fetch the webpage content
def fetch_page(url, timeout=30):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook cell on a stalled connection.

    Returns:
        The response body as a string, or ``None`` when the request fails
        (connection problem, timeout, or HTTP error status). The failure
        reason is printed rather than raised.
    """
    # Send a browser-like User-Agent instead of the requests default.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
        return response.text
    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        print(f"Error fetching the page: {e}")
        return None

In [5]:
# Download the catalog page, then report whether any content came back
html_content = fetch_page(url)

print("Page fetched successfully!" if html_content else "Failed to fetch page.")

Page fetched successfully!


In [6]:
# Parse HTML and extract links
def extract_links(html_content):
    """Collect every anchor that carries an ``href`` attribute.

    Args:
        html_content: HTML markup to parse.

    Returns:
        A list of dicts, one per ``<a href=...>`` tag in document order, each
        with the anchor's whitespace-stripped text under ``'text'`` and the
        raw attribute value under ``'href'``.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    anchors = parsed.find_all('a', href=True)
    return [
        {'text': anchor.text.strip(), 'href': anchor['href']}
        for anchor in anchors
    ]

# Pull the links out of the fetched page; fall back to an empty list on failure
links = extract_links(html_content) if html_content else []
if html_content:
    print(f"Found {len(links)} links on the page")

Found 377 links on the page


In [7]:
# Create a DataFrame to display and analyze the links
if links:
    df_links = pd.DataFrame(links)

    # Show the first 10 links. An expression nested inside an `if` block is
    # not the cell's final top-level expression, so Jupyter would silently
    # discard it; display() (provided by the IPython kernel) renders it.
    display(df_links.head(10))

In [8]:
# Filter links whose href points at a course or program preview page
if links:
    course_links = [
        link for link in links
        if 'preview_course' in link['href'] or 'preview_program' in link['href']
    ]

    print(f"Found {len(course_links)} potential course/program links")

    # Render the preview explicitly: a bare expression nested inside `if`
    # blocks is never shown by Jupyter, so use the kernel's display().
    if course_links:
        display(pd.DataFrame(course_links).head())

Found 186 potential course/program links


In [9]:
# Fix relative URLs by adding the base URL
def fix_relative_urls(links, base_url="https://catalog.clarku.edu/"):
    """Rewrite relative ``href`` values as absolute URLs, in place.

    Hrefs that already carry a scheme (``http:``, ``https:``, ``mailto:``,
    ``javascript:``, ``tel:``, ...) are left untouched, as are empty or
    missing hrefs. Everything else is resolved against ``base_url`` with
    ``urllib.parse.urljoin``, so protocol-relative hrefs (``//host/path``)
    pick up the base scheme instead of being glued onto the base host.

    Args:
        links: List of dicts with an ``'href'`` key. The dicts are modified
            in place.
        base_url: URL that relative hrefs are resolved against. A trailing
            slash is optional.

    Returns:
        The same ``links`` list that was passed in.
    """
    # Normalise to exactly one trailing slash so a plain relative href is
    # appended to the base path rather than replacing its last segment.
    base = base_url.rstrip('/') + '/'
    for link in links:
        href = link['href']
        if not href:
            continue
        try:
            if urlsplit(href).scheme:
                # Already absolute, or a non-HTTP scheme that must not be prefixed.
                continue
            link['href'] = urljoin(base, href)
        except ValueError:
            # Malformed href the URL parser rejects; keep the original value.
            continue
    return links

# Convert relative hrefs to absolute URLs
if links:
    links = fix_relative_urls(links)

    # Show the updated links. The preview sits inside the `if`, so it must go
    # through display() (provided by the IPython kernel) to be rendered.
    display(pd.DataFrame(links).head(5))

In [10]:
# Write the collected links to a CSV file; nothing is written when there are none
if links:
    csv_filename = "clarku_catalog_links.csv"
    df_links = pd.DataFrame(links)
    # index=False leaves the DataFrame's row index out of the file
    df_links.to_csv(csv_filename, index=False)
    print(f"Links saved to {csv_filename}")

Links saved to clarku_catalog_links.csv


## Summary

This notebook has:
1. Fetched the Clark University catalog page
2. Extracted all links from the page
3. Filtered the links for likely course and program pages
4. Converted relative URLs to absolute URLs
5. Saved the links to a CSV file

You can further filter or analyze these links as needed for your specific use case.