<a href="https://colab.research.google.com/github/mltngpot/Describer/blob/matt-parser/Project3_HTMLParser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import requests
import csv
from bs4 import BeautifulSoup
import time

# Function to process a single URL
def process_article(url, timeout=30):
    """Download one article page and extract its title, body text and images.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A tuple ``(article_title, article_text, images_data)``:

        * ``article_title`` -- stripped text of ``<h1 id="main-heading">``, or
          ``"Title not found"`` when that element is absent.
        * ``article_text`` -- stripped text of every ``<p>`` and ``<li>`` found
          inside ``<div class="post-content__body">``, joined by newlines
          (empty string when the div is absent).
        * ``images_data`` -- list of ``(image_url, alt_text)`` tuples for each
          ``<img>`` with a ``src`` inside
          ``<div class="post-content__thumbnail">``.

        When the page cannot be retrieved (network error, timeout or a
        non-200 status) the function prints a message and returns
        ``(None, None, [])``.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat connection failures like a bad status code so that one
        # unreachable page does not abort a whole batch run.
        print(f"Failed to retrieve the page: {url}. Error: {exc}")
        return None, None, []

    if response.status_code != 200:
        print(f"Failed to retrieve the page: {url}. Status code: {response.status_code}")
        return None, None, []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the title
    title = soup.find('h1', id='main-heading')
    article_title = title.text.strip() if title else "Title not found"

    # Extract the article text from class "post-content__body"
    content_div = soup.find('div', class_='post-content__body')
    article_text = ""
    if content_div:
        article_text = "\n".join(
            node.text.strip() for node in content_div.find_all(['p', 'li'])
        )

    # Extract image metadata from class "post-content__thumbnail"
    thumbnail_div = soup.find('div', class_='post-content__thumbnail')
    images_data = []
    if thumbnail_div:
        for idx, img in enumerate(thumbnail_div.find_all('img'), start=1):
            image_url = img.get('src')
            if not image_url:
                # An <img> without a usable 'src' has nothing to record.
                continue
            # The placeholder is used only when the 'alt' attribute is missing
            # entirely; an explicit empty alt is kept as-is.
            alt_text = img.get('alt', f"This is a value for image {idx}")
            images_data.append((image_url, alt_text))
    return article_title, article_text, images_data

# Function to loop over URLs from the CSV file
def process_articles_from_csv(input_csv, article_output_csv, image_output_csv, wait_time=2, year='2023'):
    """Scrape every article listed in a CSV for one year and save the results.

    Args:
        input_csv: Path of a CSV file with at least ``Year`` and ``URL``
            columns.
        article_output_csv: Path of the CSV to create with one
            ``Title, Article Text`` row per successfully scraped article.
        image_output_csv: Path of the CSV to create with one
            ``file_url, additional_feature`` row per image found.
        wait_time: Seconds to sleep after each processed URL.
        year: Only rows whose ``Year`` column equals this value (compared as
            a string) are processed. Defaults to ``'2023'``.

    Raises:
        KeyError: If a row lacks the ``Year`` or ``URL`` column.
        OSError: If any of the files cannot be opened.
    """
    target_year = str(year)

    # newline='' is what the csv module expects for files it reads or writes;
    # 'utf-8-sig' reads plain UTF-8 and also strips a leading BOM, which would
    # otherwise end up glued to the first header name.
    with open(input_csv, 'r', newline='', encoding='utf-8-sig') as csvfile, \
         open(article_output_csv, 'w', newline='', encoding='utf-8') as article_file, \
         open(image_output_csv, 'w', newline='', encoding='utf-8') as image_file:

        reader = csv.DictReader(csvfile)

        # Write headers for output CSVs
        article_writer = csv.writer(article_file)
        article_writer.writerow(["Title", "Article Text"])  # Article metadata header

        image_writer = csv.writer(image_file)
        image_writer.writerow(["file_url", "additional_feature"])  # Image metadata header

        processed = 0  # running count of URLs attempted, used for progress output
        for row in reader:
            if row['Year'] != target_year:
                continue

            url = row['URL']
            processed += 1
            print(f"{processed}. Processing URL: {url}")
            article_title, article_text, images_data = process_article(url)

            # Write article metadata; skipped when the fetch failed or the
            # page had no body text.
            if article_title and article_text:
                article_writer.writerow([article_title, article_text])

            # Write image metadata, one (url, alt text) row per image.
            image_writer.writerows(images_data)

            # Wait before the next request
            time.sleep(wait_time)

# Locations of the input listing and the two result files
input_csv = "purdue_articles.csv"          # Input CSV with Year and URL
article_output_csv = "article_metadata.csv"  # Output CSV for articles
image_output_csv = "image_metadata.csv"      # Output CSV for image metadata

# Scrape the listed articles, pausing 5 seconds after each request
process_articles_from_csv(
    input_csv=input_csv,
    article_output_csv=article_output_csv,
    image_output_csv=image_output_csv,
    wait_time=5,
)


1. Processing URL: https://www.purdue.edu/newsroom/2023/Q4/purdue-president-mung-chiangs-end-of-year-wrap-up-on-university-successes
2. Processing URL: https://www.purdue.edu/newsroom/purduetoday/2023/Q4/chl-program-focuses-on-taking-control-of-type-2-diabetes-prediabetes-register-by-jan-10
3. Processing URL: https://www.purdue.edu/newsroom/purduetoday/2023/Q4/stay-safe-and-injury-free-this-winter-top-tips-for-preventing-falls-and-walking-safely
4. Processing URL: https://www.purdue.edu/newsroom/purduetoday/2023/Q4/employees-can-now-connect-with-their-purdue-retirement-program-on-fidelity-netbenefits
5. Processing URL: https://www.purdue.edu/newsroom/purduetoday/2023/Q4/virtual-healthkick-program-focuses-on-physical-activity-nutrition-more-register-by-jan-3
6. Processing URL: https://www.purdue.edu/newsroom/2023/Q4/purdue-reputation-in-space-brings-better-understanding-of-the-stars-planets-and-everything-in-between
7. Processing URL: https://www.purdue.edu/newsroom/purduetoday/2023/Q4/