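We'll use the `requests` library and BeautifulSoup, both standard tools for web scraping, plus pandas to tabulate the results. Because the NY Post blocks requests sent with the default Python `requests` User-Agent, we use the `fake_useragent` library to send a browser-like User-Agent header instead.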

In [47]:
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd

In [27]:
# Build the HTTP headers used by every request in this notebook: a randomly
# chosen browser User-Agent string from fake_useragent.
ua = UserAgent()
headers = {
    "User-Agent": ua.random,  # evaluated once here, so every later request reuses this same string
}

In [19]:
# Smoke test: fetch the first search-results page and parse it so the markup
# can be inspected by hand. Note that `testurl` holds the Response object,
# not a URL string.
testurl = requests.get(
    "https://nypost.com/search/congestion+pricing/",
    headers=headers,
    timeout=10,  # without a timeout the cell can hang forever if the server never answers
)
souptest = BeautifulSoup(testurl.content, 'html.parser')

In [54]:
def _strip_by_prefix(text):
    """Return ``text`` stripped, minus a leading "By" byline marker if there is one."""
    text = text.strip()
    rest = text[2:]
    # Only a standalone leading "By" is dropped ("By Jane Doe", "ByJane Doe");
    # a "By" that is part of a name such as "Byron" is left intact.
    if text.startswith("By") and (not rest or rest[0].isspace() or rest[0].isupper()):
        return rest.strip()
    return text


def _parse_story(story):
    """Turn one ``div.story__text`` element into a result dict.

    Every field starts from its "No ..." placeholder, so a story that lacks a
    headline, byline or excerpt can neither raise nor inherit values from the
    previously parsed story.
    """
    title, link = "No title", "No link"
    headline = story.find('h3', class_='story__headline')
    title_tag = headline.find('a') if headline is not None else None
    if title_tag is not None:
        title = title_tag.text.strip()
        if title_tag.has_attr('href'):
            link = title_tag['href']

    # The byline is split on "|". In the results previewed later in this
    # notebook, the part before "|" held the author name(s), a run of tabs and
    # the calendar date, while the part after "|" held only the time of day.
    # The keys stay 'author'/'date' because later cells rename and split them.
    author, date = "No author", "No date"
    meta = story.find('span', class_='meta meta--byline')
    if meta is not None:
        meta_parts = meta.text.strip().split('|')
        author = _strip_by_prefix(meta_parts[0])
        if len(meta_parts) > 1:
            date = meta_parts[1].strip()

    excerpt_tag = story.find('p', class_='story__excerpt')
    excerpt = excerpt_tag.text.strip() if excerpt_tag is not None else "No excerpt"

    return {
        'title': title,
        'link': link,
        'author': author,
        'date': date,
        'excerpt': excerpt
    }


def nypost_scraper(keyword, max_pages, timeout=10):
    """Scrape NY Post search-result pages for ``keyword``.

    Pages 1..``max_pages`` of ``https://nypost.com/search/<keyword>/page/<n>/``
    are fetched in order using the module-level ``headers``. Scraping stops at
    the first page that cannot be fetched (network error or non-200 status);
    stories collected up to that point are still returned.

    Parameters
    ----------
    keyword : str
        Search term, inserted verbatim into the URL path, so it must already
        be URL-ready (e.g. ``"congestion+pricing"``).
    max_pages : int
        Highest page number to request; values below 1 fetch nothing.
    timeout : float, optional
        Seconds passed to ``requests.get`` so an unresponsive server cannot
        hang the notebook. Defaults to 10.

    Returns
    -------
    list of dict
        One dict per story with the keys ``title``, ``link``, ``author``,
        ``date`` and ``excerpt``; missing pieces hold "No title", "No link",
        "No author", "No date" or "No excerpt".
    """
    base_url = "https://nypost.com/search/"
    results = []
    for page in range(1, max_pages + 1):
        url = f"{base_url}{keyword}/page/{page}/"
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Treat a network failure like a bad status: stop, but keep what we have.
            print(f"Failed to fetch page {page}: {exc}")
            break
        if response.status_code != 200:
            print(f"Failed to fetch page {page}")
            break
        soup = BeautifulSoup(response.content, 'html.parser')
        for story in soup.find_all('div', class_='story__text'):
            results.append(_parse_story(story))
    return results

In [68]:
# Scrape up to 20 pages of search results for the query, then load the
# list of per-story dicts into a DataFrame (one row per story).
results = nypost_scraper(
    "congestion+pricing",
    max_pages=20,
)
df = pd.DataFrame(results)

In [69]:
# Relabel the two byline-derived columns to match what they actually contain
# (see the preview that follows) before they are split apart.
df = df.rename(
    columns={
        'author': 'name_date',  # author name(s), a run of tabs, then the calendar date
        'date': 'time',         # the text after "|": time of day only
    }
)
df.head()

Unnamed: 0,title,link,name_date,time,excerpt
0,"Oregon effort to shift border, join conservati...",https://nypost.com/2025/02/17/us-news/eastern-...,"Charles Creitz, Fox News \t\t\t\tFebruary 17, ...",12:15pm,"""This movement has always been about the peopl..."
1,Luxury skincare sale! One of our favorite bran...,https://nypost.com/2025/02/17/shopping/shop-th...,"Victoria Giardina \t\t\t\tFebruary 17, 2025",7:00am,"Luxury, within reach."
2,NY's Gov. Hochul chimes in on E-ZPass texting ...,https://nypost.com/2025/02/16/us-news/hochul-c...,Carl Campanile and Jorge Fitz-Gibbon \t\t\t\tF...,5:22pm,Gov. Kathy Hochul chimed in and issued a warni...
3,Asian group tired of far-left Dems urges Andre...,https://nypost.com/2025/02/16/us-news/andrew-c...,"Carl Campanile \t\t\t\tFebruary 16, 2025",4:03pm,The Asian Wave Alliance advocates for merit-ba...
4,Homes prices and new developments are booming ...,https://nypost.com/2025/02/14/real-estate/home...,"David Christopher Kaufman \t\t\t\tFebruary 14,...",5:40pm,"The right track: It’s full steam ahead, as lux..."


In [70]:
# 'name_date' holds "<author(s)> <tabs> <calendar date>"; split it on the first run of tabs.
# n=1 caps the split at two pieces, and reindex() guarantees both columns exist even when
# no row contains a tab, so the assignments below cannot fail on a column-count mismatch.
name_date_parts = (
    df['name_date']
    .str.split(r'\t+', n=1, expand=True)
    .reindex(columns=[0, 1])
    .astype(object)  # keeps .str usable on a column that reindex() had to create as all-NaN
)
# The name is followed by a space before the tabs, so strip both pieces.
df['author'] = name_date_parts[0].str.strip()
df['date'] = name_date_parts[1].str.strip()
# Reassign instead of inplace=True so the cell reads as one clear transformation.
df = df.drop(columns=['name_date'])

In [72]:
# Persist the cleaned table. The DataFrame index is written as well (pandas' default),
# so the file's first column has an empty header; read it back with index_col=0.
df.to_csv('nyp_articles.csv')
# Keep the preview as the cell's last expression: Jupyter only renders the value of
# the final expression, so a df.head() placed before to_csv() is never displayed.
df.head()

# TODO: 
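- Figure out a way to filter ads out of the scraped results (e.g. shopping posts such as the `/shopping/` link in the preview above)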