In [None]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [None]:
def fetch_article_links(url, timeout=10):
    """Collect article titles and links from a single listing page.

    Args:
        url: Address of the listing page to download. Relative hrefs found
            on the page are resolved against this address.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.

    Returns:
        dict mapping article title -> absolute article URL. The title is the
        anchor's ``aria-label`` with every "post link to " occurrence removed
        and surrounding whitespace stripped ("No Title" when the attribute is
        missing). Entries that share a title overwrite one another. An empty
        dict is returned when the response status is not 200.

    Note:
        Exceptions raised by ``requests.get`` itself (connection errors,
        timeouts) are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch the root page: {url}")
        return {}

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = {}

    # Each post on the listing page is an <article class="post-entry">.
    for post in soup.find_all('article', class_='post-entry'):
        link_tag = post.find('a', class_='entry-link')
        if link_tag is None:
            continue

        href = link_tag.get('href')
        if not href:
            # An anchor without a target cannot be followed; skip it instead
            # of raising KeyError.
            continue

        title = link_tag.get('aria-label', 'No Title').replace("post link to ", "").strip()
        # urljoin leaves absolute links untouched and resolves relative ones
        # against the page they were found on.
        articles[title] = urljoin(url, href)

    return articles

In [None]:
# Landing page of the blog; every crawl in this notebook starts here.
root_url = "https://lilianweng.github.io"

# First pass: only the posts listed on the landing page itself.
article_links = fetch_article_links(url=root_url)

In [None]:
def fetch_all_article_links(start_url, timeout=10):
    """Collect article titles and links by following "next" pagination links.

    Starting at ``start_url``, each listing page is downloaded, its posts are
    recorded, and the ``<a class="next">`` link inside
    ``<footer class="page-footer">`` is followed until no such link exists.

    Args:
        start_url: Address of the first listing page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument for every page request.

    Returns:
        dict mapping article title -> absolute article URL, accumulated over
        all visited pages. The title is the anchor's ``aria-label`` with every
        "post link to " occurrence removed and whitespace stripped ("No Title"
        when the attribute is missing); entries sharing a title overwrite one
        another. If a page answers with a status other than 200, crawling
        stops and the links gathered so far are returned.

    Note:
        Exceptions raised by ``requests.get`` itself (connection errors,
        timeouts) are not caught here and propagate to the caller.
    """
    articles = {}
    visited = set()
    current_url = start_url

    # The visited set stops the crawl if a "next" link ever points back to a
    # page that was already processed, which would otherwise loop forever.
    while current_url and current_url not in visited:
        visited.add(current_url)
        print(f"Fetching page: {current_url}")
        response = requests.get(current_url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to fetch the page: {current_url}")
            break

        soup = BeautifulSoup(response.text, 'html.parser')

        # Record every post found on the current page.
        for post in soup.find_all('article', class_='post-entry'):
            link_tag = post.find('a', class_='entry-link')
            if link_tag is None:
                continue
            href = link_tag.get('href')
            if not href:
                # Anchor without a target; nothing to record.
                continue
            title = link_tag.get('aria-label', 'No Title').replace("post link to ", "").strip()
            # Resolve relative links against the page they appeared on.
            articles[title] = urljoin(current_url, href)

        # soup.find returns None when the footer is absent, so look it up
        # separately instead of chaining .find() on the result.
        footer = soup.find('footer', class_='page-footer')
        next_tag = footer.find('a', class_='next') if footer is not None else None
        next_href = next_tag.get('href') if next_tag is not None else None

        # No footer, no "next" anchor, or no href on it: this was the last page.
        current_url = urljoin(current_url, next_href) if next_href else None

    return articles

In [None]:
# Second pass: walk the pagination from the landing page; this replaces the
# single-page result held in article_links.
article_links = fetch_all_article_links(start_url=root_url)

In [None]:
import json

# Echo every discovered article so the crawl result can be checked by eye.
print("Article Links:")
for entry_title, entry_url in article_links.items():
    print(f"Title: {entry_title}, Link: {entry_url}")

# Persist the title -> URL mapping for later cells.
links_json = json.dumps(article_links, indent=4)
with open('article_links.json', 'w') as links_file:
    links_file.write(links_json)

In [None]:
def fetch_article_content(url, timeout=10):
    """Download one article page and return its body text.

    Args:
        url: Address of the article page.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the notebook forever.

    Returns:
        The stripped text of the page's ``<div class="post-content">``;
        the string "Content not found" when the page has no such div;
        ``None`` when the response status is not 200. Callers must handle
        the ``None`` case before slicing or otherwise using the result as a
        string.

    Note:
        Exceptions raised by ``requests.get`` itself (connection errors,
        timeouts) are not caught here and propagate to the caller.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        print(f"Failed to fetch article: {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    content_div = soup.find('div', class_='post-content')
    if content_div is None:
        return "Content not found"
    return content_div.text.strip()


In [None]:
# Download the body of every discovered article, keyed by title.
articles = {}
for title, link in article_links.items():
    print(f"\nFetching content for: {title}")
    content = fetch_article_content(link)
    articles[title] = content
    if content is None:
        # fetch_article_content returns None (and prints its own message) on
        # a non-200 response; slicing None would raise TypeError and abort
        # the remaining downloads, so skip the preview for this article.
        continue
    print(f"Content Preview:\n{content[:500]}...")  # Show first 500 characters

In [None]:
import json

# Persist the title -> article text mapping.
articles_json = json.dumps(articles, indent=4)
with open('articles.json', 'w') as articles_file:
    articles_file.write(articles_json)

In [None]:
# Read the saved link mapping back from disk and show it.
with open('article_links.json', 'r') as links_file:
    raw_links = links_file.read()
article_li = json.loads(raw_links)
print(article_li)