In [3]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
from datetime import datetime
import re
from rake_nltk import Rake
import nltk
# rake_nltk relies on two NLTK resources: the punkt sentence tokenizer and the
# stopword list that Rake() loads by default. Fetch both so the notebook runs
# on a fresh environment instead of failing with LookupError at Rake().
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /Users/stedn/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Scrape every article linked from the listing page, download one image per
# article, derive tags with RAKE, write one Jekyll-style markdown post per
# article into POSTS_DIR, and save a metadata summary to articles_data.csv.
rake = Rake()

# Base URL of the site
BASE_URL = "https://flightfree.org"  # Replace with the actual site
START_PAGE_URL = f"{BASE_URL}"  # Page with the summary-read-more-link links

HEADERS = {"User-Agent": "Mozilla/5.0"}
REQUEST_TIMEOUT = 30       # seconds; without a timeout a stalled connection blocks the cell indefinitely
TIMEZONE_OFFSET = "+0300"  # Written verbatim into the front matter; adjust timezone as needed
DESCRIPTION_LENGTH = 150   # Max characters of body text used for the front-matter description
MAX_TAGS = 8               # Max number of keyword tags per post

# Directories
IMAGE_DIR = "images"
POSTS_DIR = "_posts"
os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(POSTS_DIR, exist_ok=True)


def _slugify(text):
    """Lower-case `text` and replace each run of non-word characters with a single '-'."""
    return re.sub(r'\W+', '-', text.lower()).strip('-')


def _yaml_quote(text):
    """Return `text` as a double-quoted, single-line YAML scalar.

    Unquoted values containing ':', '#', quotes or newlines would corrupt the
    front matter, so whitespace is collapsed and backslashes/quotes are escaped.
    """
    collapsed = re.sub(r'\s+', ' ', text).strip()
    escaped = collapsed.replace('\\', '\\\\').replace('"', '\\"')
    return f'"{escaped}"'


# Fetch main listing page; fail loudly here because nothing else can proceed without it
response = requests.get(START_PAGE_URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find article links
read_more_links = soup.find_all("a", class_="summary-read-more-link")

data = []

for link in read_more_links:
    relative_url = link.get("href")
    if not relative_url:
        # An anchor without href would resolve to BASE_URL itself; skip it
        continue
    full_url = urljoin(BASE_URL, relative_url)

    # One unreachable article should not abort the whole run
    try:
        article_response = requests.get(full_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        article_response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch article {full_url}: {e}")
        continue
    article_soup = BeautifulSoup(article_response.text, "html.parser")

    # Title
    title_tag = article_soup.find("h1", class_="entry-title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    # Author
    author_tag = article_soup.find("span", class_="entry-author")
    author = author_tag.get_text(strip=True) if author_tag else "Unknown"

    # Date: fall back to "now" when the tag/attribute is missing or unparsable
    date_tag = article_soup.find("time", class_="dt-published")
    date_str = date_tag["datetime"] if date_tag and "datetime" in date_tag.attrs else datetime.now().isoformat()
    try:
        date_obj = datetime.fromisoformat(date_str)
    except ValueError:
        print(f"Unparsable date {date_str!r} for {full_url}; using current time")
        date_obj = datetime.now()
    date_formatted = date_obj.strftime(f"%Y-%m-%d %H:%M:%S {TIMEZONE_OFFSET}")
    date_filename = date_obj.strftime("%Y-%m-%d")

    # Content: join text nodes with a space so words on either side of a tag
    # boundary (e.g. consecutive paragraphs) are not fused together.
    content_div = article_soup.find("div", class_="sqs-html-content")
    content = content_div.get_text(" ", strip=True) if content_div else ""

    # Image: the second <img> on the page is used; pages with fewer than two
    # images simply get no image instead of raising IndexError.
    all_images = article_soup.find_all("img")
    img_tag = all_images[1] if len(all_images) > 1 else None
    img_url = img_tag["src"] if img_tag is not None and "src" in img_tag.attrs else ""
    img_filename = ""

    if img_url:
        # Resolve relative and protocol-relative sources against the article URL
        img_url = urljoin(full_url, img_url)
        img_name = os.path.basename(urlparse(img_url).path)
        if img_name:
            # Download image (best effort: a failure leaves the post without an image)
            img_path = os.path.join(IMAGE_DIR, img_name)
            try:
                img_response = requests.get(img_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                img_response.raise_for_status()  # do not save an HTML error page as an image
                img_data = img_response.content
                with open(img_path, "wb") as f:
                    f.write(img_data)
                img_filename = f"/images/{img_name}"
            except (requests.RequestException, OSError) as e:
                print(f"Failed to download image {img_url}: {e}")

    # Create slug from title; never produce an empty slug in the filename
    slug = _slugify(title) or "untitled"

    rake.extract_keywords_from_text(content)
    keywords = rake.get_ranked_phrases_with_scores()
    # Keep only short phrases (one or two words), in RAKE's ranked order
    single_word_keywords = [kw for score, kw in keywords if len(kw.strip().split()) < 3]

    # Top MAX_TAGS distinct keywords (RAKE can return the same phrase more than once)
    top_keywords = list(dict.fromkeys(single_word_keywords))[:MAX_TAGS]
    # Slug-style tags, dropping any that end up empty or duplicated after slugifying
    tags = list(dict.fromkeys(tag for tag in map(_slugify, top_keywords) if tag))

    # Only add an ellipsis when the body was actually truncated
    description = content[:DESCRIPTION_LENGTH]
    if len(content) > DESCRIPTION_LENGTH:
        description += "..."

    # Prepare markdown content
    front_matter = [
        "---",
        "layout: post",
        f"title:  {_yaml_quote(title)}",
        f"description: {_yaml_quote(description)}",
        f"date:   {date_formatted}",
        f"image:  '{img_filename}'",
        f"tags:   {tags}",
        "---",
    ]
    markdown_content = "\n".join(front_matter) + f"\n{content}\n\n"

    # Save markdown file
    md_filename = f"{date_filename}-{slug}.md"
    md_path = os.path.join(POSTS_DIR, md_filename)

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    # Store metadata in DataFrame
    data.append({
        "Title": title,
        "Author": author,
        "Date": date_formatted,
        "Content": content,
        "Image Path": img_filename,
        "Article URL": full_url,
        "Markdown File": md_filename
    })

# Create and save DataFrame
df = pd.DataFrame(data)
df.to_csv("articles_data.csv", index=False)

print(f"✅ {len(df)} posts written to '{POSTS_DIR}/' and images to '{IMAGE_DIR}/'")



KeyboardInterrupt: 

In [6]:
# Inspect the RAKE (score, phrase) pairs produced for the article most recently
# processed by the scraping loop; `keywords` is reassigned on every iteration.
keywords

[(15.5, 'mountains without flying 10'),
 (14.333333333333334, 'positive carbon emission feedbacks'),
 (9.5, 'crowded mountains overseas'),
 (9.0, 'wind river range'),
 (9.0, 'north america offer'),
 (9.0, 'million metric tons'),
 (9.0, 'foreign mountain ranges'),
 (4.333333333333334, 'carbon footprint'),
 (4.0, 'wrong direction'),
 (4.0, 'sierra nevada'),
 (4.0, 'outdoor enthusiast'),
 (4.0, 'one example'),
 (4.0, 'melting permafrost'),
 (4.0, 'highly worried'),
 (4.0, 'extremely happy'),
 (4.0, 'energy intensive'),
 (4.0, 'electric vehicle'),
 (4.0, 'canadian rockies'),
 (4.0, '20 hours'),
 (2.3333333333333335, 'carbon'),
 (1.0, 'venture'),
 (1.0, 'sending'),
 (1.0, 'scenery'),
 (1.0, 'reason'),
 (1.0, 'reached'),
 (1.0, 'planet'),
 (1.0, 'planet'),
 (1.0, 'patagonia'),
 (1.0, 'much'),
 (1.0, 'moreover'),
 (1.0, 'least'),
 (1.0, 'future'),
 (1.0, 'furthermore'),
 (1.0, 'fly'),
 (1.0, 'fly'),
 (1.0, 'find'),
 (1.0, 'facing'),
 (1.0, 'earth'),
 (1.0, 'driving'),
 (1.0, 'destinations'),
