In [2]:
import os
import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin, urlparse
from datetime import datetime
import re
from rake_nltk import Rake
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /Users/stedn/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# RAKE keyword extractor; reused for every article to derive its tags
rake = Rake()

# Base URL of the site being scraped
BASE_URL = "https://flightfree.org"
# Page carrying the "summary-read-more-link" anchors (currently the site root)
START_PAGE_URL = BASE_URL

# Headers sent with every HTTP request issued by this notebook
HEADERS = {"User-Agent": "Mozilla/5.0"}

# Output directories (created on first run, reused afterwards)
IMAGE_DIR = "images"
POSTS_DIR = "_posts"
os.makedirs(IMAGE_DIR, exist_ok=True)
os.makedirs(POSTS_DIR, exist_ok=True)

# Fetch main listing page.
# The timeout keeps a stalled connection from hanging the notebook, and
# raise_for_status() stops the run on an HTTP error instead of silently parsing
# an error page that would yield zero article links.
response = requests.get(START_PAGE_URL, headers=HEADERS, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Find article links
read_more_links = soup.find_all("a", class_="summary-read-more-link")

# Scrape every linked article: extract title/author/date/body, download one
# image, derive tags with RAKE, write a Jekyll-style markdown post, and collect
# one metadata record per article in `data`.
data = []

REQUEST_TIMEOUT = 30                 # seconds; avoids hanging forever on a stalled connection
MAX_TAGS = 8                         # maximum number of tags written per post
UNSAFE_CHARS = r'[\\/*?:"<>|]'       # characters stripped from titles/descriptions

for link in read_more_links:
    # Anchors without an href would make urljoin() fall back to BASE_URL itself
    relative_url = link.get("href")
    if not relative_url:
        continue
    full_url = urljoin(BASE_URL, relative_url)

    # One unreachable article should not abort the whole run, and an HTTP error
    # page must not be turned into an "Untitled" post
    try:
        article_response = requests.get(full_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        article_response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to fetch article {full_url}: {e}")
        continue
    article_soup = BeautifulSoup(article_response.text, "html.parser")

    # Title
    title_tag = article_soup.find("h1", class_="entry-title")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"
    title = re.sub(UNSAFE_CHARS, "", title)  # Clean title for filenames

    # Author
    author_tag = article_soup.find("span", class_="entry-author")
    author = author_tag.get_text(strip=True) if author_tag else "Unknown"

    # Date (falls back to the current time when missing or unparseable)
    date_tag = article_soup.find("time", class_="dt-published")
    date_str = date_tag["datetime"] if date_tag and "datetime" in date_tag.attrs else datetime.now().isoformat()
    try:
        date_obj = datetime.fromisoformat(date_str)
    except ValueError as e:
        print(f"Unparseable date {date_str!r} for {full_url}, using current time: {e}")
        date_obj = datetime.now()
    date_formatted = date_obj.strftime("%Y-%m-%d %H:%M:%S +0300")  # Adjust timezone as needed
    date_filename = date_obj.strftime("%Y-%m-%d")

    # Content. A space separator is required: with strip=True and the default
    # empty separator, adjacent text nodes are glued together ("endStart").
    content_div = article_soup.find("div", class_="sqs-html-content")
    content = content_div.get_text(" ", strip=True) if content_div else ""

    # Short description: truncated, cleaned, and collapsed onto a single line so
    # it can be embedded safely in the YAML front matter
    description = content[:150] + "..." if len(content) > 150 else content
    description = re.sub(UNSAFE_CHARS, "", description)  # Clean description
    description = " ".join(description.split())

    # Image: the second <img> on the page is used
    # (presumably the first one is site chrome such as a logo — TODO confirm).
    # Pages with fewer than two images simply get no image instead of raising.
    img_tags = article_soup.find_all("img")
    img_tag = img_tags[1] if len(img_tags) > 1 else None
    img_src = img_tag.get("src", "") if img_tag is not None else ""
    # Resolve relative / protocol-relative sources against the article URL
    img_url = urljoin(full_url, img_src) if img_src else ""
    img_filename = ""

    # Create slug from title (fallback keeps filenames valid for symbol-only titles)
    slug = re.sub(r'\W+', '-', title.lower()).strip('-') or "untitled"

    if img_url:
        # Get the image file extension
        img_ext = os.path.splitext(urlparse(img_url).path)[1]  # e.g., ".jpg", ".jpeg", ".png"

        # Use the slug to name the image file
        img_name = f"{slug}{img_ext}"
        img_path = os.path.join(IMAGE_DIR, img_name)

        try:
            img_response = requests.get(img_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            # Without this check an HTML error page would be saved as the image
            img_response.raise_for_status()
            with open(img_path, "wb") as f:
                f.write(img_response.content)
            img_filename = f"/images/{img_name}"
        except (requests.RequestException, OSError) as e:
            print(f"Failed to download image {img_url}: {e}")

    # Keywords: keep ranked phrases of at most two words
    rake.extract_keywords_from_text(content)
    keywords = rake.get_ranked_phrases_with_scores()
    short_keywords = [kw for score, kw in keywords if len(kw.strip().split()) < 3]

    # Turn phrases into slug-style tags, dropping empty and repeated ones while
    # preserving rank order, then keep the top MAX_TAGS
    tag_slugs = (re.sub(r'\W+', '-', kw.lower()).strip('-') for kw in short_keywords)
    tags = list(dict.fromkeys(t for t in tag_slugs if t))[:MAX_TAGS]

    # Prepare markdown content. Title and description are double-quoted so YAML
    # treats them as plain strings; this is safe because UNSAFE_CHARS already
    # removed '"' and '\' and internal whitespace is collapsed to single spaces.
    front_title = " ".join(title.split())
    markdown_content = f"""---
layout: post
title:  "{front_title}"
description: "{description}"
date:   {date_formatted}
image:  '{img_filename}'
tags:   {tags}
---
{content}

"""

    # Save markdown file
    md_filename = f"{date_filename}-{slug}.md"
    md_path = os.path.join(POSTS_DIR, md_filename)

    with open(md_path, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    # Store metadata for the summary DataFrame
    data.append({
        "Title": title,
        "Author": author,
        "Date": date_formatted,
        "Content": content,
        "Image Path": img_filename,
        "Article URL": full_url,
        "Markdown File": md_filename
    })

# Gather every scraped article record into one table and persist it as CSV
df = pd.DataFrame(data)
csv_path = "articles_data.csv"
df.to_csv(csv_path, index=False)

# Report how many records were collected and where the outputs live
post_count = len(df)
print(f"✅ {post_count} posts written to '{POSTS_DIR}/' and images to '{IMAGE_DIR}/'")



✅ 148 posts written to '_posts/' and images to 'images/'
