In [18]:
import requests
import json
import pandas as pd
import time
from pathlib import Path
from bs4 import BeautifulSoup
import logging
from urllib.parse import urljoin, urlparse
import re

In [19]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [20]:
def clean_text(text):
    """Clean and normalize scraped text."""
    if not text:
        return ""
    # Remove extra whitespace and normalize
    text = re.sub(r'\s+', ' ', text.strip())
    return text

In [21]:
def scrape_text_with_links(url, retries=3, delay=1):
    """
    Scrape text content and links from a URL with improved error handling.
    
    Args:
        url (str): URL to scrape
        retries (int): Number of retry attempts
        delay (float): Delay between retries
    
    Returns:
        dict: Contains 'text', 'links', 'title', and 'status'
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    
    for attempt in range(retries):
        try:
            logger.info(f"Scraping {url} (attempt {attempt + 1})")
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            
            soup = BeautifulSoup(response.text, "html.parser")
            
            # Extract title
            title_tag = soup.find("title")
            title = clean_text(title_tag.get_text()) if title_tag else "No title"
            
            # Try multiple selectors for content
            content_selectors = [
                "div.post-body",
                "article",
                "main",
                "div.content",
                "div.entry-content",
                ".post-content",
                "#content"
            ]
            
            post_div = None
            for selector in content_selectors:
                post_div = soup.select_one(selector)
                if post_div:
                    break
            
            if not post_div:
                # Fallback: use body but exclude nav, header, footer
                post_div = soup.find("body")
                if post_div:
                    # Remove navigation, headers, footers, sidebars
                    for tag in post_div.find_all(["nav", "header", "footer", "aside", "script", "style"]):
                        tag.decompose()
            
            if not post_div:
                return {
                    "text": "",
                    "links": [],
                    "title": title,
                    "status": "error",
                    "error": "No content found"
                }
            
            # Extract text and links
            text_content = []
            links = []
            
            # Get all text nodes and links
            for elem in post_div.descendants:
                if elem.name == "a" and elem.get("href"):
                    link_text = clean_text(elem.get_text())
                    if link_text:  # Only include links with text
                        href = elem["href"]
                        # Convert relative URLs to absolute
                        if href.startswith(('http://', 'https://')):
                            full_url = href
                        else:
                            full_url = urljoin(url, href)
                        
                        links.append({
                            "text": link_text,
                            "url": full_url
                        })
                elif hasattr(elem, 'string') and elem.string and elem.string.strip():
                    text_content.append(elem.string.strip())
            
            # Combine text content
            full_text = clean_text(" ".join(text_content))
            
            return {
                "text": full_text,
                "links": links,
                "title": title,
                "status": "success",
                "word_count": len(full_text.split()) if full_text else 0
            }
            
        except requests.exceptions.Timeout:
            logger.warning(f"Timeout for {url} on attempt {attempt + 1}")
        except requests.exceptions.RequestException as e:
            logger.warning(f"Request error for {url} on attempt {attempt + 1}: {e}")
        except Exception as e:
            logger.error(f"Unexpected error for {url} on attempt {attempt + 1}: {e}")
        
        if attempt < retries - 1:
            time.sleep(delay * (attempt + 1))  # Exponential backoff
    
    return {
        "text": "",
        "links": [],
        "title": "Failed to scrape",
        "status": "failed",
        "error": f"Failed after {retries} attempts"
    }

In [22]:
def save_individual_file(content, url, output_folder, file_format="txt"):
    """Save scraped content to individual file."""
    try:
        # Create safe filename from URL
        parsed_url = urlparse(url)
        filename = f"{parsed_url.netloc}_{parsed_url.path}".replace("/", "_").replace("\\", "_")
        filename = re.sub(r'[<>:"|?*]', '_', filename)  # Remove invalid characters
        filename = filename[:100]  # Limit length
        
        if file_format == "txt":
            filepath = output_folder / f"{filename}.txt"
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(f"URL: {url}\n")
                f.write(f"Title: {content['title']}\n")
                f.write(f"Word Count: {content.get('word_count', 0)}\n")
                f.write("-" * 50 + "\n\n")
                f.write(content['text'])
                
                if content['links']:
                    f.write("\n\n" + "="*50 + "\n")
                    f.write("LINKS FOUND:\n")
                    f.write("="*50 + "\n\n")
                    for link in content['links']:
                        f.write(f"Text: {link['text']}\n")
                        f.write(f"URL: {link['url']}\n\n")
        
        elif file_format == "json":
            filepath = output_folder / f"{filename}.json"
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump({
                    "source_url": url,
                    "scraped_content": content,
                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
                }, f, indent=2, ensure_ascii=False)
        
        return filepath
        
    except Exception as e:
        logger.error(f"Error saving file for {url}: {e}")
        return None

In [23]:
def process_urls_from_json(json_base_folder, csv_base_folder, save_individual_files=True, 
                          individual_files_folder=None, file_format="txt", batch_size=10):
    """
    Enhanced processing function with individual file saving option.
    
    Args:
        json_base_folder (Path): Folder containing JSON files organized by year
        csv_base_folder (Path): Output folder for CSV files
        save_individual_files (bool): Whether to save individual text files
        individual_files_folder (Path): Folder for individual files (optional)
        file_format (str): 'txt' or 'json' for individual files
        batch_size (int): Number of URLs to process before saving progress
    """
    # Convert to Path objects
    json_base_folder = Path(json_base_folder)
    csv_base_folder = Path(csv_base_folder)
    
    if save_individual_files:
        if individual_files_folder is None:
            individual_files_folder = csv_base_folder / "scraped_content"
        individual_files_folder = Path(individual_files_folder)
        individual_files_folder.mkdir(parents=True, exist_ok=True)
    
    total_processed = 0
    total_successful = 0
    
    # Process each year folder
    for year_folder in sorted(json_base_folder.iterdir()):
        if not year_folder.is_dir():
            continue
            
        year = year_folder.name
        csv_year_folder = csv_base_folder / year
        csv_year_folder.mkdir(parents=True, exist_ok=True)
        
        logger.info(f"Processing year: {year}")
        
        # Process each month file
        for json_file in sorted(year_folder.glob("*.json")):
            month = json_file.stem
            logger.info(f"Processing {year}/{month}")
            
            try:
                with open(json_file, "r", encoding="utf-8") as f:
                    data = json.load(f)
                
                if not data:
                    logger.info(f"No data in {json_file}")
                    continue
                
                # Convert to DataFrame
                df = pd.json_normalize(data)
                
                if "pageUrl" not in df.columns:
                    logger.error(f"No 'pageUrl' column in {json_file}")
                    continue
                
                # Initialize new columns
                df["scraped_text"] = ""
                df["scraped_links"] = ""
                df["scrape_status"] = ""
                df["word_count"] = 0
                df["scrape_error"] = ""
                
                # Process URLs in batches
                for i, row in df.iterrows():
                    url = row["pageUrl"]
                    content = scrape_text_with_links(url)
                    
                    # Update DataFrame
                    df.at[i, "scraped_text"] = content["text"]
                    df.at[i, "scraped_links"] = json.dumps(content["links"]) if content["links"] else ""
                    df.at[i, "scrape_status"] = content["status"]
                    df.at[i, "word_count"] = content.get("word_count", 0)
                    df.at[i, "scrape_error"] = content.get("error", "")
                    
                    total_processed += 1
                    if content["status"] == "success":
                        total_successful += 1
                    
                    # Save individual file if requested
                    if save_individual_files and content["status"] == "success":
                        year_individual_folder = individual_files_folder / year
                        year_individual_folder.mkdir(parents=True, exist_ok=True)
                        saved_file = save_individual_file(content, url, year_individual_folder, file_format)
                        if saved_file:
                            logger.info(f"Saved individual file: {saved_file}")
                    
                    # Progress update
                    if (i + 1) % batch_size == 0:
                        logger.info(f"Processed {i + 1}/{len(df)} URLs from {month}")
                        # Save progress
                        csv_file = csv_year_folder / f"{month}.csv"
                        df.to_csv(csv_file, index=False, encoding="utf-8")
                    
                    # Rate limiting
                    time.sleep(1.5)  # Be respectful to servers
                
                # Final save for this month
                csv_file = csv_year_folder / f"{month}.csv"
                df.to_csv(csv_file, index=False, encoding="utf-8")
                
                success_rate = sum(df["scrape_status"] == "success") / len(df) * 100
                logger.info(f"✅ Saved {csv_file} with {len(df)} posts ({success_rate:.1f}% success rate)")
                
            except Exception as e:
                logger.error(f"Error processing {json_file}: {e}")
                continue
    
    logger.info(f"COMPLETE: Processed {total_processed} URLs, {total_successful} successful ({total_successful/total_processed*100:.1f}%)")


In [24]:
# Configure your paths
JSON_BASE_FOLDER = Path("lw_json")  # Folder with year subfolders
CSV_BASE_FOLDER = Path("lw_csv")       # Output folder for CSVs
    
# Create output directory
CSV_BASE_FOLDER.mkdir(parents=True, exist_ok=True)
    
# Run the scraping process
process_urls_from_json(
    json_base_folder=JSON_BASE_FOLDER,
    csv_base_folder=CSV_BASE_FOLDER,
    save_individual_files=True,  # Set to False if you don't want individual files
    file_format="txt",           # or "json"
    batch_size=10               # Save progress every N URLs
    )

2025-08-30 09:18:28,259 - INFO - Processing year: 2016
2025-08-30 09:18:28,261 - INFO - Processing 2016/2016-01
2025-08-30 09:18:28,283 - INFO - Scraping https://www.lesswrong.com/posts/HpsExWAYKHA6xNy76/ai-safety-in-the-age-of-neural-networks-and-stanislaw-lem (attempt 1)
2025-08-30 09:18:28,893 - INFO - Saved individual file: lw_csv/scraped_content/2016/www.lesswrong.com__posts_HpsExWAYKHA6xNy76_ai-safety-in-the-age-of-neural-networks-and-stanislaw-lem.txt
2025-08-30 09:18:30,399 - INFO - Scraping https://www.lesswrong.com/posts/i6LDrasYK2y6GgQuW/meetup-lw-melb-rationality-dojo-including-critical-thoughts (attempt 1)
2025-08-30 09:18:30,654 - INFO - Saved individual file: lw_csv/scraped_content/2016/www.lesswrong.com__posts_i6LDrasYK2y6GgQuW_meetup-lw-melb-rationality-dojo-including-critical-though.txt
2025-08-30 09:18:32,159 - INFO - Scraping https://www.lesswrong.com/posts/2ZxBjuv88cgmSjjbc/identifying-bias-a-bayesian-analysis-of-suspicious-agreement (attempt 1)
2025-08-30 09:18:33

KeyboardInterrupt: 