In [3]:
# --- Imports ---
import os, re, textwrap, datetime as dt
import pandas as pd
import requests
from bs4 import BeautifulSoup

# --- Config ---
URL = "https://en.wikipedia.org/wiki/Key_events_of_the_20th_century"
RAW_DIR = os.path.join("..", "data", "raw")
os.makedirs(RAW_DIR, exist_ok=True)

# --- Check robots.txt quickly ---
robots = requests.get("https://en.wikipedia.org/robots.txt", headers={"User-Agent": "Mozilla/5.0"})
print("robots.txt status:", robots.status_code)

# --- Request page with a user-agent to reduce 403 risk ---
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get(URL, headers=headers, timeout=30)
resp.raise_for_status()

# --- Parse the article body only (avoid nav/menus) ---
soup = BeautifulSoup(resp.text, "html.parser")
article = soup.select_one("div.mw-parser-output")

# Fallback if structure changes
if article is None:
    article = soup

# --- Extract structured text: headings, paragraphs, lists ---
parts = []

# Headings (for basic structure)
for tag in article.select("h2, h3, h4"):
    # Wikipedia headings have [edit] spans; strip them
    heading = tag.get_text(" ", strip=True)
    heading = re.sub(r"\[.*?edit.*?\]", "", heading, flags=re.I)
    parts.append(f"\n\n## {heading}\n")

# Paragraphs and list items
for p in article.select("p"):
    txt = p.get_text(" ", strip=True)
    if txt:
        parts.append(txt)

for li in article.select("ul li"):
    txt = li.get_text(" ", strip=True)
    if txt:
        parts.append("- " + txt)

# --- Join & light clean ---
text = "\n".join(parts)
# collapse excessive whitespace
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text).strip()

# --- Add provenance & timestamp (for later attribution) ---
stamp = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC")
header = textwrap.dedent(f"""
    SOURCE: {URL}
    DOWNLOADED: {stamp}
    LICENSE: Wikipedia content is under CC BY-SA 4.0.
""").strip()

final_text = header + "\n\n" + text

# --- Save to TXT ---
out_path = os.path.join(RAW_DIR, "key_events_20th_century_raw.txt")
with open(out_path, "w", encoding="utf-8") as f:
    f.write(final_text)

out_path

robots.txt status: 200


  stamp = dt.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S UTC")


'..\\data\\raw\\key_events_20th_century_raw.txt'

In [4]:
with open(out_path, "r", encoding="utf-8") as f:
    preview = "".join([next(f) for _ in range(40)])
print(preview)

SOURCE: https://en.wikipedia.org/wiki/Key_events_of_the_20th_century
DOWNLOADED: 2025-08-18 10:45:33 UTC
LICENSE: Wikipedia content is under CC BY-SA 4.0.

## Historic events in the 20th century

## World at the beginning of the century

## "The war to end all wars": World War I (1914–1918)

## Spanish flu

## Russian Revolution and communism

## Between the wars

## Economic depression

## The rise of dictatorship

## Global war: World War II (1939–1945)

## The war in Europe

## Blitzkrieg

## Operation Barbarossa

## Turning tides

## Operation Overlord

## Final days

## The war in the Pacific

## Japanese Expansion

## Allied offensive




In [7]:
import os

# Look for the file we saved
for root, dirs, files in os.walk(".", topdown=True):
    for name in files:
        if "20th_century" in name:
            print(os.path.join(root, name))

.\20th_century_scrape.ipynb
.\key_events_20th_century_raw.txt
.\.ipynb_checkpoints\20th_century_scrape-checkpoint.ipynb
.\.virtual_documents\20th_century_scrape.ipynb
