In [36]:
# Download pages in raw wiki markup format to txt files
import requests
import time
import os
import glob


# Chose an interval of pages, that haven't already been downloaded
# ==== CONFIGURATION ====
START_PAGE = 7001      # Start from this page number
END_PAGE = 8000      # End at this page number (inclusive)
OUTPUT_FOLDER = "GoT_files"


# HUSK AT BRUGE DIT NAVN!!!!!!
# The script will write worker-specific files like fetched_pages_<name>.txt
# WORKER_NAME = "mathias"
# WORKER_NAME = "nikolai"
WORKER_NAME = "sofie" 
# =======================

def get_page_wikitext(page_title):
    """Get raw wiki markup for a page"""
    url = "https://gameofthrones.fandom.com/api.php"
    
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": page_title,
        "rvprop": "content",
        "format": "json",
        "rvslots": "main"
    }
    
    try:
        response = requests.get(url, params=params)
        data = response.json()
        
        page = next(iter(data["query"]["pages"].values()))
        
        # Check if page exists and has content
        if "revisions" in page:
            wikitext = page["revisions"][0]["slots"]["main"]["*"]
            return wikitext
        else:
            return None
    except Exception as e:
        print(f"    Error: {e}")
        return None

def save_wikitext_to_file(page_title, wikitext, output_dir):
    """Save wikitext to a file"""
    # Create directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    # Clean the filename (remove invalid characters)
    filename = page_title.replace("/", "_").replace("\\", "_").replace(":", "_")
    filename = filename.replace("?", "_").replace("*", "_").replace("|", "_")
    filename = filename.replace("<", "_").replace(">", "_").replace('"', "_")
    filepath = os.path.join(output_dir, f"{filename}.txt")
    
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(wikitext)
    
    return filepath

# Get all page titles first
print(f"Fetching page titles from the wiki...")
url = "https://gameofthrones.fandom.com/api.php"
all_page_titles = []
apcontinue = None

while len(all_page_titles) < END_PAGE:
    params = {
        "action": "query",
        "list": "allpages",
        "aplimit": "500",
        "format": "json"
    }
    
    if apcontinue:
        params["apcontinue"] = apcontinue
    
    response = requests.get(url, params=params)
    data = response.json()
    
    pages = [page['title'] for page in data['query']['allpages']]
    all_page_titles.extend(pages)
    
    print(f"  Fetched {len(all_page_titles)} page titles...")
    
    if 'continue' in data and 'apcontinue' in data['continue']:
        apcontinue = data['continue']['apcontinue']
    else:
        break

# Select the page range
pages_to_download = all_page_titles[START_PAGE-1:END_PAGE]
total_pages = len(pages_to_download)

print(f"Downloading pages {START_PAGE} to {END_PAGE} ({total_pages} pages)")


successful = 0
failed = 0
redirects = 0
failed_pages = []

# Prepare fetched pages tracking file inside the output folder
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# Determine worker-specific filenames
if not WORKER_NAME:
    # Try environment fallbacks if WORKER_NAME not set
    WORKER_NAME = os.getenv('USER') or os.getenv('USERNAME') or 'worker'
safe_worker = WORKER_NAME.replace(' ', '_')
worker_fetched_file = os.path.join(OUTPUT_FOLDER, f"fetched_pages_{safe_worker}.txt")
worker_redirects_file = os.path.join(OUTPUT_FOLDER, f"redirects_{safe_worker}.txt")
worker_failed_file = os.path.join(OUTPUT_FOLDER, f"failed_pages_{safe_worker}.txt")
# Build processed set by reading all workers' fetched files so we avoid re-downloading what others already fetched
processed = set()
for path in glob.glob(os.path.join(OUTPUT_FOLDER, 'fetched_pages_*.txt')):
    try:
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    processed.add(int(line))
                except ValueError:
                    pass
    except Exception:
        # ignore unreadable files
        pass

for i, page_title in enumerate(pages_to_download, 1):
    actual_page_num = START_PAGE + i - 1

    # Skip pages that were already processed (downloaded or redirected)
    if actual_page_num in processed:
        print(f"[{actual_page_num}/{END_PAGE}] Skipping (already fetched)")
        continue

    print(f"[{actual_page_num}/{END_PAGE}] Downloading: {page_title}...\n", end=" ")
    
    wikitext = get_page_wikitext(page_title)
    
    if wikitext:
        # Check if this is a redirect page (case-insensitive)
        if wikitext.strip().upper().startswith("#REDIRECT"):
            redirects += 1
            # Log the redirect to a worker-specific file inside the output folder
            with open(worker_redirects_file, "a", encoding="utf-8") as f:
                f.write(f"{page_title} -> {wikitext.strip()}\n")
            # Record the interval number for this redirected page to this worker's fetched file
            with open(worker_fetched_file, "a", encoding="utf-8") as fnum:
                fnum.write(f"{actual_page_num}\n")
            processed.add(actual_page_num)
        else:
            filepath = save_wikitext_to_file(page_title, wikitext, OUTPUT_FOLDER)
            successful += 1
            # Record the interval number for the successfully downloaded page to this worker's file
            with open(worker_fetched_file, "a", encoding="utf-8") as fnum:
                fnum.write(f"{actual_page_num}\n")
            processed.add(actual_page_num)
    else:
        failed += 1
        failed_pages.append(page_title)
        with open(worker_failed_file, "a", encoding="utf-8") as ffail:
            ffail.write(f"{page_title}\n")
        print("✗ Failed")
    
    # Be polite to the server
    time.sleep(0.2)

print(f"\n{'='*60}")
print("DOWNLOAD COMPLETE!")
print(f"{'='*60}")
print(f"Successfully saved: {successful} pages")
print(f"Redirects (skipped): {redirects} pages")
print(f"Failed: {failed} pages")
if failed_pages:
    print(f"\nFailed pages:")
    for page in failed_pages[:10]:
        print(f"  - {page}")
    if len(failed_pages) > 10:
        print(f"  ... and {len(failed_pages) - 10} more")
if redirects > 0:
    print(f"\nRedirects logged to: {worker_redirects_file}")
print(f"\nFiles saved in: {os.path.abspath(OUTPUT_FOLDER)}")

Fetching page titles from the wiki...
  Fetched 500 page titles...
  Fetched 1000 page titles...
  Fetched 1500 page titles...
  Fetched 2000 page titles...
  Fetched 2500 page titles...
  Fetched 3000 page titles...
  Fetched 3500 page titles...
  Fetched 4000 page titles...
  Fetched 4500 page titles...
  Fetched 5000 page titles...
  Fetched 5500 page titles...
  Fetched 6000 page titles...
  Fetched 6500 page titles...
  Fetched 7000 page titles...
  Fetched 7500 page titles...
  Fetched 7640 page titles...
Downloading pages 7001 to 8000 (640 pages)
[7001/8000] Downloading: Tom Cox...
 [7002/8000] Downloading: Tom Crowley Ellis...
 [7003/8000] Downloading: Tom Cullen...
 [7004/8000] Downloading: Tom Dunn...
 [7005/8000] Downloading: Tom Fava...
 [7006/8000] Downloading: Tom Glynn-Carney...
 [7007/8000] Downloading: Tom Hopper...
 [7008/8000] Downloading: Tom Lorcan...
 [7009/8000] Downloading: Tom Taylor...
 [7010/8000] Downloading: Tom Varey...
 [7011/8000] Downloading: Tom Vaugha