## Dunya 2009–2025: Wayback Machine CDX link index for dunya.com

In [2]:
import requests
import json
from tqdm import tqdm
import time

# Start timer (wall-clock; covers the CDX request, processing, and the JSON write)
start_time = time.time()

# Define the CDX API URL and parameters for Jan 2009 – Jan 2025
# (HTTPS to match the scheme of the Wayback URLs generated below).
cdx_url = "https://web.archive.org/cdx/search/cdx"
params = {
    "url": "dunya.com/*",        # every capture under the dunya.com domain
    "from": "20090101",
    "to": "20250101",
    "output": "json",
    "fl": "timestamp,original",  # only the two fields consumed below
    "collapse": "urlkey"         # one row per unique URL key
}

# (connect, read) timeouts in seconds. The read timeout is deliberately large:
# this query returns over a million rows and a previous run took ~10 minutes.
# Without any timeout a stalled connection would hang the cell forever.
request_timeout = (30, 1800)

# Single place for the output filename so the write and the message agree.
output_path = "dunya_2009_2025_links_with_wayback.json"

# Make the request
print("🔍 Fetching 2009–2025 data from the Wayback Machine for dunya.com...")
try:
    response = requests.get(cdx_url, params=params, timeout=request_timeout)
except requests.RequestException as exc:
    # Connection errors / timeouts: report instead of dumping a raw traceback.
    response = None
    print("❌ Failed to retrieve data:", exc)

if response is not None and response.status_code == 200:
    try:
        data = response.json()
    except ValueError as exc:
        # A truncated or non-JSON body (e.g. an HTML error page served with 200).
        data = None
        print("❌ Failed to retrieve data:", exc)

    if data is not None:
        links = data[1:]  # Skip the header row (field names)

        print(f"📄 Found {len(links)} links. Adding Wayback URLs and saving to JSON...")

        result = []
        for ts, url in tqdm(links, desc="Processing"):
            result.append({
                "timestamp": ts,
                "original_url": url,
                # Regular replay URL (page rendered with the Wayback toolbar/rewrites).
                "wayback_url": f"https://web.archive.org/web/{ts}/{url}",
                # "id_" suffix requests the capture as originally archived.
                "wayback_raw_url": f"https://web.archive.org/web/{ts}id_/{url}"
            })

        # Save to JSON file
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, ensure_ascii=False, indent=2)

        end_time = time.time()
        print(f"\n✅ Saved {len(result)} entries to '{output_path}'")
        print(f"⏱️ Time taken: {end_time - start_time:.2f} seconds")

elif response is not None:
    print("❌ Failed to retrieve data:", response.status_code)

🔍 Fetching 2009–2025 data from the Wayback Machine for dunya.com...
📄 Found 1325706 links. Adding Wayback URLs and saving to JSON...


Processing: 100%|████████████████████████████████████████████████████████| 1325706/1325706 [00:04<00:00, 303341.94it/s]



✅ Saved 1325706 entries to 'dunya_2009_2025_links_with_wayback.json'
⏱️ Time taken: 577.76 seconds
