In [6]:
import requests
from bs4 import BeautifulSoup
import csv
import os
from waybackpy import WaybackMachineCDXServerAPI
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Site whose Wayback Machine captures are downloaded.
url = "http://cesta.stanford.edu"

# Directory that receives one HTML file per capture.
# exist_ok=True creates the directory when it is missing and does nothing
# when it is already there, avoiding the check-then-create race of
# os.path.exists() followed by os.makedirs().
folder_name = "wayback_html"
os.makedirs(folder_name, exist_ok=True)

# CSV log of (timestamp, archive URL) rows. Whether the file already exists
# is recorded here, before the file is opened in append mode, so the header
# row is only written the first time the file is created.
csv_filename = "scraped_wayback_urls.csv"
file_exists = os.path.isfile(csv_filename)

# Download every Wayback Machine capture of `url`, store each one as
# <folder_name>/<timestamp>.html and append a (timestamp, archive URL) row to
# the CSV log for every capture that was saved successfully.
with open(csv_filename, mode='a', newline='', encoding='utf-8') as csv_file:
    csv_writer = csv.writer(csv_file)
    if not file_exists:
        # Write the header only when this run creates the CSV file.
        csv_writer.writerow(["Date", "URL"])

    # Step 1: Ask the Wayback Machine CDX API for the captures of `url`.
    wayback = WaybackMachineCDXServerAPI(url)
    captures = wayback.snapshots()

    # Retry policy for the page downloads: up to 5 retries with backoff on
    # rate-limit (429) and transient server-error responses.
    retry_strategy = Retry(
        total=5,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        # `allowed_methods` is the current name of this option; the older
        # `method_whitelist` spelling was removed in urllib3 2.0.
        allowed_methods=["HEAD", "GET", "OPTIONS"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # Use the session as a context manager so its pooled connections are
    # closed when the loop finishes or is interrupted.
    with requests.Session() as http:
        http.mount("https://", adapter)
        http.mount("http://", adapter)

        # Step 2: Fetch each capture, save its HTML and log it in the CSV.
        for capture in captures:
            archive_url = capture.archive_url
            date_part = capture.timestamp

            # Only the HTTP calls can raise requests exceptions, so the try
            # body is limited to them; a failed capture is reported and
            # skipped so one bad snapshot does not stop the whole run.
            try:
                response = http.get(archive_url, timeout=10)
                response.raise_for_status()  # HTTPError for 4xx/5xx responses
            except (requests.exceptions.SSLError, requests.exceptions.ConnectionError) as e:
                print(f"SSL or connection error for {date_part}: {e}")
                continue
            except requests.exceptions.Timeout as e:
                print(f"Timeout error for {date_part}: {e}")
                continue
            except requests.exceptions.RequestException as e:
                print(f"Request exception for {date_part}: {e}")
                continue

            # raise_for_status() lets non-error codes other than 200 through;
            # those are reported and not saved.
            if response.status_code != 200:
                print(f"Failed to retrieve the page for {date_part}. Status code: {response.status_code}")
                continue

            # Save the HTML content to a file named after the capture timestamp.
            html_filename = os.path.join(folder_name, f"{date_part}.html")
            with open(html_filename, "w", encoding='utf-8') as html_file:
                html_file.write(response.text)

            # Log the capture only after its HTML file has been written.
            csv_writer.writerow([date_part, archive_url])

            print(f"Successfully saved HTML content and updated CSV file for {date_part}.")

TypeError: Retry.__init__() got an unexpected keyword argument 'method_whitelist'