In [15]:
from itertools import count
from bs4 import BeautifulSoup
import requests

# Function to extract various information from a webpage
def extract_website_info(url, timeout=10):
    """Download a webpage and collect its title, text, links, images and tables.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block forever on a stalled connection.

    Returns:
        A dict with the keys ``'metadata'``, ``'text_content'``, ``'links'``,
        ``'images'`` and ``'tables'``. ``None`` is returned (after printing a
        message) when the response status is not 200 or when any step raises.
    """
    try:
        response = requests.get(url, timeout=timeout)

        # Anything other than a plain 200 is reported and treated as a failure.
        if response.status_code != 200:
            print("Error: Unable to fetch the webpage. Status code:", response.status_code)
            return None

        soup = BeautifulSoup(response.content, 'html.parser')

        # One entry per extractor; each helper receives the same parsed tree.
        return {
            'metadata': extract_metadata(soup),
            'text_content': extract_text_content(soup),
            'links': extract_links(soup),
            'images': extract_images(soup),
            'tables': extract_tables(soup),
        }
    except Exception as e:
        # Deliberately best-effort: network and parsing errors (including a
        # timeout) are printed rather than propagated to the caller.
        print("Error:", e)
        return None

# Function to extract metadata
def extract_metadata(soup):
    """Return the page metadata as a dict.

    Only one entry is produced: ``'title'`` holds the text of the first
    ``title`` element, or ``None`` when the lookup finds nothing.
    """
    title_tag = soup.find('title')
    return {'title': title_tag.get_text() if title_tag else None}

# Function to extract text content
def extract_text_content(soup):
    """Return the text of every paragraph, heading (h1-h6) and list item.

    The result is a list of strings in the order ``soup.find_all`` yields the
    matching elements.
    """
    text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']
    return [node.get_text() for node in soup.find_all(text_tags)]

# Function to extract links
def extract_links(soup):
    """Return the ``href`` value of every anchor that carries one.

    Values are returned exactly as found, in the order ``soup.find_all``
    yields the anchors.
    """
    return [anchor['href'] for anchor in soup.find_all('a', href=True)]

# Function to extract images
def extract_images(soup):
    """Return the ``src`` value of every ``img`` element that carries one.

    Values are returned exactly as found, in the order ``soup.find_all``
    yields the images.
    """
    return [picture['src'] for picture in soup.find_all('img', src=True)]

# Function to extract tables
def extract_tables(soup):
    """Return the cell text of every table found by ``soup.find_all('table')``.

    Returns:
        A list with one entry per table. Each table is a list of rows and each
        row is a list of cell strings. Both data cells (``td``) and header
        cells (``th``) are included, so a header row is returned with its
        column names instead of as an empty list.
    """
    tables = []
    for table in soup.find_all('table'):
        rows = []
        for row in table.find_all('tr'):
            # Passing a list of names matches either tag, in the order
            # find_all yields them.
            rows.append([cell.get_text() for cell in row.find_all(['td', 'th'])])
        tables.append(rows)
    return tables

# Example usage
url = 'https://archive.org/web/'
extracted_info = extract_website_info(url)
if extracted_info:
    # Each call prints a heading and, on the following line, the matching
    # section of the result; headings after the first start with a blank line.
    print("Metadata:", extracted_info['metadata'], sep="\n")
    print("\nText Content:", extracted_info['text_content'], sep="\n")
    print("\nLinks:", extracted_info['links'], sep="\n")
    print("\nImages:", extracted_info['images'], sep="\n")
    print("\nTables:", extracted_info['tables'], sep="\n")


Metadata:
{'title': 'Internet Archive: Wayback Machine'}

Text Content:
['We will keep fighting for all libraries - stand with us!', 'Internet Archive Audio', 'Featured', 'All Audio', 'This Just In', 'Grateful Dead', 'Netlabels', 'Old Time Radio', '78 RPMs and Cylinder Recordings', 'Top', 'Audio Books & Poetry', 'Computers, Technology and Science', 'Music, Arts & Culture', 'News & Public Affairs', 'Spirituality & Religion', 'Podcasts', 'Radio News Archive', 'Images', 'Featured', 'All Images', 'This Just In', 'Flickr Commons', 'Occupy Wall Street Flickr', 'Cover Art', 'USGS Maps', 'Top', 'NASA Images', 'Solar System Collection', 'Ames Research Center', 'Software', 'Featured', 'All Software', 'This Just In', 'Old School Emulation', 'MS-DOS Games', 'Historical Software', 'Classic PC Games', 'Software Library', 'Top', 'Kodi Archive and Support File', 'Vintage Software', 'APK', 'MS-DOS', 'CD-ROM Software', 'CD-ROM Software Library', 'Software Sites', 'Tucows Software Library', 'Shareware CD

In [4]:
import requests
from bs4 import BeautifulSoup

# URL of the Wayback Machine website
url = "https://archive.org/web/"

# Send a GET request to the URL. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops the cell on an HTTP error
# instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the navigation elements (every <a> tag on the page)
navigation_elements = soup.find_all("a")

# Extract (text, href) pairs. The href falls back to an empty string when the
# attribute is missing. No loop variable is named `url`, so the page URL
# defined above is not overwritten by the last link on the page.
navigation_data = [
    (element.get_text().strip(), element.get("href", "").strip())
    for element in navigation_elements
]

# Print the extracted navigation data
for link_text, link_url in navigation_data:
    print("Text:", link_text)
    print("URL:", link_url)
    print("----------------------")


Text: stand with us!
URL: https://blog.archive.org/2023/03/25/the-fight-continues/
----------------------
Text: Internet Archive logo
A line drawing of the Internet Archive headquarters building façade.
URL: https://archive.org/
----------------------
Text: Donate icon
An illustration of a heart shape



"Donate to the archive"
URL: https://archive.org/donate/?origin=iawww-mbhrt
----------------------
Text: Upload icon
An illustration of a horizontal line over an up pointing arrow.


Upload
URL: https://archive.org/create
----------------------
Text: User icon
An illustration of a person's head and chest.
URL: 
----------------------
Text: Sign up
URL: https://archive.org/account/signup
----------------------
Text: Log in
URL: https://archive.org/account/login
----------------------
Text: Web icon
An illustration of a computer application window



Wayback Machine
URL: https://web.archive.org
----------------------
Text: Texts icon
An illustration of an open book.



Books
URL: https:/

In [5]:
import requests
from bs4 import BeautifulSoup

# URL of the Wayback Machine website
url = "https://archive.org/web/"

# Send a GET request to the URL. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops the cell on an HTTP error
# instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the navigation elements and input fields
navigation_elements = soup.find_all(["a", "input"])

# Extract the text, URLs, and input field attributes.
# Links are stored as 2-tuples (text, href); inputs as 4-tuples
# ("Input", aria-label, type, name). Missing attributes become "".
navigation_data = []
for element in navigation_elements:
    if element.name == "a":  # If it's a link
        text = element.get_text().strip()
        # Named `href`, not `url`, so the page URL above is not overwritten.
        href = element.get("href", "").strip()
        navigation_data.append((text, href))
    elif element.name == "input":  # If it's an input field
        label = element.get("aria-label", "").strip()
        input_type = element.get("type", "").strip()
        name = element.get("name", "").strip()
        navigation_data.append(("Input", label, input_type, name))

# Print the extracted navigation data; the tuple length tells links and
# inputs apart.
for item in navigation_data:
    if len(item) == 2:  # If it's a link
        link_text, link_url = item
        print("Text:", link_text)
        print("URL:", link_url)
    elif len(item) == 4:  # If it's an input field
        print("Input")
        print("Label:", item[1])
        print("Type:", item[2])
        print("Name:", item[3])
    print("----------------------")


Text: stand with us!
URL: https://blog.archive.org/2023/03/25/the-fight-continues/
----------------------
Input
Label: 
Type: hidden
Name: 
----------------------
Text: Internet Archive logo
A line drawing of the Internet Archive headquarters building façade.
URL: https://archive.org/
----------------------
Text: Donate icon
An illustration of a heart shape



"Donate to the archive"
URL: https://archive.org/donate/?origin=iawww-mbhrt
----------------------
Input
Label: 
Type: text
Name: query
----------------------
Text: Upload icon
An illustration of a horizontal line over an up pointing arrow.


Upload
URL: https://archive.org/create
----------------------
Text: User icon
An illustration of a person's head and chest.
URL: 
----------------------
Text: Sign up
URL: https://archive.org/account/signup
----------------------
Text: Log in
URL: https://archive.org/account/login
----------------------
Text: Web icon
An illustration of a computer application window



Wayback Machine
URL: h

In [6]:
import requests
from bs4 import BeautifulSoup

# URL of the Wayback Machine website
url = "https://archive.org/web/"

# Send a GET request to the URL. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops the cell on an HTTP error
# instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the navigation elements and input fields
navigation_elements = soup.find_all(["a", "input"])

# Extract the text, URLs, and input field attributes.
# Links are stored as 2-tuples (text, href); inputs as 5-tuples
# ("Input", aria-label, type, name, id). Missing attributes become "".
navigation_data = []
for element in navigation_elements:
    if element.name == "a":  # If it's a link
        text = element.get_text().strip()
        # Named `href`, not `url`, so the page URL above is not overwritten.
        href = element.get("href", "").strip()
        navigation_data.append((text, href))
    elif element.name == "input":  # If it's an input field
        label = element.get("aria-label", "").strip()
        input_type = element.get("type", "").strip()
        name = element.get("name", "").strip()
        _id = element.get("id", "").strip()
        navigation_data.append(("Input", label, input_type, name, _id))

# Print the extracted navigation data; the tuple length tells links and
# inputs apart.
for item in navigation_data:
    if len(item) == 2:  # If it's a link
        link_text, link_url = item
        print("Text:", link_text)
        print("URL:", link_url)
    elif len(item) == 5:  # If it's an input field
        print("Input")
        print("Label:", item[1])
        print("Type:", item[2])
        print("Name:", item[3])
        print("ID:", item[4])
    print("----------------------")


Text: stand with us!
URL: https://blog.archive.org/2023/03/25/the-fight-continues/
----------------------
Input
Label: 
Type: hidden
Name: 
ID: 
----------------------
Text: Internet Archive logo
A line drawing of the Internet Archive headquarters building façade.
URL: https://archive.org/
----------------------
Text: Donate icon
An illustration of a heart shape



"Donate to the archive"
URL: https://archive.org/donate/?origin=iawww-mbhrt
----------------------
Input
Label: 
Type: text
Name: query
ID: 
----------------------
Text: Upload icon
An illustration of a horizontal line over an up pointing arrow.


Upload
URL: https://archive.org/create
----------------------
Text: User icon
An illustration of a person's head and chest.
URL: 
----------------------
Text: Sign up
URL: https://archive.org/account/signup
----------------------
Text: Log in
URL: https://archive.org/account/login
----------------------
Text: Web icon
An illustration of a computer application window



Wayback Mach

In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the Wayback Machine website
url = "https://archive.org/web/"

# Send a GET request to the URL. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops the cell on an HTTP error
# instead of silently parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

# Find the navigation elements and input fields
navigation_elements = soup.find_all(["a", "input"])

# Collect the report fragments in a list and join them once at the end,
# rather than growing a string with repeated concatenation.
doc_parts = []

# Extract the text, URLs, and input field attributes
for element in navigation_elements:
    if element.name == "a":  # If it's a link
        text = element.get_text().strip()
        # Named `href`, not `url`, so the page URL above is not overwritten.
        href = element.get("href", "").strip()
        doc_parts.append(f"Text: {text}\nURL: {href}\n")
    elif element.name == "input":  # If it's an input field
        label = element.get("aria-label", "").strip()
        input_type = element.get("type", "").strip()
        name = element.get("name", "").strip()
        doc_parts.append(f"Input\nLabel: {label}\nType: {input_type}\nName: {name}\n")
    doc_parts.append("----------------------\n")

# The full report as a single string
doc = "".join(doc_parts)

# Print the extracted data
print(doc)


Text: stand with us!
URL: https://blog.archive.org/2023/03/25/the-fight-continues/
----------------------
Input
Label: 
Type: hidden
Name: 
----------------------
Text: Internet Archive logo
A line drawing of the Internet Archive headquarters building façade.
URL: https://archive.org/
----------------------
Text: Donate icon
An illustration of a heart shape



"Donate to the archive"
URL: https://archive.org/donate/?origin=iawww-mbhrt
----------------------
Input
Label: 
Type: text
Name: query
----------------------
Text: Upload icon
An illustration of a horizontal line over an up pointing arrow.


Upload
URL: https://archive.org/create
----------------------
Text: User icon
An illustration of a person's head and chest.
URL: 
----------------------
Text: Sign up
URL: https://archive.org/account/signup
----------------------
Text: Log in
URL: https://archive.org/account/login
----------------------
Text: Web icon
An illustration of a computer application window



Wayback Machine
URL: h