In [1]:
import requests  # Import requests to fetch web page content
from bs4 import BeautifulSoup  # Import BeautifulSoup to parse HTML
import json  # Import JSON to store the scraped data

# Scrape every HTML table on the BU facts & stats page and store the cell text
# as nested lists (tables -> rows -> cells) in 'bu_facts_stats.json'.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# requests waits indefinitely by default; the timeout keeps this cell from
# hanging forever if the server stops responding.
response = requests.get(url, timeout=30)

# Only parse and save when the server answered with 200 OK
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all table elements in the page
    tables = soup.find_all('table')

    # One entry per table; each entry is a list of rows, and each row is the
    # list of its stripped <th>/<td> cell texts in document order.
    data = [
        [
            [cell.text.strip() for cell in row.find_all(['th', 'td'])]
            for row in table.find_all('tr')
        ]
        for table in tables
    ]

    # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file
    with open('bu_facts_stats.json', 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

    print("Data successfully scraped and saved to 'bu_facts_stats.json'")
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

Data successfully scraped and saved to 'bu_facts_stats.json'


In [2]:
import requests
from bs4 import BeautifulSoup
import json

# Scrape the dataset listing table from the UCI repository page and store one
# dict per table row (column name -> cell text) in 'uci_datasets.json'.
# NOTE(review): the recorded output of this cell is "Status code: 404", so this
# URL appears to be stale -- confirm where the dataset listing lives now.
url = "https://archive.ics.uci.edu/ml/datasets.php"

# Some sites block the default client User-Agent, so a browser-like one is sent.
# `headers` is kept for the request only; the table's column names get their
# own variable below instead of overwriting it.
headers = {"User-Agent": "Mozilla/5.0"}

# requests waits indefinitely by default; the timeout keeps this cell from
# hanging forever if the server stops responding.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # The code looks for the first <table> carrying border="1".
    # soup.find returns None when nothing matches, so guard before using it.
    table = soup.find("table", {"border": "1"})

    if table is None:
        print('No table with border="1" was found on the page; nothing was saved.')
    else:
        table_rows = table.find_all("tr")

        # The first row is treated as the header row, so column names are read
        # from that row only (its <th> or <td> cells).
        column_names = []
        if table_rows:
            column_names = [
                cell.text.strip() for cell in table_rows[0].find_all(["th", "td"])
            ]

        datasets = []
        for row in table_rows[1:]:  # Skipping the header row
            cells = [cell.text.strip() for cell in row.find_all(["th", "td"])]
            if not cells:
                continue  # spacer/empty rows carry no data
            # zip pairs names and values by position and stops at the shorter
            # side, so a row with extra cells cannot raise IndexError.
            datasets.append(dict(zip(column_names, cells)))

        # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file
        with open("uci_datasets.json", "w", encoding="utf-8") as file:
            json.dump(datasets, file, ensure_ascii=False, indent=4)

        print("Data successfully extracted and saved to uci_datasets.json")
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

Failed to retrieve the webpage. Status code: 404


In [3]:
import requests
from bs4 import BeautifulSoup
import json

# Scrape the first 'wikitable' on Wikipedia's list of U.S. presidents and store
# one dict per data row (column name -> cell text) in 'us_presidents.json'.
url = "https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States"

# Browser-like User-Agent to avoid being blocked. `headers` is kept for the
# request only; the table's column names get their own variable below.
headers = {"User-Agent": "Mozilla/5.0"}

# requests waits indefinitely by default; the timeout keeps this cell from
# hanging forever if the server stops responding.
response = requests.get(url, headers=headers, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.text, "html.parser")

    # First table whose class list contains "wikitable"; None if there is none.
    table = soup.find("table", {"class": "wikitable"})

    if table is None:
        print("No table with class 'wikitable' was found on the page; nothing was saved.")
    else:
        table_rows = table.find_all("tr")

        # Column names come from the header row only. Collecting every <th> in
        # the table would also pick up row-header cells from the data rows.
        column_names = []
        if table_rows:
            column_names = [
                cell.text.strip() for cell in table_rows[0].find_all(["th", "td"])
            ]

        rows = []
        for row in table_rows[1:]:  # Skip the header row
            if not row.find_all("td"):
                continue  # rows without data cells (e.g. extra header rows)
            # Read <th> and <td> together so a row whose leading cell is a
            # row header (<th>) still lines up with the header names by position.
            # NOTE(review): colspan/rowspan cells are not expanded, so columns
            # can still shift on such rows -- verify the saved output.
            cells = [cell.text.strip() for cell in row.find_all(["th", "td"])]
            rows.append(dict(zip(column_names, cells)))

        # ensure_ascii=False keeps non-ASCII characters readable in the UTF-8 file
        with open("us_presidents.json", "w", encoding="utf-8") as file:
            json.dump(rows, file, ensure_ascii=False, indent=4)

        print("Scraped data saved to us_presidents.json")
else:
    print("Failed to retrieve the webpage. Status Code:", response.status_code)


Scraped data saved to us_presidents.json
