In [7]:
import requests
from collections import Counter
import re
from bs4 import BeautifulSoup

def most_frequent_words(url, top_n=10):
    """
    Fetches text from a URL, strips HTML markup and Project Gutenberg
    boilerplate, and returns the most frequent words.

    If the page contains both a "*** START OF THE/THIS PROJECT GUTENBERG
    EBOOK <title> ***" line and the matching "*** END OF ... ***" line, only
    the text between them is counted. Otherwise the whole page is counted.

    Parameters:
    url (str): The URL of the text source.
    top_n (int): Number of most common words to return.

    Returns:
    list: A list of (word, count) tuples, most frequent first. A word is a
    lower-cased run of two or more ASCII letters.

    Raises:
    requests.HTTPError: If the server answers with an error status.
    requests.RequestException: For connection failures or a timeout.
    """
    # Fail loudly on 4xx/5xx: otherwise an error page would be analysed as if
    # it were the book. The timeout keeps a stalled server from hanging the cell.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Use BeautifulSoup to remove HTML tags and keep only visible text
    soup = BeautifulSoup(response.text, "html.parser")
    text = _strip_gutenberg_boilerplate(soup.get_text(separator=" "))

    # Keep alphabetic tokens of length >= 2 (drops numbers and punctuation)
    words = re.findall(r'\b[a-zA-Z]{2,}\b', text.lower())

    # Ties keep first-seen order (Counter.most_common behaviour)
    return Counter(words).most_common(top_n)


def _strip_gutenberg_boilerplate(text):
    """Return the text between the Gutenberg START/END marker lines, or ``text`` unchanged."""
    # The title is matched loosely ([^*]*) so the function is not tied to one
    # book; both the "THE" and "THIS" marker wordings are accepted.
    marker = r'\*{3}\s*%s\s+OF\s+(?:THE|THIS)\s+PROJECT\s+GUTENBERG\s+EBOOK[^*]*\*{3}'

    start = re.search(marker % 'START', text)
    if start is None:
        return text

    # Only accept an END marker that appears after the START marker.
    end = re.compile(marker % 'END').search(text, start.end())
    if end is None:
        return text

    return text[start.end():end.start()]

# URL of "Romeo and Juliet" from Project Gutenberg
# NOTE(review): the output recorded for this cell lists site words such as
# 'gutenberg', 'contact' and 'help' with single-digit counts, so this request
# probably did not return the play's text — confirm the URL still serves the book.
romeo_and_juliet_url = 'http://www.gutenberg.org/files/1112/1112.txt'

# Fetch the page and print the 10 most frequent words as (word, count) tuples
print(most_frequent_words(romeo_and_juliet_url, 10))


[('to', 5), ('the', 5), ('gutenberg', 4), ('you', 4), ('project', 3), ('about', 3), ('contact', 3), ('of', 3), ('and', 3), ('help', 3)]


2. Analyze Cats API Data
We will:

Fetch the cat breeds data
Extract weight and lifespan values

Compute min, max, mean, median, and standard deviation for both

Build a frequency table of breeds per country of origin

In [8]:
import requests  # Library for making HTTP requests
import numpy as np  # Library for numerical computations (not used here but typically useful)
import statistics  # Library for computing statistical measures

# API endpoint for fetching cat breed data.
# Module-level so that analyze_cats() can read it when it issues the request.
cats_api_url = 'https://api.thecatapi.com/v1/breeds'

def analyze_cats():
    """
    Fetches cat breed data from The Cat API and summarises weight, lifespan,
    and country of origin.

    Each breed's weight (``weight['metric']``) and ``life_span`` are range
    strings such as "3 - 5"; the mean of the numbers in the range is used as
    that breed's value. Entries whose range is missing or cannot be parsed are
    skipped for that statistic, but still counted in the country table.

    Returns:
        - weight_stats (dict): Min, max, mean, median, and standard deviation of cat weights
        - lifespan_stats (dict): Min, max, mean, median, and standard deviation of cat lifespan
        - country_counts (dict): Frequency table of cat breeds per country
          (breeds without an 'origin' are counted under 'Unknown')

        In the two stats dicts every value is None when no breed supplied
        usable data, and "std_dev" is None when only one breed did.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: For connection failures or a timeout.
    """
    # Fail loudly on an HTTP error instead of iterating over an error payload.
    response = requests.get(cats_api_url, timeout=30)
    response.raise_for_status()
    cats = response.json()

    weights = []  # Per-breed average weight
    lifespans = []  # Per-breed average lifespan
    country_counts = {}  # Country of origin -> number of breeds

    for cat in cats:
        # 'weight' is expected to be a dict with a 'metric' range string.
        weight = cat.get('weight')
        if isinstance(weight, dict):
            average_weight = _mean_of_range(weight.get('metric'))
            if average_weight is not None:
                weights.append(average_weight)

        average_lifespan = _mean_of_range(cat.get('life_span'))
        if average_lifespan is not None:
            lifespans.append(average_lifespan)

        country = cat.get('origin', 'Unknown')
        country_counts[country] = country_counts.get(country, 0) + 1

    return _summary_stats(weights), _summary_stats(lifespans), country_counts


def _mean_of_range(raw):
    """Return the mean of a range string like "3 - 5", or None if it cannot be parsed."""
    if not isinstance(raw, str):
        return None

    numbers = []
    # Accept an en dash as the separator as well as a plain hyphen.
    for part in raw.replace('\u2013', '-').split('-'):
        try:
            number = float(part)
        except ValueError:
            return None
        # float() also accepts "nan"/"inf"; those would corrupt min/max/mean.
        if not float('-inf') < number < float('inf'):
            return None
        numbers.append(number)

    return statistics.mean(numbers)


def _summary_stats(values):
    """Return min/max/mean/median/std_dev for ``values``, using None where undefined."""
    if not values:
        return {"min": None, "max": None, "mean": None, "median": None, "std_dev": None}

    return {
        "min": min(values),
        "max": max(values),
        "mean": statistics.mean(values),
        "median": statistics.median(values),
        # Sample standard deviation needs at least two data points.
        "std_dev": statistics.stdev(values) if len(values) > 1 else None,
    }

# Call the function to analyze cat data.
# Despite their names, `weights` and `lifespans` receive the summary
# dictionaries (min/max/mean/median/std_dev) that analyze_cats() returns,
# not the raw per-breed lists it builds internally.
weights, lifespans, country_counts = analyze_cats()

# Print results
print("Weight Stats:", weights)  # Display weight statistics
print("Lifespan Stats:", lifespans)  # Display lifespan statistics
print("Country Frequency:", country_counts)  # Display frequency of cat breeds by country


Weight Stats: {'min': 3.0, 'max': 7.5, 'mean': 4.708955223880597, 'median': 4.5, 'std_dev': 1.066533799956462}
Lifespan Stats: {'min': 10.5, 'max': 19.0, 'mean': 13.746268656716419, 'median': 13.5, 'std_dev': 1.5844249849048053}
Country Frequency: {'Egypt': 3, 'Greece': 1, 'United States': 28, 'United Arab Emirates': 1, 'Australia': 1, 'France': 2, 'United Kingdom': 8, 'Burma': 2, 'Canada': 3, 'Cyprus': 1, 'Russia': 4, 'China': 1, 'Japan': 1, 'Thailand': 4, 'Isle of Man': 1, 'Norway': 1, 'Iran (Persia)': 1, 'Singapore': 1, 'Somalia': 1, 'Turkey': 2}


3. Analyze Countries API Data
We will:

Fetch country data

Find the 10 largest countries

Find the 10 most widely spoken languages (ranked by the number of countries that list each one)

Count total languages

In [9]:
import requests  # Import requests to fetch API data

def analyze_countries(top_n=10):
    """
    Fetches country data from the REST Countries API and analyzes:
    - The ``top_n`` largest countries by population
    - The ``top_n`` most widespread languages
    - The total number of unique languages listed

    "Most spoken" is measured by how many countries list the language, not by
    number of speakers: each country adds one to the count of every language
    in its 'languages' mapping.

    Parameters:
        top_n (int): How many countries and languages to return (default 10).

    Returns:
        - largest_countries (list of dicts): Top countries as
          {'country': <common name>, 'population': <int>}; a missing
          population is reported as 0 and a missing name as 'Unknown'
        - most_spoken_languages (list of tuples): Top (language, country count) pairs
        - total_languages (int): Total number of unique languages across all countries

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.RequestException: For connection failures or a timeout.
    """
    # API URL for retrieving country data
    countries_api_url = 'https://restcountries.com/v3.1/all'

    # Request only the fields this analysis reads, and fail loudly on an HTTP
    # error instead of trying to sort an error payload.
    response = requests.get(
        countries_api_url,
        params={'fields': 'name,population,languages'},
        timeout=30,
    )
    response.raise_for_status()
    countries = response.json()

    # Sorting and formatting use the same fallback for a missing population,
    # so a record the sort accepts can never raise KeyError when formatted.
    by_population = sorted(
        countries,
        key=lambda c: c.get('population') or 0,
        reverse=True,
    )
    largest_countries = [
        {
            'country': (c.get('name') or {}).get('common', 'Unknown'),
            'population': c.get('population') or 0,
        }
        for c in by_population[:top_n]
    ]

    # language name -> number of countries that list it
    language_counts = {}
    for country in countries:
        # 'languages' may be absent or null for some territories
        for lang in (country.get('languages') or {}).values():
            language_counts[lang] = language_counts.get(lang, 0) + 1

    # sorted() is stable, so tied languages keep first-seen order
    most_spoken_languages = sorted(
        language_counts.items(),
        key=lambda item: item[1],
        reverse=True,
    )[:top_n]

    # The keys of language_counts are exactly the unique languages seen
    return largest_countries, most_spoken_languages, len(language_counts)

# Call the function to analyze country data
largest_countries, top_languages, total_languages = analyze_countries()

# Display results.
# Each number in `top_languages` is how many countries list that language
# (analyze_countries adds one per country), not a count of speakers.
print("10 Largest Countries by Population:", largest_countries)
print("10 Most Spoken Languages:", top_languages)
print("Total Number of Unique Languages:", total_languages)


10 Largest Countries by Population: [{'country': 'China', 'population': 1402112000}, {'country': 'India', 'population': 1380004385}, {'country': 'United States', 'population': 329484123}, {'country': 'Indonesia', 'population': 273523621}, {'country': 'Pakistan', 'population': 220892331}, {'country': 'Brazil', 'population': 212559409}, {'country': 'Nigeria', 'population': 206139587}, {'country': 'Bangladesh', 'population': 164689383}, {'country': 'Russia', 'population': 144104080}, {'country': 'Mexico', 'population': 128932753}]
10 Most Spoken Languages: [('English', 91), ('French', 46), ('Arabic', 25), ('Spanish', 24), ('Portuguese', 10), ('Dutch', 7), ('Russian', 7), ('German', 6), ('Chinese', 5), ('Italian', 4)]
Total Number of Unique Languages: 155


4. Scrape UCI Machine Learning Repository
We will:

Fetch and parse the webpage

Extract dataset names

In [28]:
import requests
from bs4 import BeautifulSoup

# Function to scrape dataset names from UCI Machine Learning Repository
def scrape_uci_datasets(url='https://archive.ics.uci.edu/ml/datasets.php', limit=10):
    """
    Scrapes dataset names from the UCI Machine Learning Repository.

    A dataset name is the stripped text of the first link in the first cell
    of a table row. The first row of every table is skipped as a header.

    Parameters:
        url (str): Page to scrape. Defaults to the original datasets listing.
            NOTE(review): the output recorded for this cell is a 404 for that
            address — confirm the current listing URL and pass it here.
        limit (int): Maximum number of names to return (default 10).

    Returns:
        list: Up to ``limit`` dataset names. On failure the list instead holds
        a single message: "Error: Request timed out", "Request failed: <reason>",
        or "No datasets found".
    """
    headers = {'User-Agent': 'Mozilla/5.0'}  # Set headers to avoid bot blocking

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise error if request fails
    except requests.Timeout:
        return ["Error: Request timed out"]
    except requests.RequestException as e:
        return [f"Request failed: {e}"]

    # Parse HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    datasets = []
    # find_all() is recursive, so a row inside a nested table is returned once
    # for its own table and once for every enclosing table. Track rows by
    # identity so each contributes at most one name.
    seen_rows = set()
    for table in soup.find_all("table"):
        for row in table.find_all("tr")[1:]:  # Skip header row
            if id(row) in seen_rows:
                continue
            seen_rows.add(id(row))

            columns = row.find_all("td")
            if not columns:
                continue
            link = columns[0].a  # First link in the first cell, if any
            if link is None:
                continue

            dataset_name = link.text.strip()
            if dataset_name:  # Links with no text would yield empty names
                datasets.append(dataset_name)

    return datasets[:limit] if datasets else ["No datasets found"]

# Display the scraped datasets.
# scrape_uci_datasets() reports failures as a one-item list holding a message
# (e.g. "Request failed: ..."), so an error is printed here rather than raised.
print("UCI Datasets:", scrape_uci_datasets())

UCI Datasets: ['Request failed: 404 Client Error: Not Found for url: https://archive.ics.uci.edu/datasets.php']


In [20]:
!pip install selenium webdriver-manager



Defaulting to user installation because normal site-packages is not writeable


This Python exercise focuses on retrieving and analyzing various types of data. It includes processing the text of Romeo and Juliet to identify frequently used words. Additionally, it involves interacting with APIs, such as the Cats API, to compute key statistics on cat weights, lifespans, and breed distributions. The Countries API is also explored to determine the largest nations, the most commonly spoken languages, and the total number of languages recorded. Lastly, the exercise introduces web scraping using BeautifulSoup4 to collect dataset information from the UCI Machine Learning Repository. Overall, these tasks strengthen skills in data collection, structured and unstructured data handling, API usage, statistical calculations, and web scraping.