## Exercises: Day 20

### 1.

In [1]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
import nltk
from nltk.corpus import stopwords


nltk.download('stopwords')

def get_text_from_url(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

# Function to find the 10 most frequent words
def most_frequent_words(text, top_n=10):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    word_counter = Counter(words)
    most_common = word_counter.most_common(top_n)
    
    return most_common

# URL for Romeo and Juliet text
romeo_and_juliet_url = 'http://www.gutenberg.org/files/1112/1112.txt'

# Get text from the URL
romeo_and_juliet_text = get_text_from_url(romeo_and_juliet_url)

# Find the 10 most frequent words
most_common_words = most_frequent_words(romeo_and_juliet_text, top_n=10)

# Print the result
print("10 Most Frequent Words:")
for word, count in most_common_words:
    print(f"{word}: {count}")


ModuleNotFoundError: No module named 'bs4'

### 2.

In [6]:
import requests
import numpy as np
from statistics import mean, median, stdev
from collections import Counter

def get_cats_data(api_url):
    response = requests.get(api_url)
    cats_data = response.json()
    return cats_data

cats_api = 'https://api.thecatapi.com/v1/breeds'

cats_data = get_cats_data(cats_api)

weights = []
lifespans = []
country_breed_freq = Counter()

for cat in cats_data:
    if 'weight' in cat:
        weight_metric = cat['weight']['metric']
        weight_value = float(weight_metric.split()[0])  
        weights.append(weight_value)

    if 'life_span' in cat:
        lifespan_years = cat['life_span']
        if '-' in lifespan_years:
            lifespan_values = [float(value) for value in lifespan_years.split('-')]
            lifespan_years = mean(lifespan_values)
        else:
            lifespan_years = float(lifespan_years)
        lifespans.append(lifespan_years)

    
    if 'origin' in cat and 'name' in cat:
        country_breed_freq[(cat['origin'], cat['name'])] += 1


weight_stats = {
    'min': min(weights),
    'max': max(weights),
    'mean': mean(weights),
    'median': median(weights),
    'std_dev': stdev(weights)
}

lifespan_stats = {
    'min': min(lifespans),
    'max': max(lifespans),
    'mean': mean(lifespans),
    'median': median(lifespans),
    'std_dev': stdev(lifespans)
}

print("Statistics for Cat Weights (in metric units):")
for stat, value in weight_stats.items():
    print(f"{stat}: {value:.2f}")

print("\nStatistics for Cat Lifespans (in years):")
for stat, value in lifespan_stats.items():
    print(f"{stat}: {value:.2f}")

print("\nFrequency Table of Country and Breed:")
for (country, breed), freq in country_breed_freq.items():
    print(f"{country} - {breed}: {freq} times")


Statistics for Cat Weights (in metric units):
min: 2.00
max: 5.00
mean: 3.22
median: 3.00
std_dev: 0.88

Statistics for Cat Lifespans (in years):
min: 10.50
max: 19.00
mean: 13.75
median: 13.50
std_dev: 1.58

Frequency Table of Country and Breed:
Egypt - Abyssinian: 1 times
Greece - Aegean: 1 times
United States - American Bobtail: 1 times
United States - American Curl: 1 times
United States - American Shorthair: 1 times
United States - American Wirehair: 1 times
United Arab Emirates - Arabian Mau: 1 times
Australia - Australian Mist: 1 times
United States - Balinese: 1 times
United States - Bambino: 1 times
United States - Bengal: 1 times
France - Birman: 1 times
United States - Bombay: 1 times
United Kingdom - British Longhair: 1 times
United Kingdom - British Shorthair: 1 times
Burma - Burmese: 1 times
United Kingdom - Burmilla: 1 times
United States - California Spangled: 1 times
United States - Chantilly-Tiffany: 1 times
France - Chartreux: 1 times
Egypt - Chausie: 1 times
United 

### 3.

### i.


In [12]:
import requests

countries_api = 'https://restcountries.com/v2/all'


response = requests.get(countries_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    countries_data = response.json()

    # Sort countries based on total area in descending order
    sorted_countries = sorted(countries_data, key=lambda x: x.get('area', 0), reverse=True)

    # Get the top 10 largest countries
    top_10_largest_countries = sorted_countries[:10]

    # Print the list of 10 largest countries
    print("Top 10 Largest Countries:")
    print("{:<3} {:<40} {:<15}".format("Rank", "Country", "Total Area (sq km)"))
    print("="*60)
    for i, country in enumerate(top_10_largest_countries, start=1):
        country_name = country.get('name', '')
        total_area = country.get('area', 0)
        print("{:<3} {:<40} {:<15}".format(i, country_name, total_area))

else:
    print(f"Error: Unable to retrieve data from the countries API. Status code: {response.status_code}")

Top 10 Largest Countries:
Rank Country                                  Total Area (sq km)
1   Russian Federation                       17124442.0     
2   Antarctica                               14000000.0     
3   Canada                                   9984670.0      
4   China                                    9640011.0      
5   United States of America                 9629091.0      
6   Brazil                                   8515767.0      
7   Australia                                7692024.0      
8   India                                    3287590.0      
9   Argentina                                2780400.0      
10  Kazakhstan                               2724900.0      


### ii.

In [13]:
import requests

# URL of the countries API
countries_api = 'https://restcountries.com/v2/all'

# Send a GET request to the API
response = requests.get(countries_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    countries_data = response.json()

    # Extract language names from all countries
    all_languages = [language['name'] for country in countries_data for language in country.get('languages', [])]

    # Create a frequency table for languages
    language_frequency = {}
    for language in all_languages:
        language_frequency[language] = language_frequency.get(language, 0) + 1

    # Sort languages based on frequency in descending order
    sorted_languages = sorted(language_frequency.items(), key=lambda x: x[1], reverse=True)

    # Get the top 10 most spoken languages
    top_10_languages = sorted_languages[:10]

    # Print the list of 10 most spoken languages
    print("Top 10 Most Spoken Languages:")
    print("{:<3} {:<25} {:<10}".format("Rank", "Language", "Frequency"))
    print("="*50)
    for i, (language, frequency) in enumerate(top_10_languages, start=1):
        print("{:<3} {:<25} {:<10}".format(i, language, frequency))

else:
    print(f"Error: Unable to retrieve data from the countries API. Status code: {response.status_code}")

Top 10 Most Spoken Languages:
Rank Language                  Frequency 
1   English                   91        
2   French                    45        
3   Arabic                    25        
4   Spanish                   24        
5   Portuguese                10        
6   Russian                   8         
7   Dutch                     8         
8   German                    7         
9   Chinese                   5         
10  Serbian                   4         


### iii.

In [14]:
import requests

# URL of the countries API
countries_api = 'https://restcountries.com/v2/all'

# Send a GET request to the API
response = requests.get(countries_api)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Get the JSON data from the response
    countries_data = response.json()

    # Extract language codes from all countries
    all_languages = [language.get('iso639_1', None) for country in countries_data for language in country.get('languages', [])]

    # Remove None values
    all_languages = [lang for lang in all_languages if lang is not None]

    # Get the total number of unique languages
    total_languages = len(set(all_languages))

    print(f"Total number of languages in the countries API: {total_languages}")

else:
    print(f"Error: Unable to retrieve data from the countries API. Status code: {response.status_code}")

Total number of languages in the countries API: 112


### 4.

In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the UCI Machine Learning Repository
url = 'https://archive.ics.uci.edu/ml/datasets.php'

# Send a GET request to the URL
response = requests.get(url)

# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content of the page using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find and print the content of the page (you can customize this based on your needs)
    content = soup.prettify()  # You can use soup.text for plain text without indentation
    print(content)

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


Failed to retrieve the page. Status code: 404
