## Exercises: Day 20

### 1.

In [4]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
import nltk
from nltk.corpus import stopwords

romeo_and_juliet_url = 'http://www.gutenberg.org/files/1112/1112.txt'


response = requests.get(romeo_and_juliet_url)


if response.status_code == 200:

    soup = BeautifulSoup(response.content, 'html.parser')

    text = soup.get_text()

    words = nltk.word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]

    word_freq = Counter(words)

    most_common_words = word_freq.most_common(10)

    print("The 10 most frequent words in Romeo and Juliet:")
    for word, frequency in most_common_words:
        print(f"{word}: {frequency}")

else:
    print('Failed to retrieve data from the URL. Status code:', response.status_code)


Failed to retrieve data from the URL. Status code: 404


### 2.

In [2]:
import requests
import numpy as np
from statistics import mean, median, stdev
from collections import Counter

def get_cats_data(api_url):
    response = requests.get(api_url)
    cats_data = response.json()
    return cats_data

cats_api = 'https://api.thecatapi.com/v1/breeds'

cats_data = get_cats_data(cats_api)

weights = []
lifespans = []
country_breed_freq = Counter()

for cat in cats_data:
    if 'weight' in cat:
        weight_metric = cat['weight']['metric']
        weight_value = float(weight_metric.split()[0])  
        weights.append(weight_value)

    if 'life_span' in cat:
        lifespan_years = cat['life_span']
        if '-' in lifespan_years:
            lifespan_values = [float(value) for value in lifespan_years.split('-')]
            lifespan_years = mean(lifespan_values)
        else:
            lifespan_years = float(lifespan_years)
        lifespans.append(lifespan_years)

    
    if 'origin' in cat and 'name' in cat:
        country_breed_freq[(cat['origin'], cat['name'])] += 1


weight_stats = {
    'min': min(weights),
    'max': max(weights),
    'mean': mean(weights),
    'median': median(weights),
    'std_dev': stdev(weights)
}

lifespan_stats = {
    'min': min(lifespans),
    'max': max(lifespans),
    'mean': mean(lifespans),
    'median': median(lifespans),
    'std_dev': stdev(lifespans)
}

print("Statistics for Cat Weights (in metric units):")
for stat, value in weight_stats.items():
    print(f"{stat}: {value:.2f}")

print("\nStatistics for Cat Lifespans (in years):")
for stat, value in lifespan_stats.items():
    print(f"{stat}: {value:.2f}")

print("\nFrequency Table of Country and Breed:")
for (country, breed), freq in country_breed_freq.items():
    print(f"{country} - {breed}: {freq} times")


Statistics for Cat Weights (in metric units):
min: 2.00
max: 5.00
mean: 3.22
median: 3.00
std_dev: 0.88

Statistics for Cat Lifespans (in years):
min: 10.50
max: 19.00
mean: 13.75
median: 13.50
std_dev: 1.58

Frequency Table of Country and Breed:
Egypt - Abyssinian: 1 times
Greece - Aegean: 1 times
United States - American Bobtail: 1 times
United States - American Curl: 1 times
United States - American Shorthair: 1 times
United States - American Wirehair: 1 times
United Arab Emirates - Arabian Mau: 1 times
Australia - Australian Mist: 1 times
United States - Balinese: 1 times
United States - Bambino: 1 times
United States - Bengal: 1 times
France - Birman: 1 times
United States - Bombay: 1 times
United Kingdom - British Longhair: 1 times
United Kingdom - British Shorthair: 1 times
Burma - Burmese: 1 times
United Kingdom - Burmilla: 1 times
United States - California Spangled: 1 times
United States - Chantilly-Tiffany: 1 times
France - Chartreux: 1 times
Egypt - Chausie: 1 times
United 

### 3.

In [None]:
import requests


url = 'https://restcountries.eu/rest/v2/all'
response = requests.get(url)


if response.status_code == 200:
    countries_data = response.json()

    largest_countries = sorted(countries_data, key=lambda x: x['area'], reverse=True)[:10]
    print("\nTask 1: The 10 largest countries:")
    for country in largest_countries:
        print(f"{country['name']}: {country['area']} sq km")

    all_languages = [language for country in countries_data for language in country['languages']]
    language_counts = {language: all_languages.count(language) for language in set(all_languages)}
    most_spoken_languages = sorted(language_counts.items(), key=lambda x: x[1], reverse=True)[:10]
    print("\nTask 2: The 10 most spoken languages:")
    for language, count in most_spoken_languages:
        print(f"{language}: {count} countries")

    total_languages = len(set(all_languages))
    print("\nTask 3: Total number of languages in the countries API:", total_languages)

else:
    print('Failed to retrieve data from the API. Status code:', response.status_code)


### 4.

In [None]:
import requests
from bs4 import BeautifulSoup

uci_url = 'https://archive.ics.uci.edu/ml/datasets.php'

def get_html_content(url):
    response = requests.get(url)
    return response.text

def extract_datasets_info(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    dataset_table = soup.find('table', {'border': '1'})
  
    datasets_info = []
    for row in dataset_table.find_all('tr')[1:]:  
        columns = row.find_all('td')
        if len(columns) >= 2:
            dataset_name = columns[0].text.strip()
            dataset_link = columns[0].find('a')['href']
            dataset_description = columns[1].text.strip()
            datasets_info.append({
                'name': dataset_name,
                'link': dataset_link,
                'description': dataset_description
            })
    
    return datasets_info

uci_html_content = get_html_content(uci_url)

uci_datasets_info = extract_datasets_info(uci_html_content)


for dataset_info in uci_datasets_info[:5]:
    print(f"Name: {dataset_info['name']}")
    print(f"Link: {dataset_info['link']}")
    print(f"Description: {dataset_info['description']}")
    print("-" * 50)

print(f"Total number of datasets: {len(uci_datasets_info)}")


ModuleNotFoundError: No module named 'bs4'