In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL of the Helmet Library website
BASE_URL = "https://helmet.finna.fi"

# English version of the library list page
LIBRARIES_URL = BASE_URL + "/OrganisationInfo/Home?lng=en-gb"

def get_library_links():
    """Extracts links to all individual library pages"""
    response = requests.get(LIBRARIES_URL)
    soup = BeautifulSoup(response.text, "html.parser")

    library_links = []
    for link in soup.select("div.orglist a"):
        href = link.get("href")
        if href and "OrganisationInfo" in href:
            library_links.append(BASE_URL + href)
    
    return library_links

def get_library_details(library_url):
    """Extracts details of an individual library"""
    response = requests.get(library_url)
    soup = BeautifulSoup(response.text, "html.parser")

    details = {"Name": "", "Address": "", "Opening Hours": "", "Contact": "", "Services": ""}

    # Extracting name
    name_tag = soup.select_one("h1")
    details["Name"] = name_tag.text.strip() if name_tag else "N/A"

    # Extracting address
    address_tag = soup.select_one(".organisation-address")
    details["Address"] = address_tag.text.strip() if address_tag else "N/A"

    # Extracting opening hours
    opening_hours_tag = soup.select_one(".opening-hours")
    details["Opening Hours"] = opening_hours_tag.text.strip() if opening_hours_tag else "N/A"

    # Extracting contact details
    contact_tag = soup.select_one(".organisation-contacts")
    details["Contact"] = contact_tag.text.strip() if contact_tag else "N/A"

    # Extracting services
    services_tag = soup.select_one(".organisation-services")
    details["Services"] = services_tag.text.strip() if services_tag else "N/A"

    return details

def main():
    library_links = get_library_links()
    all_libraries = []

    for link in library_links:
        print(f"Scraping: {link}")
        library_details = get_library_details(link)
        all_libraries.append(library_details)

    # Save to CSV
    df = pd.DataFrame(all_libraries)
    # df.to_csv("helmet_libraries.csv", index=False)
    print("Scraping complete. Data saved to helmet_libraries.csv")

if __name__ == "__main__":
    main()


In [20]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time

# Step 1: Load Extracted Library IDs
libraries_df = pd.read_csv("helmet_libraries.csv")

# Step 2: Define API URL Template
base_api_url = "https://helmet.finna.fi/AJAX/JSON?method=getOrganisationInfo&element=location-details&id=Helmet&locationId={}&sectors=&buildings="
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
    "Referer": "https://helmet.finna.fi/OrganisationInfo/Home?lng=en-gb",
    "X-Requested-With": "XMLHttpRequest"
}

detailed_libraries_data = []

# Step 3: Loop Through Each Library ID and Fetch Details
for _, row in libraries_df.iterrows():
    lib_id = int(row["Library ID"])  # Convert to integer if needed
    api_url = base_api_url.format(lib_id)
    
    response = requests.get(api_url, headers=headers)
    time.sleep(1)  # Avoid rate-limiting

    if response.status_code == 200:
        data = response.json()
        if "data" in data:
            library_info = data["data"]

            # Parse 'info' HTML
            soup_info = BeautifulSoup(library_info.get("info", ""), "html.parser")
            library_name = soup_info.find("h3", class_="location-title")
            library_name = library_name.text.strip() if library_name else "N/A"

            address = soup_info.find("span", class_="location-links address info-element")
            address = address.text.strip() if address else "N/A"

            opening_hours = soup_info.find("span", class_="opening-times")
            opening_hours = opening_hours.text.strip() if opening_hours else "N/A"

            contact_info = soup_info.find("a", href=lambda x: x and x.startswith("mailto:"))
            contact_info = contact_info.text.strip() if contact_info else "N/A"

            # Parse 'details' HTML for services
            soup_details = BeautifulSoup(library_info.get("details", ""), "html.parser")
            services_list = soup_details.find_all("div", class_="service-list")
            services = " | ".join([service.text.strip() for service in services_list]) if services_list else "N/A"

            # Store extracted data
            detailed_libraries_data.append({
                "Library Name": library_name,
                "Library ID": lib_id,
                "Address": address,
                "Opening Hours": opening_hours,
                "Contact Info": contact_info,
                "Services": services
            })
            print(f"✅ Retrieved data for {library_name}")
        else:
            print(f"⚠️ No data found for library ID: {lib_id}")
    else:
        print(f"❌ Failed to fetch data for library ID {lib_id}. Status Code: {response.status_code}")

# Step 4: Convert to DataFrame and Save
detailed_libraries_df = pd.DataFrame(detailed_libraries_data)
print(detailed_libraries_df.head())  # Show first few rows

# Save to CSV
# detailed_libraries_df.to_csv("helmet_library_details.csv", index=False)
# print("✅ Data saved as 'helmet_library_details.csv'")


✅ Retrieved data for Valittukirjasto: Arabianrannan kirjasto
✅ Retrieved data for Valittukirjasto: Auroran sairaalan potilaskirjasto
✅ Retrieved data for Valittukirjasto: Entressen kirjasto
✅ Retrieved data for Valittukirjasto: Etelä-Haagan kirjasto
✅ Retrieved data for Valittukirjasto: Hakunilan kirjasto
✅ Retrieved data for Valittukirjasto: Haukilahden kirjasto
✅ Retrieved data for Valittukirjasto: Keskustakirjasto Oodi
✅ Retrieved data for Valittukirjasto: Herttoniemen kirjasto
✅ Retrieved data for Valittukirjasto: Hiekkaharjun kirjasto
✅ Retrieved data for Valittukirjasto: Ison Omenan kirjasto
✅ Retrieved data for Valittukirjasto: Itäkeskuksen kirjasto
✅ Retrieved data for Valittukirjasto: Jakomäen kirjasto
✅ Retrieved data for Valittukirjasto: Jätkäsaaren kirjasto
✅ Retrieved data for Valittukirjasto: Kalajärven kirjasto
✅ Retrieved data for Valittukirjasto: Kalasataman kirjasto
✅ Retrieved data for Valittukirjasto: Kallion kirjasto
✅ Retrieved data for Valittukirjasto: Kannelmäen

In [21]:
detailed_libraries_df.head()

Unnamed: 0,Library Name,Library ID,Address,Opening Hours,Contact Info,Services
0,Valittukirjasto: Arabianrannan kirjasto,84921,"Hämeentie 135 A, 00560 Helsinki",Auki tänään 08–20,arabianrannan_kirjasto@hel.fi,E-lehtipalvelupiste \n\n\n\n ...
1,Valittukirjasto: Auroran sairaalan potilaskirj...,84849,"Nordenskiöldinkatu 20, rak. 3 B-rappu, 00250 H...",,auroran.potilaskirjasto@hel.fi,"Tietokoneet | E-kirjasto (e-kirjat, äänikirjat..."
2,Valittukirjasto: Entressen kirjasto,84787,"Siltakatu 11, 02770 Espoo",Auki tänään 08–20,kirjasto.entresse@espoo.fi,Avoinna lauantaisin \n\n ...
3,Valittukirjasto: Etelä-Haagan kirjasto,84878,"Isonnevantie 16 B, 00320 Helsinki",Auki tänään 08–20,etela-haagan_kirjasto@hel.fi,Avoinna lauantaisin \n\n\n ...
4,Valittukirjasto: Hakunilan kirjasto,84869,"Kimokuja 5, 01200 Vantaa",Auki tänään 08–20,hakunila.kirjasto@vantaa.fi,Avoinna lauantaisin \n\n\n\n ...
