In [None]:
# count records and check if they have a non-null url_dnb_archive

import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect("../downloader/dnb_records.db")

try:
    # Query to get the total number of records
    total_records_query = "SELECT COUNT(*) FROM dnb_records"
    total_records = int(pd.read_sql_query(total_records_query, conn).iloc[0, 0])

    # Query to get the number of records with non-null url_dnb_archive
    non_null_archive_query = (
        "SELECT COUNT(*) FROM dnb_records WHERE url_dnb_archive IS NOT NULL"
    )
    non_null_archive_records = int(
        pd.read_sql_query(non_null_archive_query, conn).iloc[0, 0]
    )
finally:
    # Release the connection even when one of the queries fails, so a broken
    # cell does not leave an open handle behind in the kernel.
    conn.close()

# Print the results
print(f"Total number of records: {total_records}")
print(f"Number of records with non-null url_dnb_archive: {non_null_archive_records}")
if total_records > 0:
    print(
        f"Percentage of records with non-null url_dnb_archive: {(non_null_archive_records / total_records) * 100:.2f}%"
    )
else:
    # An empty table has no meaningful ratio; avoid dividing by zero.
    print("Percentage of records with non-null url_dnb_archive: n/a (no records)")

In [None]:
# select 5 random records with non-null url_dnb_archive

import sqlite3
import pandas as pd
import requests
from urllib.parse import unquote

# Connect to the SQLite database
conn = sqlite3.connect("../downloader/dnb_records.db")

# Query to get 5 random non-null url_dnb_archive links
query = """
SELECT url_dnb_archive, title, publication_year, year
FROM dnb_records 
WHERE url_dnb_archive IS NOT NULL 
ORDER BY RANDOM() 
LIMIT 5
"""
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection even if the query raises, so a failed cell does
    # not leave an open handle behind in the kernel.
    conn.close()


# Function to check URL and get file info
def check_url_and_get_info(url):
    """Probe ``url`` with an HTTP HEAD request and summarize the result.

    Redirects are followed and the request times out after 10 seconds.

    Args:
        url: Address to check.

    Returns:
        A ``(status, size, filename)`` tuple:

        * ``("Online", size, filename)`` when the final response is 200 and
          its Content-Type mentions ``application/pdf``. ``size`` is the
          Content-Length in bytes (0 when missing or not an integer) and
          ``filename`` is the URL-decoded last path segment of ``url``.
        * ``("Online (Not PDF)", 0, "")`` for a 200 response of another type.
        * ``("Offline (Status: <code>)", 0, "")`` for any other status code.
        * ``("Error: <message>", 0, "")`` when the request itself fails.
    """
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
    except requests.RequestException as e:
        return f"Error: {str(e)}", 0, ""

    if response.status_code != 200:
        return f"Offline (Status: {response.status_code})", 0, ""

    # Media type names are case-insensitive, so compare in lower case.
    content_type = response.headers.get("Content-Type", "")
    if "application/pdf" not in content_type.lower():
        return "Online (Not PDF)", 0, ""

    # A malformed Content-Length must not abort the whole check; report the
    # size as unknown (0) instead.
    try:
        size = int(response.headers.get("Content-Length", 0))
    except (TypeError, ValueError):
        size = 0

    # Drop fragment and query string before taking the last path segment so
    # they do not end up in the file name.
    path_part = url.split("#", 1)[0].split("?", 1)[0]
    filename = unquote(path_part.split("/")[-1])
    return "Online", size, filename

print("Analyzing 5 random records with non-null url_dnb_archive: \n")

# Probe every sampled link and print a short report per record
for index, row in df.iterrows():
    url, title = row["url_dnb_archive"], row["title"]
    year, publication_year = row["year"], row["publication_year"]
    status, size, filename = check_url_and_get_info(url)

    report_lines = [
        f"{index + 1}) Title: {title}",
        f"Year: {year} (publication year: {publication_year})",
        f"URL: {url}",
        f"Status: {status}",
    ]
    # The size is only reported when the server announced a positive length
    if size > 0:
        report_lines.append(f"File size: {size / (1024 * 1024):.2f} MB")

    print("\n".join(report_lines))
    print()

In [None]:
import sqlite3
import json
from collections import Counter, defaultdict


def load_ddc_basic(ddc_basic_path="../data/ddc/ddc-basic.json"):
    """Load the DDC lookup table from a UTF-8 encoded JSON file.

    Args:
        ddc_basic_path: Location of the JSON file. Defaults to the path the
            notebook has always used, so existing calls without arguments
            keep working.

    Returns:
        The parsed JSON document, exactly as returned by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(ddc_basic_path, "r", encoding="utf-8") as f:
        return json.load(f)


def get_german_category(ddc_number, ddc_basic):
    """Resolve a DDC number to its main category and sub-category labels.

    The first character of ``ddc_number`` selects the main category. The
    first three characters, left-padded with zeros and rounded down to the
    nearest ten, select the sub-category.

    Args:
        ddc_number: DDC notation as a string, e.g. ``"516"``.
        ddc_basic: Mapping from a main-category character to a dict with a
            ``"name"`` entry and a ``"sub"`` mapping. Keys of ``"sub"`` are
            either a three-digit string (``"510"``) or an inclusive range
            (``"010-090"``); each value is a dict with a ``"name"`` entry.

    Returns:
        A ``(main_category, main_name, sub_category_label)`` tuple. When the
        main category is unknown, or ``ddc_number`` is empty or not numeric,
        the fixed triple ``("0", "Kategorie nicht gefunden",
        "000 Unterkategorie nicht gefunden")`` is returned instead of raising.
    """
    not_found = ("0", "Kategorie nicht gefunden", "000 Unterkategorie nicht gefunden")

    # An empty value has no first character to look up.
    if not ddc_number:
        return not_found

    main_category = ddc_number[0]
    # NOTE(review): for inputs shorter than three characters the main category
    # comes from the unpadded first character while the sub-category is
    # computed from the zero-padded value ("5" -> main "5", sub "000").
    # Kept as in the original; confirm whether this is intended.
    try:
        # Ensure 3 digits (pad with zeros if needed), then round down to the
        # nearest ten.
        sub_value = int(ddc_number[:3].rjust(3, "0")) // 10 * 10
    except ValueError:
        # Non-numeric notations (letters, a decimal point within the first
        # three characters) cannot be mapped to a numeric sub-category.
        return not_found
    rounded_sub_category = f"{sub_value:03d}"

    if main_category not in ddc_basic:
        return not_found

    main_name = ddc_basic[main_category]["name"]
    for key, value in ddc_basic[main_category]["sub"].items():
        if "-" in key:
            # Range key: match when the rounded value lies inside it.
            start, end = map(int, key.split("-"))
            if start <= sub_value <= end:
                return main_category, main_name, f"{key} {value['name']}"
        elif key == rounded_sub_category:
            return main_category, main_name, f"{key} {value['name']}"

    return (
        main_category,
        main_name,
        f"{rounded_sub_category} Unterkategorie nicht gefunden",
    )


# Connect to the SQLite database
conn = sqlite3.connect("../downloader/dnb_records.db")
try:
    cursor = conn.cursor()

    # Execute SQL query to get all ddc numbers
    query = "SELECT ddc FROM dnb_records WHERE ddc IS NOT NULL AND ddc != ''"
    cursor.execute(query)

    # Fetch all results
    results = cursor.fetchall()
finally:
    # All rows are in memory at this point, so the connection can be released
    # right away; the finally clause also covers a failing query.
    conn.close()

# Load DDC basic data
ddc_basic = load_ddc_basic()

# Group results: "<main number> <main name>" -> sub-category label -> count
grouped_results = defaultdict(lambda: defaultdict(int))

for row in results:
    ddc_number = row[0][:3]  # Take only first 3 digits
    main_category, main_name, sub_category = get_german_category(ddc_number, ddc_basic)
    grouped_results[f"{main_category} {main_name}"][sub_category] += 1

# Print the grouped results, main categories ordered by their leading number
print("Grouped results:")
for main_category, sub_categories in sorted(
    grouped_results.items(), key=lambda x: x[0].split()[0]
):
    total_count = sum(sub_categories.values())
    print(f"\n{main_category} ({total_count}x Exemplare)")

    # Sub-categories whose first token is not purely numeric sort last ("999")
    for sub_category, count in sorted(
        sub_categories.items(),
        key=lambda x: x[0].split()[0] if x[0].split()[0].isdigit() else "999",
    ):
        print(f"    {sub_category} ({count} Exemplare)")

In [None]:
import plotly.express as px

# Parallel lists describing the treemap: one entry per sector.
ids = []
names = []
parents = []
values = []

total_exemplare = 0

for category, subcategories in grouped_results.items():
    category_name = category.split(" ", 1)[1]  # Drop the leading number
    category_total = sum(subcategories.values())
    total_exemplare += category_total

    # The full "<number> <name>" key is unique per main category, so it
    # serves as the sector id; the label shown is the name without number.
    ids.append(category)
    names.append(category_name)
    parents.append("")
    values.append(category_total)

    for subcategory, count in subcategories.items():
        # Prefix with the parent id: the same label (for example a
        # "not found" bucket) may occur under more than one main category,
        # and sector ids must be unique within the chart.
        ids.append(f"{category}/{subcategory}")
        names.append(subcategory)
        parents.append(category)  # parents refer to ids once ids are given
        values.append(count)

fig = px.treemap(
    ids=ids,
    names=names,
    parents=parents,
    values=values,
    title=f"DDC Kategorien und Unterkategorien (Gesamt: {total_exemplare:,} Exemplare mit DDC)",
    branchvalues="total",
)

fig.update_traces(root_color="lightgrey")
fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
fig.show()

# export to html
# fig.write_html("treemap.html")
