In [None]:
# count records and check if they have a non-null url_dnb_archive

import sqlite3
import pandas as pd

# Connect to the SQLite database
conn = sqlite3.connect("../downloader/dnb_records.db")

# Run both COUNT queries inside try/finally so the connection is released
# even when a query raises (for example, if the table does not exist).
try:
    # Query to get the total number of records
    total_records_query = "SELECT COUNT(*) FROM dnb_records"
    total_records = pd.read_sql_query(total_records_query, conn).iloc[0, 0]

    # Query to get the number of records with non-null url_dnb_archive
    non_null_archive_query = (
        "SELECT COUNT(*) FROM dnb_records WHERE url_dnb_archive IS NOT NULL"
    )
    non_null_archive_records = pd.read_sql_query(
        non_null_archive_query, conn
    ).iloc[0, 0]
finally:
    # Close the connection
    conn.close()

# Print the results
print(f"Total number of records: {total_records}")
print(f"Number of records with non-null url_dnb_archive: {non_null_archive_records}")
# An empty table would make the ratio 0/0, so report that case explicitly
# instead of formatting an undefined percentage.
if total_records > 0:
    print(
        f"Percentage of records with non-null url_dnb_archive: {(non_null_archive_records / total_records) * 100:.2f}%"
    )
else:
    print("Percentage of records with non-null url_dnb_archive: n/a (no records)")

In [None]:
# select 5 random records with non-null url_dnb_archive

import sqlite3
import pandas as pd
import requests
from urllib.parse import unquote

# Connect to the SQLite database
conn = sqlite3.connect("../downloader/dnb_records.db")

# Query to get 5 random non-null url_dnb_archive links
query = """
SELECT url_dnb_archive, title, publication_year, year
FROM dnb_records 
WHERE url_dnb_archive IS NOT NULL 
ORDER BY RANDOM() 
LIMIT 5
"""

# Load the sample into a DataFrame; try/finally guarantees the connection is
# closed even if the query fails.
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Close the connection
    conn.close()


# Function to check URL and get file info
def check_url_and_get_info(url):
    """Probe *url* with an HTTP HEAD request and summarize the result.

    Redirects are followed and the request times out after 10 seconds.

    Args:
        url: The address to check.

    Returns:
        A ``(status, size, filename)`` tuple:

        * ``("Online", size, filename)`` when the final response is 200 and
          its Content-Type mentions ``application/pdf``. ``size`` is the
          Content-Length header as an int (0 when the header is missing or
          not a valid integer); ``filename`` is the URL-decoded last path
          segment of *url*, without any query string or fragment.
        * ``("Online (Not PDF)", 0, "")`` for a 200 response of another type.
        * ``("Offline (Status: <code>)", 0, "")`` for any other status code.
        * ``("Error: <message>", 0, "")`` when ``requests`` raises a
          ``RequestException``.
    """
    # Keep the try body to the one call that can raise RequestException.
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
    except requests.RequestException as e:
        return f"Error: {str(e)}", 0, ""

    if response.status_code != 200:
        return f"Offline (Status: {response.status_code})", 0, ""

    # Media type names are case-insensitive, so compare in lower case.
    content_type = response.headers.get("Content-Type", "")
    if "application/pdf" not in content_type.lower():
        return "Online (Not PDF)", 0, ""

    # A malformed Content-Length must not abort the whole check; treat it
    # the same as a missing header.
    try:
        size = int(response.headers.get("Content-Length", 0))
    except (TypeError, ValueError):
        size = 0

    # Drop the fragment and query string before taking the last path segment,
    # otherwise "file.pdf?x=1" would be reported as the filename.
    path_part = url.split("#", 1)[0].split("?", 1)[0]
    filename = unquote(path_part.split("/")[-1])
    return "Online", size, filename

# Announce the run, then emit one short report per sampled record.
print("Analyzing 5 random records with non-null url_dnb_archive: \n")

for position, record in df.iterrows():
    # Pull the fields out of the row before touching the network.
    link = record["url_dnb_archive"]
    heading = record["title"]
    pub_year = record["publication_year"]
    record_year = record["year"]

    # The filename element of the result is not shown in the report.
    state, byte_count, _ = check_url_and_get_info(link)

    report = [
        f"{position + 1}) Title: {heading}",
        f"Year: {record_year} (publication year: {pub_year})",
        f"URL: {link}",
        f"Status: {state}",
    ]
    # The size line is only shown when a positive size was reported.
    if byte_count > 0:
        report.append(f"File size: {byte_count / (1024 * 1024):.2f} MB")

    print("\n".join(report))
    # Blank line between records.
    print()