In [None]:
import psycopg2
import json
from dotenv import load_dotenv
import os

# Populate the process environment from a local .env file; connect_to_db()
# reads its DB_* settings from there via os.getenv.
load_dotenv()

# Read the DDC lookup table (JSON object keyed by DDC code).
with open("../data/ddc/ddc-basic.json", "r", encoding="utf-8") as f:
    ddc_data = json.load(f)

# Keep only the entries whose key is a single digit, mapping that digit
# (as a string) to the entry's "name".
first_digit_ddc = {
    code: entry["name"]
    for code, entry in ddc_data.items()
    if code.isdigit() and len(code) == 1
}

# First-digit DDC categories whose counts are combined into one summary line.
ddc_categories_to_sum = [0, 3, 5, 6, 7]


def connect_to_db():
    """Open a new database connection configured from environment variables.

    The settings are read with ``os.getenv`` from ``DB_HOST``, ``DB_PORT``,
    ``DB_NAME``, ``DB_USER`` and ``DB_PASSWORD``; a variable that is not set
    is passed to the driver as ``None``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # Maps each psycopg2.connect keyword to the environment variable feeding it.
    env_var_for_option = {
        "host": "DB_HOST",
        "port": "DB_PORT",
        "database": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASSWORD",
    }
    settings = {option: os.getenv(env_var) for option, env_var in env_var_for_option.items()}
    return psycopg2.connect(**settings)


def get_abstract_counts(cur, page_limit=None):
    """Count records that have an abstract against records with a known abstract count.

    Args:
        cur: Open database cursor whose ``execute`` accepts ``%s`` placeholders
            plus a parameter tuple (as psycopg2 cursors do).
        page_limit: If not ``None``, only rows with ``num_pages <= page_limit``
            are counted. The value is sent as a bound query parameter rather
            than being formatted into the SQL text.

    Returns:
        A tuple ``(count, count_null, percentage)`` where

        * ``count`` is the number of rows with ``abstract_num != 0`` (rows whose
          ``abstract_num`` is NULL never satisfy ``!=`` and are excluded),
        * ``count_null`` is, despite its name, the number of rows whose
          ``abstract_num`` IS NOT NULL,
        * ``percentage`` is ``count / count_null`` as a fraction, or ``0`` when
          ``count_null`` is 0.
    """
    # Build the optional page filter once so both queries are guaranteed to
    # apply exactly the same restriction.
    page_filter = ""
    params = None
    if page_limit is not None:
        page_filter = " AND num_pages <= %s"
        params = (page_limit,)

    cur.execute(f"SELECT COUNT(*) FROM dnb_records WHERE abstract_num != 0{page_filter};", params)
    count = cur.fetchone()[0]

    cur.execute(f"SELECT COUNT(*) FROM dnb_records WHERE abstract_num IS NOT NULL{page_filter};", params)
    count_null = cur.fetchone()[0]

    # Guard against division by zero when no row qualifies.
    percentage = count / count_null if count_null > 0 else 0

    return count, count_null, percentage


def get_ddc_counts(cur, page_limit=None):
    """Return record totals and abstract counts grouped by the first character of ``ddc``.

    Only rows with a non-NULL ``ddc`` are considered.

    Args:
        cur: Open database cursor whose ``execute`` accepts ``%s`` placeholders
            plus a parameter tuple (as psycopg2 cursors do).
        page_limit: If not ``None``, only rows with ``num_pages <= page_limit``
            are included. The value is sent as a bound query parameter rather
            than being formatted into the SQL text.

    Returns:
        The result of ``cur.fetchall()``: one row
        ``(ddc_first_digit, total_count, count_with_abstract)`` per distinct
        first character, ordered by ``ddc_first_digit``. ``count_with_abstract``
        counts the rows of the group with ``abstract_num != 0``.
    """
    # Only fixed SQL text is interpolated below; the user-supplied value
    # travels separately in ``params``.
    where_clause = "WHERE ddc IS NOT NULL"
    params = None
    if page_limit is not None:
        where_clause += " AND num_pages <= %s"
        params = (page_limit,)

    cur.execute(
        f"""
        SELECT
            SUBSTRING(ddc, 1, 1) AS ddc_first_digit,
            COUNT(*) AS total_count,
            SUM(CASE WHEN abstract_num != 0 THEN 1 ELSE 0 END) AS count_with_abstract
        FROM dnb_records
        {where_clause}
        GROUP BY SUBSTRING(ddc, 1, 1)
        ORDER BY ddc_first_digit;
    """,
        params,
    )
    return cur.fetchall()


def get_ddc_sum(cur, ddc_categories, page_limit=None):
    """Return combined record and abstract counts for a set of first-digit DDC categories.

    A row is included when the first character of its ``ddc`` equals the
    string form of one of ``ddc_categories``. The categories are passed to the
    database as a bound array and compared with ``= ANY(...)`` instead of being
    spliced into a regular-expression character class, so an empty list is a
    valid query (it simply matches no rows) and the values cannot alter the
    SQL text.

    Args:
        cur: Open database cursor whose ``execute`` accepts ``%s`` placeholders
            plus a parameter tuple (as psycopg2 cursors do).
        ddc_categories: Iterable of single-character categories (digits as
            ``int`` or ``str``). Each item is converted with ``str``.
        page_limit: If not ``None``, only rows with ``num_pages <= page_limit``
            are included; sent as a bound query parameter.

    Returns:
        The result of ``cur.fetchone()``: a row
        ``(total_count, count_with_abstract)``. When no row matches,
        ``total_count`` is 0 and ``count_with_abstract`` is NULL/``None``
        because SQL ``SUM`` over zero rows yields NULL.
    """
    first_chars = [str(category) for category in ddc_categories]

    where_clause = "WHERE SUBSTRING(ddc, 1, 1) = ANY(%s)"
    params = (first_chars,)
    if page_limit is not None:
        where_clause += " AND num_pages <= %s"
        params += (page_limit,)

    # Only fixed SQL text is interpolated; all values travel in ``params``.
    cur.execute(
        f"""
        SELECT
            COUNT(*) AS total_count,
            SUM(CASE WHEN abstract_num != 0 THEN 1 ELSE 0 END) AS count_with_abstract
        FROM dnb_records
        {where_clause};
    """,
        params,
    )
    return cur.fetchone()


def print_results(count, count_null, percentage, ddc_counts, ddc_sum, page_limit=None):
    """Print the abstract-coverage report to standard output.

    Args:
        count: Number of records with an abstract (abstract count not 0).
        count_null: Number of records reported on the "Overall not null" line.
        percentage: Overall share as a fraction; it is rendered with ``.2%``.
        ddc_counts: Iterable of ``(ddc, total_count, count_with_abstract)``
            rows, one per DDC first digit.
        ddc_sum: Pair ``(total_count, count_with_abstract)`` for the combined
            categories listed in the module-level ``ddc_categories_to_sum``.
        page_limit: If not ``None``, every heading is suffixed with
            `` (num_pages <= <page_limit>)``.
    """
    if page_limit is None:
        limit_text = ""
    else:
        limit_text = f" (num_pages <= {page_limit})"

    # Overall line.
    print(
        f"Overall not null{limit_text}: {count_null} records; With abstract (count not 0): {count} ({percentage:.2%})"
    )

    # One line per DDC first digit; names come from the module-level
    # first_digit_ddc mapping, falling back to "Unknown".
    print(f"\nGrouped by DDC first digit{limit_text}:")
    for digit, group_total, group_with_abstract in ddc_counts:
        if not (group_total > 0):
            print(f"DDC {digit}: No records")
            continue
        group_share = group_with_abstract / group_total
        label = first_digit_ddc.get(digit, "Unknown")
        print(f"DDC {digit}: ({group_with_abstract}/{group_total} with abstract) = {group_share:.2%} => {label}")

    # Combined line for the selected categories.
    sum_total, sum_with_abstract = ddc_sum
    if not (sum_total > 0):
        print(f"\nSum of DDC categories {ddc_categories_to_sum}{limit_text}: No records")
        return
    sum_share = sum_with_abstract / sum_total
    print(f"\nSum of DDC categories {ddc_categories_to_sum}{limit_text}:")
    print(f"({sum_with_abstract}/{sum_total} with abstract) = {sum_share:.2%}")


def main():
    """Connect to the database and print the abstract-coverage report twice.

    The first report covers all records; the second is restricted to records
    with ``num_pages <= 200``. The two reports are separated by a line of 50
    ``=`` characters. The cursor and the connection are closed even if one of
    the queries raises.
    """
    # None means "no page filter"; each further entry adds a restricted report.
    page_limits = (None, 200)

    conn = connect_to_db()
    try:
        # Leaving the cursor context closes the cursor.
        with conn.cursor() as cur:
            for index, page_limit in enumerate(page_limits):
                if index > 0:
                    print("\n" + "=" * 50 + "\n")

                count, count_null, percentage = get_abstract_counts(cur, page_limit=page_limit)
                ddc_counts = get_ddc_counts(cur, page_limit=page_limit)
                ddc_sum = get_ddc_sum(cur, ddc_categories_to_sum, page_limit=page_limit)
                print_results(count, count_null, percentage, ddc_counts, ddc_sum, page_limit=page_limit)
    finally:
        conn.close()


# Run the report only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()