In [None]:
import psycopg2
import json
from typing import Dict
import os
from dotenv import load_dotenv

# Load environment variables via python-dotenv at import time, before
# connect_to_db() reads the DB_* settings with os.getenv().
load_dotenv()


def connect_to_db() -> psycopg2.extensions.connection:
    """Open a new database connection configured from environment variables.

    The settings are read with ``os.getenv`` from DB_HOST, DB_PORT, DB_NAME,
    DB_USER and DB_PASSWORD. A variable that is not set is handed to
    ``psycopg2.connect`` as ``None``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # Maps each psycopg2.connect keyword to the environment variable holding it.
    env_var_by_keyword = {
        "host": "DB_HOST",
        "port": "DB_PORT",
        "database": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASSWORD",
    }
    settings = {
        keyword: os.getenv(env_var)
        for keyword, env_var in env_var_by_keyword.items()
    }
    return psycopg2.connect(**settings)


def get_category_counts(cur: psycopg2.extensions.cursor) -> Dict[str, int]:
    """Count the records still available for selection, per DDC category.

    A record counts as available when it has at most 200 pages, a non-NULL
    ``abstract_num`` different from '0', a non-NULL ``ddc``, and no row with
    the same ``id`` in ``dnb_records_subset``. The category is the first
    character of ``ddc``.

    Args:
        cur: Open cursor; the query is executed on it and its result set is
            consumed with ``fetchall()``.

    Returns:
        Mapping from DDC category to the number of available records.
    """
    # Two deliberate choices in the WHERE clause:
    # * ``ddc IS NOT NULL``: a NULL category would become the key "None" via
    #   str(), and the INSERT in extend_subset_table() matches categories
    #   with ``=``, which never matches NULL. Such rows would receive quota
    #   that can never be filled.
    # * ``NOT EXISTS`` instead of ``NOT IN``: ``NOT IN`` yields no rows at all
    #   if the subquery returns a NULL, and ``NOT EXISTS`` is the form the
    #   INSERT in extend_subset_table() uses.
    cur.execute(
        """
        SELECT SUBSTRING(r.ddc FROM 1 FOR 1) AS ddc_category, COUNT(*) AS count
        FROM dnb_records r
        WHERE r.num_pages <= 200
            AND r.abstract_num IS NOT NULL
            AND r.abstract_num != '0'
            AND r.ddc IS NOT NULL
            AND NOT EXISTS (
                SELECT 1 FROM dnb_records_subset s
                WHERE s.id = r.id
            )
        GROUP BY ddc_category
        ORDER BY ddc_category
    """
    )
    return {str(category): count for category, count in cur.fetchall()}


def create_balanced_subset(category_counts: Dict[str, int], total_needed: int) -> Dict[str, int]:
    """Plan a balanced selection by distributing the quota in rounds.

    In every round the outstanding quota is split evenly (integer division,
    at least 1) over the categories that still have entries left. A category
    never gets more than it has available. Rounds repeat until the quota is
    used up or every category is exhausted.

    Args:
        category_counts: Available entries per category; not modified.
        total_needed: Total number of entries to distribute.

    Returns:
        Number of entries to take per category, with a key for every
        category in ``category_counts`` (0 if nothing was assigned).
    """
    allocation = dict.fromkeys(category_counts, 0)
    stock = dict(category_counts)  # working copy, the input stays untouched
    outstanding = total_needed

    while outstanding > 0:
        # Only categories that still have entries take part in this round.
        open_categories = [key for key in stock if stock[key] > 0]
        if not open_categories:
            break

        # Even share for this round; at least 1 so small remainders still move.
        share = max(1, outstanding // len(open_categories))

        for key in open_categories:
            if outstanding <= 0:
                break
            # Limited by the category's stock, the round share and the quota left.
            take = min(stock[key], share, outstanding)
            allocation[key] += take
            stock[key] -= take
            outstanding -= take

    return allocation


def extend_subset_table(cur: psycopg2.extensions.cursor, additional_entries: int) -> None:
    """Add further records to the existing ``dnb_records_subset`` table.

    Steps, in order:
      1. Print the current number of subset rows per DDC category.
      2. Compute the per-category plan with ``get_category_counts`` and
         ``create_balanced_subset`` and print it.
      3. Insert randomly ordered eligible records, at most the planned
         number per category.

    This function does not commit; that is left to the caller.

    Args:
        cur: Open cursor on which all statements are executed.
        additional_entries: Total number of records to add.
    """
    current_distribution_sql = """
        SELECT SUBSTRING(ddc FROM 1 FOR 1) as ddc_category, COUNT(*) as count
        FROM dnb_records_subset
        GROUP BY ddc_category
        ORDER BY ddc_category
    """
    # Ranks the eligible records randomly within each category and keeps the
    # rows whose rank does not exceed the category's planned count. The plan
    # is passed in as one JSON object parameter.
    insert_sql = """
        INSERT INTO dnb_records_subset
        WITH ranked_records AS (
            SELECT *,
                ROW_NUMBER() OVER (
                    PARTITION BY SUBSTRING(ddc FROM 1 FOR 1)
                    ORDER BY RANDOM()
                ) as row_num
            FROM dnb_records r
            WHERE num_pages <= 200 
                AND abstract_num IS NOT NULL 
                AND abstract_num != '0'
                AND NOT EXISTS (
                    SELECT 1 FROM dnb_records_subset s
                    WHERE s.id = r.id
                )
        )
        SELECT * FROM ranked_records r
        WHERE row_num <= (
            SELECT count::integer
            FROM jsonb_each_text(%s) as t(ddc, count)
            WHERE t.ddc = SUBSTRING(r.ddc FROM 1 FOR 1)
        )
    """

    # Show the distribution before anything is changed.
    cur.execute(current_distribution_sql)
    print("Aktuelle Verteilung:")
    for category, amount in cur.fetchall():
        print(f"DDC {category}: {amount} Einträge")

    # Work out how many records each category should receive.
    available = get_category_counts(cur)
    planned = create_balanced_subset(available, additional_entries)

    print("\nGeplante zusätzliche Einträge:")
    for category in sorted(planned):
        print(f"DDC {category}: +{planned[category]} Einträge")

    # Insert the additional records according to the plan.
    plan_json = json.dumps(planned)
    cur.execute(insert_sql, (plan_json,))


def main(additional_entries: int = 80) -> None:
    """Extend the subset table, print the resulting totals, and commit.

    The commit happens only after the insert and both verification queries
    have run without raising. If anything raises, no commit is issued. The
    cursor and the connection are closed in every case.

    Args:
        additional_entries: Number of records to add to
            ``dnb_records_subset``. Defaults to 80, the value that used to
            be hard-coded here.
    """
    conn = connect_to_db()
    try:
        # Created inside the try so the connection is closed even if cursor
        # creation itself fails.
        cur = conn.cursor()
        try:
            print("Erweitere Subset-Tabelle...")
            extend_subset_table(cur, additional_entries)

            # Check the size of the extended subset.
            cur.execute(
                """
            SELECT COUNT(*) FROM dnb_records_subset
        """
            )
            total_count = cur.fetchone()[0]
            print(f"\nNeue Gesamtanzahl: {total_count}")

            # Check the new per-category distribution.
            cur.execute(
                """
            SELECT SUBSTRING(ddc FROM 1 FOR 1) as ddc_category, COUNT(*) as count
            FROM dnb_records_subset
            GROUP BY ddc_category
            ORDER BY ddc_category
        """
            )
            print("\nNeue Verteilung:")
            for row in cur.fetchall():
                print(f"DDC {row[0]}: {row[1]} Einträge")

            conn.commit()
            print("\nErweiterung erfolgreich abgeschlossen!")
        finally:
            cur.close()
    finally:
        conn.close()


# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()