# In [None]:
import os
import psycopg2
from dotenv import load_dotenv
import tiktoken

# Load environment variables
load_dotenv()

def connect_to_db() -> psycopg2.extensions.connection:
    """Open a database connection configured from environment variables.

    The connection parameters are read with ``os.getenv`` from DB_HOST,
    DB_PORT, DB_NAME, DB_USER and DB_PASSWORD; a variable that is not set
    is passed on as ``None``.

    Returns:
        The connection object returned by ``psycopg2.connect``.
    """
    # Maps each psycopg2.connect keyword to the environment variable
    # that supplies its value.
    env_var_for_param = {
        "host": "DB_HOST",
        "port": "DB_PORT",
        "database": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASSWORD",
    }
    connection_params = {
        param: os.getenv(env_var)
        for param, env_var in env_var_for_param.items()
    }
    return psycopg2.connect(**connection_params)

# Initialize the tokenizer (cl100k_base is the encoding used for GPT-4).
tokenizer = tiktoken.get_encoding("cl100k_base")

# Directory that holds the files named in the `converted_file` column.
directory_path = "../python/converter/temp_files/uploaded"

# Number of pending updates after which the transaction is committed.
COMMIT_BATCH_SIZE = 100


def _count_file_tokens(file_path):
    """Return the number of tokens in the text file at *file_path*.

    The file is decoded as UTF-8 explicitly so the result does not depend
    on the platform's default locale encoding.
    NOTE(review): assumes the converted files are UTF-8 encoded - confirm.

    Raises:
        FileNotFoundError: If the file does not exist.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # By default tiktoken raises ValueError when the text contains the
    # literal spelling of a special token (e.g. "<|endoftext|>").  For a
    # pure count over arbitrary documents, treat such text as ordinary text.
    return len(tokenizer.encode(content, disallowed_special=()))


# Open the database connection.
conn = connect_to_db()
try:
    with conn.cursor() as cur:
        # Create the token-count column if it does not exist yet.
        cur.execute("""
            ALTER TABLE dnb_records_subset
            ADD COLUMN IF NOT EXISTS token_count INTEGER;
        """)
        conn.commit()

        # Fetch every record that has a converted file.
        cur.execute(
            "SELECT id, converted_file FROM dnb_records_subset "
            "WHERE converted_file IS NOT NULL"
        )
        files = cur.fetchall()
        total_files = len(files)

        pending_updates = 0  # updates executed but not yet committed
        last_reported_percent = 0

        for index, (file_id, filename) in enumerate(files, 1):
            file_path = os.path.join(directory_path, filename)

            # Step 1: read and tokenize.  Failures here do not touch the
            # database, so the record is simply skipped (best effort).
            try:
                num_tokens = _count_file_tokens(file_path)
            except FileNotFoundError:
                print(f"File not found: {file_path}")
                continue
            except Exception as e:
                print(f"Error processing {file_path}: {str(e)}")
                continue

            # Step 2: store the count.
            try:
                cur.execute("""
                    UPDATE dnb_records_subset
                    SET token_count = %s
                    WHERE id = %s
                """, (num_tokens, file_id))
            except psycopg2.Error as e:
                # A failed statement leaves the transaction aborted: every
                # later statement would fail until a rollback is issued.
                conn.rollback()
                pending_updates = 0
                print(
                    f"Database error for record {file_id}: {e} "
                    "(uncommitted updates since the last commit were rolled back)"
                )
                continue

            # Commit in batches, counted by successful updates so that a
            # failing file can never cause a commit point to be skipped.
            pending_updates += 1
            if pending_updates >= COMMIT_BATCH_SIZE:
                conn.commit()
                pending_updates = 0

            # Report progress whenever a new whole percent is reached.
            # Integer arithmetic avoids the float rounding that made
            # `progress % 1 == 0` silently miss percent steps.
            percent = index * 100 // total_files
            if percent > last_reported_percent:
                last_reported_percent = percent
                print(
                    f"Progress: {percent}% ({index}/{total_files}) - "
                    f"Current file: {file_id} with {num_tokens} tokens"
                )

        # Final commit for the last, partially filled batch.
        conn.commit()

        # Print statistics.  No parameters are passed to execute(), so the
        # '%' in the LIKE pattern is sent to the server unchanged.
        cur.execute("""
            SELECT
                COUNT(*) as total_files,
                AVG(token_count) as avg_tokens,
                MIN(token_count) as min_tokens,
                MAX(token_count) as max_tokens,
                COUNT(*) FILTER (WHERE converted_file LIKE '%.mmd') as mmd_files
            FROM dnb_records_subset
            WHERE token_count IS NOT NULL
        """)
        stats = cur.fetchone()
        print("\nStatistics:")
        print(f"Total files processed: {stats[0]}")
        # AVG() is NULL when no row has a token count; formatting None
        # with ':.0f' would raise TypeError.
        if stats[1] is None:
            print("Average tokens per file: n/a")
        else:
            print(f"Average tokens per file: {stats[1]:.0f}")
        print(f"Min tokens: {stats[2]}")
        print(f"Max tokens: {stats[3]}")
        print(f"Files with .mmd extension: {stats[4]}")
finally:
    # Cleanup: the cursor is closed by the `with` block above; always
    # close the connection, even when an error interrupts the run.
    conn.close()