In [None]:
import psycopg2
import csv
from dotenv import load_dotenv
import os
from collections import defaultdict

# Load variables from a .env file into the process environment so that the
# DB_* settings read by connect_to_db() via os.getenv are available.
load_dotenv()

def connect_to_db():
    """Open a new PostgreSQL connection configured from environment variables.

    The host, port, database name, user and password are read from
    DB_HOST, DB_PORT, DB_NAME, DB_USER and DB_PASSWORD respectively using
    os.getenv, so an unset variable is forwarded as None.

    Returns:
        The connection object produced by psycopg2.connect.
    """
    # Map each psycopg2.connect keyword to the environment variable feeding it.
    env_by_param = {
        "host": "DB_HOST",
        "port": "DB_PORT",
        "database": "DB_NAME",
        "user": "DB_USER",
        "password": "DB_PASSWORD",
    }
    settings = {param: os.getenv(var) for param, var in env_by_param.items()}
    return psycopg2.connect(**settings)

# Read the CSV and group its GND entries by document IDN.
gnd_mapping = defaultdict(list)
# newline="" is what the csv module asks for so that quoted fields containing
# line breaks are parsed correctly.
with open(
    "../data/ger_open_access_with_gnd.csv", "r", encoding="utf-8", newline=""
) as f:
    reader = csv.DictReader(f)
    for row in reader:
        gnd_mapping[row['doc_idn']].append({
            'gnd_idn': row['gnd_idn'],
            'gnd_label': row['gnd_label']
        })

# Fetch every IDN stored in the database. The try/finally blocks guarantee the
# cursor and the connection are released even when the query fails.
conn = connect_to_db()
try:
    cur = conn.cursor()
    try:
        cur.execute("SELECT idn FROM dnb_records")
        db_idns = {row[0] for row in cur.fetchall()}
    finally:
        cur.close()
finally:
    conn.close()

# NOTE(review): the CSV keys are strings; if dnb_records.idn is not a text
# column the membership test below can never match -- confirm the column type.

# Number of GND entries per document; reused for the totals and the min/max.
gnd_counts = [len(gnds) for gnds in gnd_mapping.values()]

# Calculate statistics
total_db_records = len(db_idns)
matching_records = sum(1 for idn in db_idns if idn in gnd_mapping)
total_gnd_mappings = sum(gnd_counts)
avg_gnd_per_doc = total_gnd_mappings / len(gnd_mapping) if gnd_mapping else 0
# Guard against an empty dnb_records table, which would otherwise raise
# ZeroDivisionError when computing the percentage.
matching_percentage = (
    (matching_records / total_db_records) * 100 if total_db_records else 0.0
)

print("Statistics:")
print(f"Total records in database: {total_db_records}")
print(f"Records with GND mappings: {matching_records}")
print(f"Percentage of records with GND: {matching_percentage:.2f}%")
print(f"Average GND entries per document: {avg_gnd_per_doc:.2f}")

# Distribution of GND count per document (skipped when the CSV had no rows).
if gnd_counts:
    print("\nGND count distribution:")
    print(f"Min GNDs per document: {min(gnd_counts)}")
    print(f"Max GNDs per document: {max(gnd_counts)}")

In [None]:
def analyze_duplicates(file_path: str) -> None:
    """Report duplicate rows in a CSV file on standard output.

    Two checks are made: rows that are identical in every column, and rows
    that repeat an earlier doc_idn/gnd_idn pair. In both cases the first
    occurrence is not counted, only the later repeats. Nothing is printed
    for a check that finds no duplicates.

    Args:
        file_path: Path of the CSV file to inspect. It must contain the
            columns 'doc_idn' and 'gnd_idn'.

    Raises:
        KeyError: If the 'doc_idn' or 'gnd_idn' column is missing.
    """
    # Imported locally because the file-level imports do not provide pandas;
    # without this the function fails with NameError on `pd`.
    import pandas as pd

    df = pd.read_csv(file_path)

    # Check for exact duplicates (all columns equal to an earlier row).
    duplicates = df[df.duplicated()]
    if not duplicates.empty:
        print(f"Found {len(duplicates)} exact duplicate rows")
        print("\nExample duplicates:")
        print(duplicates.head())

    # Check for duplicate doc_idn/gnd_idn combinations, ignoring other columns.
    dup_combinations = df[df.duplicated(['doc_idn', 'gnd_idn'])]
    if not dup_combinations.empty:
        print(f"\nFound {len(dup_combinations)} duplicate doc_idn/gnd_idn combinations")
        print("\nExample duplicate combinations:")
        print(dup_combinations.head())


# Run the duplicate report on the GND mapping CSV.
analyze_duplicates("../data/ger_open_access_with_gnd.csv")