In [7]:
import pandas as pd

def merge_csv_files(file1, file2, output_file):
    """
    Merge two CSV files, keeping all columns from both files.
    """
    # Read the CSV files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Merge the dataframes, keeping all columns
    merged_df = pd.concat([df1, df2], ignore_index=True, sort=False)  # Keep all columns, avoid sorting

    # Save the merged dataframe as a CSV file
    merged_df.to_csv(output_file, index=False)

    print(f"✅ Merging completed. '{output_file}' saved successfully!")

# Merge dataset metadata
merge_csv_files('../01_opendata.swiss/opendata_datasets_metadata.csv', 
                '../02_geocat.ch/geocat_dataset_metadata.csv', 
                'merged_dataset_metadata.csv')

# Merge distribution metadata
merge_csv_files('../01_opendata.swiss/opendata_distribution_metadata.csv', 
                '../02_geocat.ch/geocat_distribution_metadata.csv', 
                'merged_distribution_metadata.csv')

# Merge contact metadata
merge_csv_files('../01_opendata.swiss/opendata_contact_metadata.csv', 
                '../02_geocat.ch/geocat_contact_metadata.csv', 
                'merged_contact_metadata.csv')


✅ Merging completed. 'merged_dataset_metadata.csv' saved successfully!


  df1 = pd.read_csv(file1)


✅ Merging completed. 'merged_distribution_metadata.csv' saved successfully!
✅ Merging completed. 'merged_contact_metadata.csv' saved successfully!


In [3]:
import pandas as pd
from rapidfuzz import fuzz  # Faster alternative to fuzzywuzzy
import os

# File paths
INPUT_FILE = "merged_dataset_metadata.csv"  # Change to your actual file path
OUTPUT_FILE_CLEANED = "cleaned_dataset_metadata.csv"
OUTPUT_FILE_REMOVED = "removed_duplicates.csv"

# Similarity threshold (0-100): Higher means stricter matching
SIMILARITY_THRESHOLD = 90  

def find_and_remove_duplicates(df):
    """
    Identify and remove similar records while keeping 'opendata.swiss' over 'geocat.ch'.
    Returns both cleaned and removed data.
    """
    df_sorted = df.sort_values(by=["origin"], ascending=True)  # Ensures opendata.swiss comes first
    to_remove = set()
    removed_data = []

    for i, row1 in df_sorted.iterrows():
        if i in to_remove:  # Skip already marked duplicates
            continue
        for j, row2 in df_sorted.iterrows():
            if i >= j or j in to_remove:  # Avoid redundant checks
                continue

            # Check same dataset identifier (ignoring minor differences like suffixes)
            id_sim = fuzz.ratio(str(row1["dataset_identifier"]), str(row2["dataset_identifier"]))

            # Check title similarity
            title_sim = fuzz.ratio(str(row1["dataset_title_DE"]), str(row2["dataset_title_DE"]))

            # If both ID and title are highly similar, mark one for removal
            if id_sim > SIMILARITY_THRESHOLD and title_sim > SIMILARITY_THRESHOLD:
                if row1["origin"] == "geocat.ch":
                    to_remove.add(i)
                    removed_data.append(row1)  # Store removed data
                else:
                    to_remove.add(j)
                    removed_data.append(row2)

    # Create DataFrame of removed duplicates
    df_removed = pd.DataFrame(removed_data)

    # Remove marked duplicates from the original dataset
    df_cleaned = df_sorted.drop(index=to_remove)

    return df_cleaned, df_removed

if __name__ == "__main__":
    # Ensure input file exists
    if os.path.exists(INPUT_FILE):
        df = pd.read_csv(INPUT_FILE)

        # Run duplicate detection and removal
        df_cleaned, df_removed = find_and_remove_duplicates(df)

        # Save cleaned dataset (without duplicates)
        df_cleaned.to_csv(OUTPUT_FILE_CLEANED, index=False)

        # Save removed duplicates
        df_removed.to_csv(OUTPUT_FILE_REMOVED, index=False)

        print(f"✅ Cleaned dataset saved as: {OUTPUT_FILE_CLEANED}")
        print(f"✅ Removed duplicates saved as: {OUTPUT_FILE_REMOVED}")
    else:
        print(f"❌ Error: Input file '{INPUT_FILE}' not found.")


  df = pd.read_csv(INPUT_FILE)


KeyboardInterrupt: 