In [2]:
import os
import glob
import pandas as pd

In [19]:
def process_directory(path):
    """
    Combine the CSV files of one directory column-wise and report duplicates.

    Every ``*.csv`` file directly inside ``path`` contributes one column,
    named after the file (without its extension) and filled with the first
    column of that file. Files that cannot be read are reported and skipped.
    Columns of different lengths are aligned on their row index by
    ``pd.DataFrame``, so shorter columns are padded with NaN.

    Args:
        path (str): Path to the directory containing CSV files

    Returns:
        tuple: (directory_name, original_length, cleaned_length, duplicate_count)
            ``directory_name`` is "<parent>/<leaf>" of ``path`` (just the
            leaf when the path has no parent component). The three counts
            are plain ints and are all 0 when no CSV file was found or none
            could be read.
    """
    # Build a short "<parent>/<leaf>" label for reporting. normpath strips a
    # trailing separator so basename does not come back empty.
    normalized = os.path.normpath(path)
    leaf = os.path.basename(normalized)
    parent = os.path.basename(os.path.dirname(normalized))
    # Join only the non-empty parts so a path without a parent component does
    # not produce a label with a misleading leading "/".
    directory_name = "/".join(part for part in (parent, leaf) if part) or normalized

    # glob returns matches in filesystem order; sort so the column order of
    # the combined frame is reproducible across runs and machines.
    csv_files = sorted(glob.glob(os.path.join(path, "*.csv")))

    if not csv_files:
        print(f"No CSV files found in {path}")
        return directory_name, 0, 0, 0

    # Read each CSV file and keep its first column under the file's base name.
    columns = {}
    for csv_file in csv_files:
        column_name = os.path.splitext(os.path.basename(csv_file))[0]
        try:
            df = pd.read_csv(csv_file)
            # Only the first column is used, whether or not the file has more.
            columns[column_name] = df.iloc[:, 0]
        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Error reading {csv_file}: {e}")

    if not columns:
        print(f"No valid data found in CSV files in {path}")
        return directory_name, 0, 0, 0

    # Create a single dataframe from all columns
    combined_df = pd.DataFrame(columns)

    original_length = len(combined_df)
    print(f"\nDirectory: {directory_name}")
    print(f"Original dataset shape: {combined_df.shape}")
    print(f"Original dataset length: {original_length}")

    # A row counts as a duplicate when an identical row occurred earlier.
    # .sum() yields a NumPy integer; convert so callers get a plain int.
    duplicate_rows = int(combined_df.duplicated().sum())
    print(f"Number of duplicate rows: {duplicate_rows}")

    # Remove duplicates
    cleaned_df = combined_df.drop_duplicates().reset_index(drop=True)
    cleaned_length = len(cleaned_df)
    print(f"Cleaned dataset shape: {cleaned_df.shape}")
    print(f"Cleaned dataset length: {cleaned_length}")
    print(f"Removed {original_length - cleaned_length} duplicate rows")

    return directory_name, original_length, cleaned_length, duplicate_rows

def process_all_directories(base_path, additional_path=""):
    """
    Run process_directory on every subdirectory of base_path and print a summary.

    Args:
        base_path (str): Base directory containing subdirectories with CSV files
        additional_path (str): Relative path appended to each subdirectory
            before it is processed. Defaults to "", in which case the
            subdirectory itself is processed.

    Returns:
        None. Per-directory details and the summary table are printed.
    """
    # os.listdir returns entries in arbitrary order; sort so the per-directory
    # output and the summary table are reproducible between runs.
    directories = sorted(
        entry
        for entry in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, entry))
    )

    if not directories:
        print(f"No subdirectories found in {base_path}")
        return

    # Process each directory; every result is a
    # (directory_name, original_length, cleaned_length, duplicate_count) tuple.
    results = [
        process_directory(os.path.join(base_path, directory, additional_path))
        for directory in directories
    ]

    # Display summary of results
    row_format = "{:<50} {:<15} {:<15} {:<15}"
    print("\n" + "="*100)
    print("SUMMARY OF RESULTS")
    print("="*100)
    print(row_format.format(
        "Directory", "Original Size", "Cleaned Size", "Duplicates"))
    print("-"*100)

    for dir_name, orig_len, clean_len, dup_count in results:
        print(row_format.format(dir_name, orig_len, clean_len, dup_count))


In [31]:
# New counts: duplicate statistics for the diabetes data, looking inside the
# "33-33-34" folder of every subdirectory.
# NOTE(review): absolute, machine-specific location — confirm before re-running elsewhere.
path = os.path.join("/Users/lucas/Downloads", "data", "diabetes")
process_all_directories(path, "33-33-34")

No CSV files found in /Users/lucas/Downloads/data/diabetes/complete_weighted_specialization/33-33-34

Directory: specialization/33-33-34
Original dataset shape: (118057, 21)
Original dataset length: 118057
Number of duplicate rows: 23682
Cleaned dataset shape: (94375, 21)
Cleaned dataset length: 94375
Removed 23682 duplicate rows

Directory: generalization/33-33-34
Original dataset shape: (56553, 2)
Original dataset length: 56553
Number of duplicate rows: 0
Cleaned dataset shape: (56553, 2)
Cleaned dataset length: 56553
Removed 0 duplicate rows

Directory: forced_generalization/33-33-34
Original dataset shape: (70692, 1)
Original dataset length: 70692
Number of duplicate rows: 0
Cleaned dataset shape: (70692, 1)
Cleaned dataset length: 70692
Removed 0 duplicate rows
No CSV files found in /Users/lucas/Downloads/data/diabetes/complete_forced_generalization/33-33-34
No CSV files found in /Users/lucas/Downloads/data/diabetes/complete_extended_weighted_specialization/33-33-34
No CSV files f

In [None]:
# Original counts: duplicate statistics for the "adult" datasets found under
# the current working directory (each subdirectory is processed directly).
path = os.path.join(os.getcwd(), "datasets", "adult")
process_all_directories(path, additional_path="")


Directory: complete_forced_generalization
Original dataset shape: (45222, 1)
Original dataset length: 45222
Number of duplicate rows: 0
Cleaned dataset shape: (45222, 1)
Cleaned dataset length: 45222
Removed 0 duplicate rows

Directory: extended_weighted_specialization
Original dataset shape: (1489716, 13)
Original dataset length: 1489716
Number of duplicate rows: 1282764
Cleaned dataset shape: (206952, 13)
Cleaned dataset length: 206952
Removed 1282764 duplicate rows

Directory: forced_generalization
Original dataset shape: (45222, 1)
Original dataset length: 45222
Number of duplicate rows: 0
Cleaned dataset shape: (45222, 1)
Cleaned dataset length: 45222
Removed 0 duplicate rows

Directory: generalization
Original dataset shape: (36177, 2)
Original dataset length: 36177
Number of duplicate rows: 0
Cleaned dataset shape: (36177, 2)
Cleaned dataset length: 36177
Removed 0 duplicate rows
No CSV files found in /home/sc.uni-leipzig.de/ll95wyqa/projects/user-driven-privacy/datasets/adult/