# Specialization Dataset Creation (Record-Based Approach)

This notebook creates specialized datasets using the **record-based approach** from Main.py, RecordBasedFiltering.py, and FilteringHandler.py.

**Note:** May take significant space and time depending on parameters and datasets - take care if running locally.

## How It Works

1. Loads generalized data (from `data/<dataset>/generalization/<percentages>/`)
2. For each record, generates variants by expanding generalized values
3. Applies filtering (random, imputation, knn, or none)
4. Saves filtered specialization data

## Filtering Modes

- **`random`**: Randomly select n_duplicates variants per record
- **`imputation`**: Select best n_duplicates using profile-based scoring
- **`knn`**: Select best n_duplicates using KNN similarity (not used due to similar results at higher compute overhead)
- **`None`**: No filtering (n_duplicates=0 keeps only unique records, None keeps all)

## Realistic Mode

- **`True`**: Only use values observed in original/generalized data (realistic)
- **`False`**: Use all possible hierarchy values (unrealistic but exhaustive)

## Output

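Saves to: `data/<dataset>/specialization/<percentages>/specialization_<filtering_mode>_n<n_duplicates>.csv`

When no filtering mode is set, the file name does not follow this pattern (e.g. `specialization_unfiltered.csv`).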

## 1. Import Required Libraries

In [None]:
from src.RecordBasedSpecialization import RecordBasedSpecialization
from src.DatasetManager import DatasetManager
from src.Vorverarbeitung import extract_observed_values
import pandas as pd
import os

import time

# Confirmation that every import above resolved. The check mark is written as
# an escape so the file no longer depends on how its bytes are decoded (the
# literal had been mangled into mojibake by a UTF-8/cp1252 round-trip).
print("\u2713 Imports successful")

## 2. Configure Parameters

Modify these parameters to create different specialization datasets:

In [None]:
# ========== CONFIGURATION ==========


def describe_n_duplicates(value):
    """Return the display text for an ``n_duplicates`` setting.

    ``None`` means no limit, ``0`` means only unique records are kept, and any
    other value is shown as-is. A plain ``value or '...'`` would report ``0``
    as "unlimited" because ``0`` is falsy.
    """
    if value is None:
        return 'unlimited (no filtering)'
    if value == 0:
        return '0 (unique records only)'
    return str(value)


# Dataset to process
dataset = 'german'  # Options: 'adult', 'german', 'diabetes', 'employment'

# Percentages for specialization (format: 'X-Y-Z' where X+Y+Z=100)
# Must match a folder that exists in data/<dataset>/generalization/
percentages = '66-17-17'

# Data directory
data_dir = 'data'

# Number of variants per record (0 = only unique, None = all variants)
n_duplicates = 2

# Filtering mode
filtering_mode = 'imputation'  # Options: 'random', 'imputation', 'knn', None

# Normalise the string spelling "none" (any casing) to a real None.
# NOTE(review): this also resets n_duplicates to None, so the combination
# filtering_mode='none' + n_duplicates=0 ends up as "keep all" rather than
# "unique only" - confirm that this is intended.
if filtering_mode is not None and filtering_mode.lower() == 'none':
    filtering_mode = None
    n_duplicates = None

# n_duplicates == 0 keeps only unique records, so no filtering mode applies.
if n_duplicates == 0:
    filtering_mode = None

# Realistic mode (uses observed values only)
# True = Only create variants with values observed in generalized data (REALISTIC)
# False = Use all possible hierarchy values (UNREALISTIC but exhaustive)

limit_to_observed_values = True

# Random seed for reproducibility
seed = 42

# Echo the effective settings (after the normalisation above).
print("Configuration:")
print(f"  Dataset: {dataset}")
print(f"  Percentages: {percentages}")
print(f"  filtering_mode: {filtering_mode or 'none'}")
print(f"  n_duplicates: {describe_n_duplicates(n_duplicates)}")
print(f"  Realistic mode: {limit_to_observed_values}")
print(f"  Seed: {seed}")

## 3. Create Specialization Dataset

This cell runs the specialization process. Depending on the dataset size and parameters, this may take several minutes.

In [None]:
# Run the record-based specialization for the configured dataset and keep the
# combined train+test result in `result_df` for the cells below.

# Display text for n_duplicates. 0 is a valid setting (unique records only),
# so it must not fall through to the "unlimited" text as a plain `or` would.
if n_duplicates is None:
    n_duplicates_label = 'unlimited (no filtering)'
elif n_duplicates == 0:
    n_duplicates_label = '0 (unique records only)'
else:
    n_duplicates_label = str(n_duplicates)

print("="*80)
print("RECORD-BASED SPECIALIZATION")
print("="*80)
print(f"Dataset: {dataset}")
print(f"Percentages: {percentages}")
print(f"n_duplicates: {n_duplicates_label}")
print(f"filtering_mode: {filtering_mode or 'none'}")
print(f"Realistic mode: {limit_to_observed_values}")
print("="*80)

start_time = time.time()

try:
    # Load generalized data
    train_path = os.path.join(data_dir, dataset, 'generalization', percentages, f'{dataset}_train.csv')
    test_path = os.path.join(data_dir, dataset, 'generalization', percentages, f'{dataset}_test.csv')

    print("\nLoading generalized data...")
    print(f"  Train: {train_path}")
    print(f"  Test: {test_path}")

    data_train_gen = pd.read_csv(train_path)
    data_test_gen = pd.read_csv(test_path)

    print(f"  Train rows: {len(data_train_gen):,}")
    print(f"  Test rows: {len(data_test_gen):,}")

    # Get dataset configuration
    spalten_dict, spalten_list = DatasetManager.get_spalten_classes(dataset)
    numerical_columns = DatasetManager.get_numerical_columns(dataset)
    record_id_col = DatasetManager.get_record_id_column(dataset)
    label_col = DatasetManager.get_label_column(dataset)

    # Extract observed values if in realistic mode. They are taken from the
    # training split only; the same processor is then used for both splits.
    observed_values_dict = {}
    if limit_to_observed_values:
        print("\nExtracting observed values from generalized data...")
        observed_values_dict = extract_observed_values(dataset, data_train_gen, data_dir)

    # Create record-based processor
    print("\nInitializing RecordBasedSpecialization...")
    rbs = RecordBasedSpecialization(
        dataset_name=dataset,
        spalten_list=spalten_list,
        numerical_columns=numerical_columns,
        record_id_col=record_id_col,
        label_col=label_col,
        observed_values_dict=observed_values_dict,
        limit_to_observed_values=limit_to_observed_values,
        seed=seed
    )

    # Process data with single configuration. Each split is passed as its own
    # reference data.
    print("\nProcessing with record-based method...")
    print("  This may take several minutes depending on dataset size...")

    result_train = rbs.process_data(
        df=data_train_gen,
        n_duplicates=n_duplicates,
        filtering_mode=filtering_mode,
        original_reference_data=data_train_gen
    )

    result_test = rbs.process_data(
        df=data_test_gen,
        n_duplicates=n_duplicates,
        filtering_mode=filtering_mode,
        original_reference_data=data_test_gen
    )

    # Convert from Dask to Pandas
    print("\nConverting results to Pandas DataFrames...")
    result_train = result_train.compute()
    result_test = result_test.compute()

    # Combine train and test
    result_df = pd.concat([result_train, result_test], ignore_index=True)

    elapsed = time.time() - start_time

    # Computed once and reused for both the report line and the average.
    unique_record_count = result_df[record_id_col].nunique()

    print("\n" + "="*80)
    print("PROCESSING COMPLETE")
    print("="*80)
    print(f"Total rows: {len(result_df):,}")
    print(f"  Train: {len(result_train):,}")
    print(f"  Test: {len(result_test):,}")
    print(f"Unique record_ids: {unique_record_count}")
    # The average is only reported when a variant limit is set; the second
    # condition avoids a ZeroDivisionError on an empty result.
    if n_duplicates and unique_record_count:
        avg_variants = len(result_df) / unique_record_count
        print(f"Avg variants per record: {avg_variants:.2f}")
    print(f"Processing time: {elapsed/60:.2f} minutes ({elapsed:.1f} seconds)")

    print("\n" + "="*80)
    print("SUCCESS!")
    print("="*80)

except Exception as e:
    # Top-level boundary of the cell: report the failure with a full
    # traceback, then re-raise so the notebook run stops here.
    print("\n" + "="*80)
    print("ERROR!")
    print("="*80)
    print(f"Failed to process specialization: {e}")
    import traceback
    traceback.print_exc()
    raise

## 4. Preview Results

Let's inspect the generated dataset. At this point it exists only in memory as `result_df`; it is written to disk in section 5.


In [None]:
# Quick look at the combined result that is held in memory as `result_df`.
shape_of_result = result_df.shape
column_names = list(result_df.columns)

print(f"Dataset shape: {shape_of_result}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")

# Kept as the final expression so the notebook renders the table.
result_df.head(10)

In [None]:
# Distribution of the number of rows per record id in the result.
# Nothing is printed when the result has no record-id column.
has_record_ids = record_id_col in result_df.columns
if has_record_ids:
    rows_per_record = result_df.groupby(record_id_col).size()

    # Summary statistics (count, mean, quartiles, ...) of rows per record.
    print("Variants per record statistics:")
    summary = rows_per_record.describe()
    print(summary)

    # How many records ended up with 1, 2, 3, ... rows.
    print("\nValue counts:")
    count_histogram = rows_per_record.value_counts().sort_index()
    print(count_histogram)

## 5. Save Results

Save the specialized dataset to disk:

In [None]:
# Write `result_df` to data/<dataset>/specialization/<percentages>/ under a
# file name that encodes the filtering configuration.

# Create output directory
output_dir = os.path.join(data_dir, dataset, 'specialization', percentages)
os.makedirs(output_dir, exist_ok=True)

# Create descriptive filename with parameters
if filtering_mode is None:
    # n_duplicates == 0 is the "unique records only" result. It gets its own
    # name (the same one the batch cell uses) so it neither masquerades as
    # nor overwrites a genuine unfiltered export.
    if n_duplicates == 0:
        filename = 'specialization_unique.csv'
    else:
        filename = 'specialization_unfiltered.csv'
else:
    n_dup_str = n_duplicates if n_duplicates is not None else 'all'
    filename = f'specialization_{filtering_mode}_n{n_dup_str}.csv'

output_path = os.path.join(output_dir, filename)

print(f"Saving to: {output_path}")
result_df.to_csv(output_path, index=False)

# Size in megabytes (10^6 bytes) of the file just written.
file_size_mb = os.path.getsize(output_path) / 1e6

# Report success first, then where the file went and how large it is.
print("\n" + "="*80)
print("SUCCESS!")
print("="*80)
print("\nFiltered specialization saved to:")
print(f"  {output_path}")
print(f"File size: {file_size_mb:.1f} MB")

print("\nYou may use this file instead of running specialization + filtering.")
print("Actual specialization workflow still creates it in memory and does not read from or write to disk.")

## Optional: Batch Processing

Use this cell to process multiple configurations in one run. Be cautious, especially on local setups: every additional configuration adds to the runtime and to the disk space used.

In [None]:
# Batch example: several filtering configurations handled in a single run.
# Each entry is a (n_duplicates, filtering_mode) tuple.
batch_configs = [
    (0, None),          # unique records only
    (2, "random"),      # 2 randomly chosen variants per record
    (2, "imputation"),  # 2 best variants according to imputation scoring
    # Append further (n_duplicates, mode) tuples here as needed
]

# NOTE: the template below uses spalten_list, numerical_columns, record_id_col
# and label_col, which are assigned in section 3 - run that cell first.
#
# Uncomment to run batch processing:
# print(f"\nBatch processing {len(batch_configs)} configurations...")
# 
# # Load generalized data once
# train_path = os.path.join(data_dir, dataset, 'generalization', percentages, f'{dataset}_train.csv')
# test_path = os.path.join(data_dir, dataset, 'generalization', percentages, f'{dataset}_test.csv')
# data_train_gen = pd.read_csv(train_path)
# data_test_gen = pd.read_csv(test_path)
# 
# # Create processor once
# observed_values_dict = extract_observed_values(dataset, data_train_gen, data_dir) if limit_to_observed_values else {}
# rbs = RecordBasedSpecialization(
#     dataset_name=dataset,
#     spalten_list=spalten_list,
#     numerical_columns=numerical_columns,
#     record_id_col=record_id_col,
#     label_col=label_col,
#     observed_values_dict=observed_values_dict,
#     limit_to_observed_values=limit_to_observed_values,
#     seed=seed
# )
# 
# # Process all configs in one batch (optimized - generates once per mode)
# print("Processing train data...")
# train_results = rbs.process_data_batch(data_train_gen, batch_configs, data_train_gen)
# print("Processing test data...")
# test_results = rbs.process_data_batch(data_test_gen, batch_configs, data_test_gen)
# 
# # Save each configuration
# for (n_dup, mode) in batch_configs:
#     print(f"\nSaving config: n_duplicates={n_dup}, mode={mode}")
#     
#     # Combine train and test
#     combined = pd.concat([
#         train_results[(n_dup, mode)].compute(),
#         test_results[(n_dup, mode)].compute()
#     ], ignore_index=True)
#     
#     # Create filename
#     output_dir = os.path.join(data_dir, dataset, 'specialization', percentages)
#     os.makedirs(output_dir, exist_ok=True)
#     
#     if mode is None:
#         filename = f'specialization_unique.csv' if n_dup == 0 else 'specialization_unfiltered.csv'
#     else:
#         filename = f'specialization_{mode}_n{n_dup}.csv'
#     
#     output_path = os.path.join(output_dir, filename)
#     combined.to_csv(output_path, index=False)
#     
#     file_size_mb = os.path.getsize(output_path) / 1e6
#     print(f"  Saved: {output_path} ({file_size_mb:.1f} MB, {len(combined):,} rows)")

# Status line so that running the cell as-is gives visible feedback.
print("Batch processing cell ready (currently commented out)")