In [1]:
import pandas as pd
from sqlalchemy import create_engine

def degeneralize_age(age_range):
    age_lower, _ = age_range.split(" - ")
    return int(age_lower)


In [2]:

def partition(data, k):
        if len(data) < 2 * k:
            return [data]
        
        # Choose dimension with largest range
        ranges = {}
        for col in ['age_numeric', 'postal_code']:
            if col == 'age_numeric':
                ranges[col] = data[col].max() - data[col].min()
            else:
                ranges[col] = data[col].nunique()
        
        split_dim = max(ranges, key=ranges.get)
        
        # Split data
        if split_dim == 'age_numeric':
            split_value = data[split_dim].median()
            left = data[data[split_dim] <= split_value]
            right = data[data[split_dim] > split_value]
        else:
            # For postal code, split by most frequent value
            common_val = data[split_dim].mode()[0]
            left = data[data[split_dim] == common_val]
            right = data[data[split_dim] != common_val]
        
        if len(left) >= k and len(right) >= k:
            return partition(left, k) + partition(right, k)
        else:
            return [data]

In [3]:
        
def k_anonymity(df, k=3):

    result_df = df.copy()

    # remove "20-24" age format 
    result_df['age_numeric'] = result_df['age'].apply(degeneralize_age)

    partitions = partition(result_df, k)

    # Generalize each partition
    final_partitions = []
    for part in partitions:
        generalized = part.copy()
        
        # Generalize age range for this specific partition
        age_min = generalized['age_numeric'].min()
        age_max = generalized['age_numeric'].max()
        generalized['age'] = f"{age_min}-{age_max}"
        
        # Generalize postal code for this specific partition
        postal_prefix = generalized['postal_code'].str[:3].iloc[0] + '***'
        generalized['postal_code'] = postal_prefix
        
        final_partitions.append(generalized)
    
    # Combine all partitions
    result = pd.concat(final_partitions, ignore_index=True)
    result = result.drop('age_numeric', axis=1)
    
    return result

In [None]:
# Load your existing de-identified data
engine = create_engine("postgresql://postgresql:password@localhost:5432/pds_proj_1")
med_df = pd.read_sql_table('med_data', engine)


current_groups = med_df.groupby(['age', 'postal_code', 'gender']).size()

In [5]:
# Apply Mondrian k-anonymity
k = 3  

anonymized_df = k_anonymity(med_df, k=k)

In [6]:
# Verify k-anonymity
final_groups = anonymized_df.groupby(['age', 'postal_code', 'gender']).size()
print(f"\nVerification:")
print(f"Total groups: {len(final_groups)}")
print(f"Minimum group size: {final_groups.min()}")
print(f"Satisfies k-anonymity: {final_groups.min() >= k}")

# Show group distribution
group_stats = final_groups.value_counts().sort_index()
print(f"\nGroup size distribution:")
for size, count in group_stats.items():
    print(f"  Size {size}: {count} groups")


Verification:
Total groups: 44
Minimum group size: 1
Satisfies k-anonymity: False

Group size distribution:
  Size 1: 16 groups
  Size 2: 13 groups
  Size 3: 5 groups
  Size 4: 2 groups
  Size 9: 1 groups
  Size 12: 2 groups
  Size 18: 2 groups
  Size 20: 1 groups
  Size 396: 1 groups
  Size 450: 1 groups


In [7]:
# Save to new table
anonymized_df.to_sql('med_data_k_anonymous', engine, if_exists='replace', index=False)

anonymized_df.head()

Unnamed: 0,id,age,gender,postal_code,diagnosis
0,17188ddc-5b56-40c5-9d76-15385f0a1cdc,30-65,Male,606***,T23419A
1,2dd3517c-027f-4b24-9bf0-a7c192494e38,30-65,Female,606***,K4131
2,64d69c34-53a6-4dfe-96dc-188e9879eda1,30-65,Male,606***,S83269S
3,68c39879-9184-44ca-a4ab-6e935dd8ef32,30-65,Male,606***,S23428
4,812240c0-81cf-4785-a6b7-bb0ee406f562,30-65,Female,606***,S93122
