In [1]:
import hashlib
import math
import os

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Input / output locations (tab-separated files, relative to the notebook).
INPUT_FILE = "./large_dis_format.csv"
OUTPUT_FILE = "./k_dp_anonymized_data.csv"

# Secret salt for user ID hashing (never commit real salts to public repos!).
# Supply the real value through the ANON_SECRET_SALT environment variable so it
# never appears in the notebook or in version control. The literal below is
# only a placeholder fallback that keeps the notebook runnable out of the box.
SECRET_SALT = os.environ.get("ANON_SECRET_SALT", "my_super_secret_salt")

# K-Anonymity parameters
K_LOCATION = 10  # Minimum cluster size for location
K_TIME = 5       # Minimum bin size for time

# Differential Privacy parameters
EPSILON_LOCATION = 1.0  # Privacy budget for location
EPSILON_TIME = 1.0      # Privacy budget for time

# Sensitivity assumptions for Laplace noise.
# Each value is the full width of the attribute's domain:
#   Latitude in [-90,90] -> total range ~180
#   Longitude in [-180,180] -> total range ~360
SENSITIVITY_LAT = 180.0
SENSITIVITY_LON = 360.0

#   Hour in [0,23] -> total range ~24
#   Minute in [0,59] -> total range ~60
SENSITIVITY_HOUR = 24.0
SENSITIVITY_MINUTE = 60.0

In [3]:
def load_data(filepath=INPUT_FILE):
    """
    Read the raw records from a header-less, tab-separated file.

    The four columns are named id, date, latitude and longitude, in that
    order, and the `date` column is converted with `pd.to_datetime`.

    Parameters
    ----------
    filepath : path of the file to read (defaults to INPUT_FILE).

    Returns
    -------
    pandas.DataFrame with columns id, date, latitude, longitude.
    """
    raw = pd.read_csv(
        filepath,
        sep='\t',
        names=['id', 'date', 'latitude', 'longitude'],
    )
    # `assign` on an existing column replaces it in place, so the column
    # order stays id, date, latitude, longitude.
    return raw.assign(date=pd.to_datetime(raw['date']))

def anonymize_id(df, id_col='id', date_col='date', salt=SECRET_SALT):
    """
    Anonymize user ID by hashing (salt + user_id + date's ISO week).
    If you want the same user ID across all weeks, remove the week from the
    hashed token.

    Two columns are added to `df` in place and the same frame is returned:
      - 'week': ISO week number taken from `date_col`
      - 'anonymized_id': first 10 hex characters of the SHA-256 digest of
        "<salt>_<id>_<week>"

    Parameters
    ----------
    df       : DataFrame holding `id_col` and a datetime `date_col`.
    id_col   : name of the column with the raw user identifier.
    date_col : name of the datetime column the ISO week is taken from.
    salt     : secret string mixed into every hash.

    NOTE(review): only the ISO week number enters the hash, not the ISO year,
    so the same user gets the same pseudonym in week N of every year. Confirm
    this is intended before using multi-year data.
    """
    # Extract ISO week
    df['week'] = df[date_col].dt.isocalendar().week

    def _pseudonym(user_id, week):
        token = f"{salt}_{user_id}_{week}"
        return hashlib.sha256(token.encode()).hexdigest()[:10]

    # Walk the two needed columns directly instead of `df.apply(axis=1)`:
    # the row-wise apply materialises a Series per row (slow on large files)
    # and returns a DataFrame rather than a Series when the frame is empty.
    df['anonymized_id'] = [
        _pseudonym(user_id, week)
        for user_id, week in zip(df[id_col], df['week'])
    ]
    return df

In [4]:
def strict_k_clusters_location(data, lat_col='latitude', lon_col='longitude', k_location=10):
    """
    Label every row with a `location_cluster` so that each cluster holds at
    least `k_location` rows whenever the data set is large enough.

    Steps:
      1. Ask K-Means for `len(data) // k_location` clusters on the
         standardised (latitude, longitude) pairs.
      2. Fold every cluster that ended up with fewer than `k_location` rows
         into the most populated cluster.

    With fewer than `2 * k_location` rows only one cluster is possible, so all
    rows receive label 0 without running K-Means. If the frame has fewer than
    `k_location` rows that single cluster is necessarily undersized; this
    function still returns (the previous merge loop spun forever on it).

    Parameters
    ----------
    data       : DataFrame with the coordinate columns; modified in place.
    lat_col    : name of the latitude column.
    lon_col    : name of the longitude column.
    k_location : minimum number of rows wanted per cluster.

    Returns
    -------
    The same DataFrame, with a 'location_cluster' column added.

    NOTE(review): undersized clusters are merged into the *largest* cluster,
    not the geographically closest one. Behaviour kept as is; confirm that
    this is the intended trade-off.
    """
    # Column access first so a missing column still fails loudly for any size.
    coords = data[[lat_col, lon_col]].values

    n_clusters = max(1, len(data) // k_location)
    if n_clusters < 2:
        # Single cluster: nothing to fit and nothing it could be merged with.
        data['location_cluster'] = 0
        return data

    # Initial K-Means on standardised coordinates
    scaled_coords = StandardScaler().fit_transform(coords)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data['location_cluster'] = kmeans.fit_predict(scaled_coords)

    # Merge small clusters. Here len(data) >= n_clusters * k_location, so the
    # average cluster size is >= k_location and the largest cluster always
    # qualifies. It only grows while absorbing others, hence one pass suffices.
    sizes = data['location_cluster'].value_counts()
    undersized = sizes.index[sizes < k_location]
    if len(undersized) > 0:
        target_cid = sizes.idxmax()
        in_small_cluster = data['location_cluster'].isin(undersized)
        data.loc[in_small_cluster, 'location_cluster'] = target_cid

    return data

In [5]:
def dp_laplace_mechanism(value, epsilon, sensitivity):
    """
    Return `value` perturbed by one draw of zero-centred Laplace noise.

    The noise scale is `sensitivity / epsilon`, and the draw comes from
    NumPy's global random state (`np.random.laplace`).

    Parameters
    ----------
    value       : number to perturb.
    epsilon     : privacy budget; smaller values mean more noise.
    sensitivity : sensitivity used to calibrate the noise.
    """
    return value + np.random.laplace(loc=0.0, scale=sensitivity / epsilon)

def dp_on_location_clusters(data,
                            lat_col='latitude', lon_col='longitude',
                            cluster_col='location_cluster',
                            epsilon_loc=EPSILON_LOCATION,
                            sensitivity_lat=SENSITIVITY_LAT,
                            sensitivity_lon=SENSITIVITY_LON,
                            lat_bounds=(-90.0, 90.0),
                            lon_bounds=(-180.0, 180.0)):
    """
    Replace every row's coordinates by a noisy centroid of its cluster.

    For each distinct value in `cluster_col` the mean latitude and longitude
    are computed, Laplace noise is added to each through
    `dp_laplace_mechanism`, and the result is clamped to `lat_bounds` /
    `lon_bounds`. Every row then receives its cluster's pair in the new
    columns 'anonymized_latitude' and 'anonymized_longitude'.

    Parameters
    ----------
    data            : DataFrame with coordinate and cluster columns;
                      modified in place.
    lat_col, lon_col: names of the coordinate columns.
    cluster_col     : name of the cluster-label column.
    epsilon_loc     : epsilon passed to the Laplace mechanism for each of the
                      two coordinates.
    sensitivity_lat : sensitivity used for the latitude noise.
    sensitivity_lon : sensitivity used for the longitude noise.
    lat_bounds      : (low, high) clamp for the noisy latitude, or None to
                      leave it unclamped.
    lon_bounds      : (low, high) clamp for the noisy longitude, or None to
                      leave it unclamped.

    Returns
    -------
    The same DataFrame with the two anonymized coordinate columns added.

    NOTE(review): latitude and longitude are each released with
    `epsilon_loc`, so the two draws together presumably spend 2 * epsilon_loc
    under sequential composition. The sensitivities handed in by this file are
    the full coordinate ranges, while the query is a per-cluster mean;
    calibration is left untouched here. Confirm both points against the
    intended privacy accounting.
    """
    def _clamp(noisy_value, bounds):
        # Clamping only post-processes an already-noised value; it keeps the
        # output a valid coordinate (the time counterpart clamps as well).
        if bounds is None:
            return noisy_value
        low, high = bounds
        return float(np.clip(noisy_value, low, high))

    # One pass over the data instead of one boolean filter per cluster.
    # sort=False keeps clusters in order of first appearance, so noise is
    # drawn in the same sequence as iterating over `.unique()`.
    centroids = data.groupby(cluster_col, sort=False)[[lat_col, lon_col]].mean()

    noisy_lat = {}
    noisy_lon = {}
    for cid, true_lat, true_lon in centroids.itertuples(index=True, name=None):
        dp_lat = dp_laplace_mechanism(true_lat, epsilon_loc, sensitivity_lat)
        dp_lon = dp_laplace_mechanism(true_lon, epsilon_loc, sensitivity_lon)
        noisy_lat[cid] = _clamp(dp_lat, lat_bounds)
        noisy_lon[cid] = _clamp(dp_lon, lon_bounds)

    data['anonymized_latitude'] = data[cluster_col].map(noisy_lat)
    data['anonymized_longitude'] = data[cluster_col].map(noisy_lon)

    return data

In [6]:
def time_binning_k_anonymity(data, date_col='date', k_time=K_TIME):
    """
    Put every row into an hourly `time_bin` and dissolve bins that are too
    small.

    Each timestamp is floored to the hour. Every bin holding fewer than
    `k_time` rows is then merged into the most populated bin. If no bin
    reaches `k_time`, all rows end up in that single most populated bin.

    Parameters
    ----------
    data     : DataFrame with a datetime column `date_col`; modified in place.
    date_col : name of the datetime column.
    k_time   : minimum number of rows wanted per bin.

    Returns
    -------
    The same DataFrame with a 'time_bin' column added.

    NOTE(review): small bins are sent to the *largest* bin, which can be far
    away in time; merging into the closest sufficiently large bin may be what
    was intended. Behaviour kept as is; confirm.
    """
    # Lower-case 'h': the upper-case 'H' alias is deprecated in recent pandas.
    data['time_bin'] = data[date_col].dt.floor('h')

    bin_counts = data['time_bin'].value_counts()
    undersized = bin_counts.index[bin_counts < k_time]
    if len(undersized) > 0:
        # The largest bin only grows while absorbing others, so it remains
        # the merge target throughout; one vectorised assignment is enough.
        target_bin = bin_counts.idxmax()
        data.loc[data['time_bin'].isin(undersized), 'time_bin'] = target_bin
    return data

In [7]:
def dp_on_time_bins(data, date_col='date', bin_col='time_bin',
                    epsilon_time=EPSILON_TIME,
                    sensitivity_hour=SENSITIVITY_HOUR,
                    sensitivity_minute=SENSITIVITY_MINUTE):
    """
    Give every row a noisy time of day shared by all rows of its time bin.

    For each distinct value in `bin_col`, the mean hour and mean minute of the
    bin's timestamps are perturbed through `dp_laplace_mechanism`, rounded to
    the nearest integer and clamped to 0-23 and 0-59 respectively. The new
    column 'anonymized_date' holds each row's original timestamp with hour and
    minute replaced by its bin's pair and seconds/microseconds set to zero.

    Parameters
    ----------
    data               : DataFrame with `date_col` and `bin_col`; modified in
                         place.
    date_col           : name of the datetime column.
    bin_col            : name of the time-bin column.
    epsilon_time       : epsilon passed to the Laplace mechanism for each of
                         hour and minute.
    sensitivity_hour   : sensitivity used for the hour noise.
    sensitivity_minute : sensitivity used for the minute noise.

    Returns
    -------
    The same DataFrame with an 'anonymized_date' column added.
    """
    def _as_clock_field(noisy_value, upper):
        # round + clamp into [0, upper]
        return min(max(int(round(noisy_value)), 0), upper)

    noisy_clock = {}
    for bin_label in data[bin_col].unique():
        stamps = data.loc[data[bin_col] == bin_label, date_col]
        mean_hour = stamps.dt.hour.mean()
        mean_minute = stamps.dt.minute.mean()

        # Hour noise is drawn before minute noise.
        noisy_hour = dp_laplace_mechanism(mean_hour, epsilon_time, sensitivity_hour)
        noisy_minute = dp_laplace_mechanism(mean_minute, epsilon_time, sensitivity_minute)

        noisy_clock[bin_label] = (
            _as_clock_field(noisy_hour, 23),
            _as_clock_field(noisy_minute, 59),
        )

    def _with_noisy_clock(row):
        hour, minute = noisy_clock[row[bin_col]]
        return row[date_col].replace(hour=hour, minute=minute,
                                     second=0, microsecond=0)

    data['anonymized_date'] = data.apply(_with_noisy_clock, axis=1)
    return data

In [8]:
def main():
    """
    Run the whole pipeline: load INPUT_FILE, pseudonymize IDs, cluster and
    perturb locations, bin and perturb times, then write the result to
    OUTPUT_FILE as a header-less, tab-separated file with the columns
    id, date, latitude, longitude.
    """
    # Each helper adds its columns to the frame it receives and returns that
    # frame, so the pipeline is one chain over the return values.

    # 1) Load data
    records = load_data(INPUT_FILE)

    # 2) Anonymize user ID
    records = anonymize_id(records, id_col='id', date_col='date', salt=SECRET_SALT)

    # 3) K-Anonymity for location, then noise on the cluster centroids
    records = strict_k_clusters_location(
        records,
        lat_col='latitude',
        lon_col='longitude',
        k_location=K_LOCATION,
    )
    records = dp_on_location_clusters(
        records,
        lat_col='latitude',
        lon_col='longitude',
        cluster_col='location_cluster',
        epsilon_loc=EPSILON_LOCATION,
        sensitivity_lat=SENSITIVITY_LAT,
        sensitivity_lon=SENSITIVITY_LON,
    )

    # 4) K-Anonymity for time (bins with >= K_TIME rows), then noise per bin
    records = time_binning_k_anonymity(records, date_col='date', k_time=K_TIME)
    records = dp_on_time_bins(
        records,
        date_col='date',
        bin_col='time_bin',
        epsilon_time=EPSILON_TIME,
        sensitivity_hour=SENSITIVITY_HOUR,
        sensitivity_minute=SENSITIVITY_MINUTE,
    )

    # 5) Keep only the anonymized columns, under the original column names
    released_columns = {
        'anonymized_id': 'id',
        'anonymized_date': 'date',
        'anonymized_latitude': 'latitude',
        'anonymized_longitude': 'longitude',
    }
    anonymized_data = records[list(released_columns)].copy()
    anonymized_data.columns = list(released_columns.values())

    # 6) Save to file
    anonymized_data.to_csv(OUTPUT_FILE, index=False, sep='\t', header=False)
    print(f"[INFO] K-Anonymity + DP data saved to: {OUTPUT_FILE}")

if __name__ == "__main__":
    main()