# In [6]:
import pandas as pd
import hashlib
import random
from datetime import datetime, timedelta
import numpy as np

def add_laplace_noise(value, epsilon, sensitivity=20):
    """Return ``value`` perturbed by zero-mean Laplace noise.

    The noise is drawn from a Laplace distribution centred on 0 whose scale
    is ``sensitivity / epsilon``, so a smaller epsilon (or a larger
    sensitivity) produces larger perturbations.

    Args:
        value: Number to perturb.
        epsilon: Level of privacy protection (privacy budget); must be > 0.
            ε > 1: low privacy, high utility.
            0.1 <= ε <= 1: moderate privacy, moderate utility.
            ε < 0.1: high privacy, low utility.
        sensitivity: Depends on the maximum possible geographic distance
            between two points. Must be >= 0. Defaults to 20.

    Returns:
        ``value`` plus one noise sample.

    Raises:
        ValueError: If ``epsilon`` is not strictly positive or
            ``sensitivity`` is negative.
    """
    # `not epsilon > 0` (rather than `epsilon <= 0`) also rejects NaN.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be strictly positive, got {epsilon!r}")
    if sensitivity < 0:
        raise ValueError(f"sensitivity must be non-negative, got {sensitivity!r}")

    scale = sensitivity / epsilon
    noise = np.random.laplace(loc=0.0, scale=scale)
    return value + noise

def anonymize_data_improved(input_file, output_file, gps_epsilon=1.0, time_shift_minutes=0):
    """
    Anonymize a tab-separated GPS trace file and write the result to disk.

    The input is read as a header-less, tab-separated file with the columns
    ID, Date, Latitude, Longitude. Three transformations are applied:

    * ID: replaced by the first 8 hex digits of the MD5 of
      "<ID>.<ISO year>-<ISO week>", so one original ID maps to a different
      pseudonym for each ISO week.
    * Date: moved to a random day of the same Monday-based week (time of day
      kept), then shifted by a random whole number of minutes. The shift is
      limited so the result never leaves that week, which keeps the date
      consistent with the week encoded in the pseudonymous ID.
    * Latitude / Longitude: perturbed independently with Laplace noise via
      ``add_laplace_noise`` using ``gps_epsilon``.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the output anonymized CSV file.
        gps_epsilon (float): Privacy budget (ε) for GPS anonymization; must
            be strictly positive.
        time_shift_minutes (int): Maximum random shift (in minutes) applied
            to timestamps, in either direction; must be non-negative.

    Returns:
        None. Saves the anonymized dataset to the output file (tab-separated,
        with a header row ID, Date, Latitude, Longitude) and prints a
        confirmation message.

    Raises:
        ValueError: If ``time_shift_minutes`` is negative or ``gps_epsilon``
            is not strictly positive.
    """
    # Fail fast on bad parameters, before any file is read.
    if time_shift_minutes < 0:
        raise ValueError(
            f"time_shift_minutes must be non-negative, got {time_shift_minutes!r}"
        )
    if not gps_epsilon > 0:
        raise ValueError(f"gps_epsilon must be strictly positive, got {gps_epsilon!r}")

    # Load the data with custom delimiter (tab-separated, no header row)
    df = pd.read_csv(input_file, delimiter="\t", names=["ID", "Date", "Latitude", "Longitude"])

    # Convert the Date column to datetime
    df["Date"] = pd.to_datetime(df["Date"])

    # Add ISO week and ISO year columns; together they identify the week
    iso_calendar = df["Date"].dt.isocalendar()
    df["Week"] = iso_calendar.week
    df["Year"] = iso_calendar.year

    # Generate anonymized IDs.
    # NOTE(review): an unsalted, truncated MD5 is only a pseudonym; it can be
    # recomputed by anyone who knows the ID format. Confirm this is acceptable.
    def hash_id(original_id, year, week):
        raw = f"{original_id}.{year}-{week}"
        return hashlib.md5(raw.encode()).hexdigest()[:8]

    # Iterate only the three needed columns instead of building a row object
    # per record with df.apply(axis=1).
    df["AnonID"] = [
        hash_id(original_id, year, week)
        for original_id, year, week in zip(df["ID"], df["Year"], df["Week"])
    ]

    one_week = timedelta(days=7)

    # Modify dates to stay within the same week and optionally add noise
    def randomize_date(date):
        days_since_monday = timedelta(days=date.weekday())
        week_start = date.normalize() - days_since_monday  # Monday 00:00
        randomized_date = date - days_since_monday + timedelta(days=random.randint(0, 6))

        # Limit the shift so that randomized_date + shift stays inside
        # [week_start, week_start + 7 days).
        seconds_after_start = (randomized_date - week_start).total_seconds()
        seconds_before_end = (week_start + one_week - randomized_date).total_seconds()
        max_backward = min(time_shift_minutes, int(seconds_after_start // 60))
        # ceil(seconds / 60) - 1 is the largest whole-minute step that still
        # lands strictly before the start of the next week.
        max_forward = min(time_shift_minutes, int(-(-seconds_before_end // 60)) - 1)

        time_shift = timedelta(minutes=random.randint(-max_backward, max_forward))
        return randomized_date + time_shift

    df["AnonDate"] = df["Date"].apply(randomize_date)

    # Add Laplace noise to GPS coordinates.
    # NOTE(review): Laplace noise is unbounded, so perturbed values can fall
    # outside valid latitude/longitude ranges; confirm consumers tolerate this.
    df["AnonLatitude"] = df["Latitude"].apply(lambda x: add_laplace_noise(x, gps_epsilon))
    df["AnonLongitude"] = df["Longitude"].apply(lambda x: add_laplace_noise(x, gps_epsilon))

    # Save the anonymized dataset under the original column names
    anonymized_df = df[["AnonID", "AnonDate", "AnonLatitude", "AnonLongitude"]].rename(
        columns={
            "AnonID": "ID",
            "AnonDate": "Date",
            "AnonLatitude": "Latitude",
            "AnonLongitude": "Longitude",
        }
    )
    anonymized_df.to_csv(output_file, index=False, sep="\t")
    print(f"Improved anonymized data saved to {output_file}")

# Example usage: anonymize the original geo data file with the default
# privacy settings. The two paths are passed positionally (input, output).
anonymize_data_improved(
    "../file_origin/geo_data_format.csv",
    "../file_ano/anonymized_geo_data_improved.csv",
    gps_epsilon=1.0,  # privacy budget ε; lower it for stronger privacy
    time_shift_minutes=0,  # raise above 0 to also shift timestamps
)

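# Recorded cell output (from an earlier run; it matches neither the message
# nor the output path used by the code in this cell):
# Anonymized data saved to ../file_ano/anonymized_geo_data_1.csv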
