In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import hashlib


# Load data
columns_name = ['id', 'date', 'latitude', 'longitude']
original_data = pd.read_csv('./large_distributed_data_format.csv', names=columns_name, sep='\t')

# Convert 'date' to datetime format and extract ISO week
original_data['date'] = pd.to_datetime(original_data['date'])
original_data['week'] = original_data['date'].dt.isocalendar().week

In [2]:
original_data

Unnamed: 0,id,date,latitude,longitude,week
0,User_0,2024-06-01,35.577072,-128.499355,22
1,User_1,2024-06-02,21.501173,-117.107356,22
2,User_2,2024-06-03,40.458849,-90.398031,23
3,User_3,2024-06-04,47.187183,-126.283670,23
4,User_4,2024-06-05,28.876873,-130.212167,23
...,...,...,...,...,...
95,User_0,2024-09-04,74.375266,-160.787291,36
96,User_1,2024-09-05,67.992258,-122.604064,36
97,User_2,2024-09-06,76.225230,-126.819054,36
98,User_3,2024-09-07,81.013012,-175.964700,36


In [3]:
# Generate anonymized id
def anonymize_id(user_id, week): 
    """Generate an anonymized id from the user_id and week"""
    pseudo = f"{user_id}_{week}"
    hashed_id =  hashlib.sha256(pseudo.encode()).hexdigest()[:10]
    return hashed_id  

original_data['anonymized_id'] = original_data.apply(
    lambda row: anonymize_id(row['id'], row['week']), axis=1
)   

In [4]:
# Standardiz latitude and longitude for clustering
scaler = StandardScaler()
coords = original_data[['latitude', 'longitude']]
scaled_coords = scaler.fit_transform(coords)

In [5]:
# Apply KMeans clustering for K-Anonymity
k = 10
kmeans = KMeans(n_clusters=int(len(coords)/k), random_state=42)
original_data['cluster'] = kmeans.fit_predict(scaled_coords)
original_data

Unnamed: 0,id,date,latitude,longitude,week,anonymized_id,cluster
0,User_0,2024-06-01,35.577072,-128.499355,22,19a9f4ab08,9
1,User_1,2024-06-02,21.501173,-117.107356,22,2a1335498e,4
2,User_2,2024-06-03,40.458849,-90.398031,23,1f1b58d8f3,2
3,User_3,2024-06-04,47.187183,-126.283670,23,fdcaad4573,0
4,User_4,2024-06-05,28.876873,-130.212167,23,d17e53e274,4
...,...,...,...,...,...,...,...
95,User_0,2024-09-04,74.375266,-160.787291,36,56bbc547f3,1
96,User_1,2024-09-05,67.992258,-122.604064,36,cf8e6eb836,8
97,User_2,2024-09-06,76.225230,-126.819054,36,271e84e096,8
98,User_3,2024-09-07,81.013012,-175.964700,36,7bebdc9e2c,1


In [6]:
# Replace latitude and longitude with cluster centroids
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
original_data['anonymized_latitude'] = original_data['cluster'].apply(lambda x: centroids[x][0])
original_data['anonymized_longitude'] = original_data['cluster'].apply(lambda x: centroids[x][1])

In [7]:
# Save anonymized data
anonymized_data = original_data[['anonymized_id', 'date', 'anonymized_latitude', 'anonymized_longitude']]
anonymized_data.columns = ['id', 'date', 'latitude', 'longitude']
anonymized_data

Unnamed: 0,id,date,latitude,longitude
0,19a9f4ab08,2024-06-01,40.837251,-135.686410
1,2a1335498e,2024-06-02,26.953483,-122.599179
2,1f1b58d8f3,2024-06-03,33.063538,-94.999270
3,fdcaad4573,2024-06-04,51.732210,-121.747953
4,d17e53e274,2024-06-05,26.953483,-122.599179
...,...,...,...,...
95,56bbc547f3,2024-09-04,83.934352,-164.792574
96,cf8e6eb836,2024-09-05,67.840544,-127.929693
97,271e84e096,2024-09-06,67.840544,-127.929693
98,7bebdc9e2c,2024-09-07,83.934352,-164.792574


In [8]:
from zip import zip_csv_file
anonymized_data.to_csv('./k_anonymized_data.csv', index=False, sep='\t', header=False)
zip_csv_file('./k_anonymized_data.csv', './k_anonymized_data.zip')

Error: The file ./origin.csv does not exist.
Successfully created ./k_anonymized_data.zip


True