In [9]:
import datetime as dt
import hashlib
import hmac
import warnings

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


# Load data: the file is tab-separated and the column names are supplied
# explicitly, so its first line is read as data rather than as a header.
columns_name = ['id', 'date', 'latitude', 'longitude']
original_data = pd.read_csv(
    './large_dis_format.csv', sep='\t', names=columns_name
).assign(
    # Parse the raw 'date' text into datetimes ...
    date=lambda frame: pd.to_datetime(frame['date']),
    # ... then derive the ISO week number from the already-parsed column.
    # NOTE(review): the ISO week number carries no year, so identical week
    # numbers from different years are indistinguishable downstream -- confirm
    # the data never spans more than one year.
    week=lambda frame: frame['date'].dt.isocalendar().week,
)

In [10]:
# Inspect the loaded records, including the derived 'week' column
original_data

Unnamed: 0,id,date,latitude,longitude,week
0,User_0,2024-06-10 04:13:00,-33.292177,-116.236513,24
1,User_1,2024-06-08 01:09:00,48.355743,154.027178,23
2,User_2,2024-06-01 03:15:00,80.976779,14.793070,22
3,User_3,2024-06-11 06:20:00,87.170156,104.938087,24
4,User_4,2024-06-14 07:44:00,-69.905373,-35.359927,24
...,...,...,...,...,...
95,User_95,2024-06-04 11:16:00,12.070538,-82.182233,23
96,User_96,2024-06-22 08:58:00,21.017826,91.817111,25
97,User_97,2024-06-05 04:42:00,-9.132491,-23.637439,23
98,User_98,2024-06-12 11:14:00,27.954847,-58.533795,24


In [11]:
# Generate anonymized id
def anonymize_id(user_id, week, secret_key=None):
    """Derive a short pseudonym from a user id and a week number.

    The pseudonym is the first 10 hex characters of a SHA-256 based digest of
    the text ``"{user_id}_{week}"``: the same (user_id, week) pair always maps
    to the same value, and changing the week changes the value.

    Args:
        user_id: Original identifier (rendered into the text with f-string
            formatting).
        week: Week number the record belongs to.
        secret_key: Optional key (``bytes`` or ``str``).  When provided the
            digest is an HMAC-SHA256 keyed with it, so the id -> pseudonym
            table cannot be rebuilt by simply hashing guessable ids.  When
            omitted (the default) a plain SHA-256 is used and the result is
            identical to what this function returned before the parameter
            existed.

    Returns:
        A 10-character lowercase hexadecimal string.
    """
    pseudo = f"{user_id}_{week}".encode()
    if secret_key is None:
        # Unkeyed digest: reproducible by anyone who can enumerate user ids.
        digest = hashlib.sha256(pseudo).hexdigest()
    else:
        key = secret_key.encode() if isinstance(secret_key, str) else secret_key
        digest = hmac.new(key, pseudo, hashlib.sha256).hexdigest()
    # Truncation keeps ids short at the cost of a (small) collision probability.
    return digest[:10]

# Walk only the two columns the pseudonym depends on.  DataFrame.apply(axis=1)
# would build a full Series for every row just to read these two fields; the
# comprehension is cheaper and also works unchanged on an empty frame.
original_data['anonymized_id'] = [
    anonymize_id(user_id, week)
    for user_id, week in zip(original_data['id'], original_data['week'])
]

In [12]:
# Standardize latitude and longitude (zero mean, unit variance per column)
# before clustering
coords = original_data[['latitude', 'longitude']]
scaler = StandardScaler()
scaled_coords = scaler.fit(coords).transform(coords)

In [13]:
# Apply KMeans clustering as an approximation of K-Anonymity
k = 10
# Aim for roughly k rows per cluster.  Clamp to 1 because KMeans rejects
# n_clusters=0, which int(len(coords)/k) produces for fewer than k rows.
n_clusters = max(1, len(coords) // k)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
original_data['cluster'] = kmeans.fit_predict(scaled_coords)

# KMeans fixes only the number of clusters, not their sizes, so an *average*
# of k rows per cluster does not guarantee that every cluster holds k rows.
# Report undersized clusters instead of silently assuming K-Anonymity.
cluster_sizes = np.bincount(kmeans.labels_, minlength=n_clusters)
undersized = int((cluster_sizes < k).sum())
if undersized:
    warnings.warn(
        f"{undersized} of {n_clusters} clusters contain fewer than k={k} rows "
        f"(smallest: {int(cluster_sizes.min())}); K-Anonymity is not guaranteed."
    )
original_data

Unnamed: 0,id,date,latitude,longitude,week,anonymized_id,cluster
0,User_0,2024-06-10 04:13:00,-33.292177,-116.236513,24,31475784aa,3
1,User_1,2024-06-08 01:09:00,48.355743,154.027178,23,ee22322e9a,4
2,User_2,2024-06-01 03:15:00,80.976779,14.793070,22,f11d5221ca,2
3,User_3,2024-06-11 06:20:00,87.170156,104.938087,24,653ccb10ae,8
4,User_4,2024-06-14 07:44:00,-69.905373,-35.359927,24,ac53428465,9
...,...,...,...,...,...,...,...
95,User_95,2024-06-04 11:16:00,12.070538,-82.182233,23,82f1e38b35,1
96,User_96,2024-06-22 08:58:00,21.017826,91.817111,25,85b6e2df25,4
97,User_97,2024-06-05 04:42:00,-9.132491,-23.637439,23,eb66d667ae,7
98,User_98,2024-06-12 11:14:00,27.954847,-58.533795,24,b3e0a39694,1


In [14]:
# Replace latitude and longitude with cluster centroids
# The centers are expressed in the standardized space the model was fitted on;
# map them back to the original latitude/longitude scale first.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
# Look up every row's centroid with one vectorized indexing step instead of a
# Python-level lambda per row (column 0 = latitude, column 1 = longitude, the
# column order the scaler was fitted with).
cluster_labels = original_data['cluster'].to_numpy()
original_data['anonymized_latitude'] = centroids[cluster_labels, 0]
original_data['anonymized_longitude'] = centroids[cluster_labels, 1]

In [15]:
# Build the output frame: keep only the anonymized columns (plus the date) and
# give them the same names as the input columns
anonymized_data = original_data[
    ['anonymized_id', 'date', 'anonymized_latitude', 'anonymized_longitude']
].rename(
    columns={
        'anonymized_id': 'id',
        'anonymized_latitude': 'latitude',
        'anonymized_longitude': 'longitude',
    }
)
anonymized_data

Unnamed: 0,id,date,latitude,longitude
0,31475784aa,2024-06-10 04:13:00,-58.884005,-119.824138
1,ee22322e9a,2024-06-08 01:09:00,26.082774,134.681279
2,f11d5221ca,2024-06-01 03:15:00,60.330844,9.628185
3,653ccb10ae,2024-06-11 06:20:00,71.774695,108.545663
4,ac53428465,2024-06-14 07:44:00,-60.393339,23.489364
...,...,...,...,...
95,82f1e38b35,2024-06-04 11:16:00,9.648586,-110.949792
96,85b6e2df25,2024-06-22 08:58:00,26.082774,134.681279
97,eb66d667ae,2024-06-05 04:42:00,-0.052494,-6.958263
98,b3e0a39694,2024-06-12 11:14:00,9.648586,-110.949792


In [16]:
from zip import zip_csv_file

# Write the anonymized rows as a tab-separated file without header or index,
# then hand that file to the zip helper.  The helper call stays last so its
# return value is displayed as the cell output.
csv_path = './k_anonymized_data.csv'
zip_path = './k_anonymized_data.zip'
anonymized_data.to_csv(csv_path, sep='\t', header=False, index=False)
zip_csv_file(csv_path, zip_path)

Successfully created ./k_anonymized_data.zip


True