In [33]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import hashlib


# Load data
columns_name = ['id', 'date', 'latitude', 'longitude']
original_data = pd.read_csv('./origin_data.csv', names=columns_name, sep='\t')

# Convert 'date' to datetime format and extract ISO week
original_data['date'] = pd.to_datetime(original_data['date'])
original_data['week'] = original_data['date'].dt.isocalendar().week

In [34]:
original_data

Unnamed: 0,id,date,latitude,longitude,week
0,7129300520,2014-10-14 14:24:00,47.4905,-122.2545,42
1,7129300521,2014-10-15 20:57:00,47.4928,-122.2580,42
2,7129300522,2014-10-19 12:55:00,47.4917,-122.2516,42
3,7129300523,2014-10-13 16:08:00,47.4919,-122.2553,42
4,7129300524,2014-10-20 04:23:00,47.4905,-122.2560,43
...,...,...,...,...,...
95,7129300615,2014-10-19 08:26:00,47.4902,-122.2437,42
96,7129300616,2014-10-16 17:45:00,47.5086,-122.2579,42
97,7129300617,2014-10-14 20:40:00,47.4953,-122.2422,42
98,7129300618,2014-10-19 15:04:00,47.4931,-122.2544,42


In [35]:
# Generate anonymized id
def anonymize_id(user_id, week): 
    """Generate an anonymized id from the user_id and week"""
    pseudo = f"{user_id}_{week}"
    hashed_id =  hashlib.sha256(pseudo.encode()).hexdigest()[:10]
    return hashed_id  

original_data['anonymized_id'] = original_data.apply(
    lambda row: anonymize_id(row['id'], row['week']), axis=1
)   

In [36]:
# Standardiz latitude and longitude for clustering
scaler = StandardScaler()
coords = original_data[['latitude', 'longitude']]
scaled_coords = scaler.fit_transform(coords)

In [37]:
# Apply KMeans clustering for K-Anonymity
k = 10
kmeans = KMeans(n_clusters=int(len(coords)/k), random_state=42)
original_data['cluster'] = kmeans.fit_predict(scaled_coords)
original_data

Unnamed: 0,id,date,latitude,longitude,week,anonymized_id,cluster
0,7129300520,2014-10-14 14:24:00,47.4905,-122.2545,42,7824ba9205,2
1,7129300521,2014-10-15 20:57:00,47.4928,-122.2580,42,120a6dbd6b,7
2,7129300522,2014-10-19 12:55:00,47.4917,-122.2516,42,0deb742a9d,2
3,7129300523,2014-10-13 16:08:00,47.4919,-122.2553,42,9f44a7f18e,2
4,7129300524,2014-10-20 04:23:00,47.4905,-122.2560,43,c0ff0d932c,2
...,...,...,...,...,...,...,...
95,7129300615,2014-10-19 08:26:00,47.4902,-122.2437,42,b74f2b059e,4
96,7129300616,2014-10-16 17:45:00,47.5086,-122.2579,42,ea87f497f8,3
97,7129300617,2014-10-14 20:40:00,47.4953,-122.2422,42,7cf144f061,8
98,7129300618,2014-10-19 15:04:00,47.4931,-122.2544,42,bee3670abd,2


In [38]:
# Replace latitude and longitude with cluster centroids
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
original_data['anonymized_latitude'] = original_data['cluster'].apply(lambda x: centroids[x][0])
original_data['anonymized_longitude'] = original_data['cluster'].apply(lambda x: centroids[x][1])

In [39]:
# Save anonymized data
anonymized_data = original_data[['anonymized_id', 'date', 'anonymized_latitude', 'anonymized_longitude']]
anonymized_data.columns = ['id', 'date', 'latitude', 'longitude']
anonymized_data

Unnamed: 0,id,date,latitude,longitude
0,7824ba9205,2014-10-14 14:24:00,47.493107,-122.253567
1,120a6dbd6b,2014-10-15 20:57:00,47.493633,-122.257875
2,0deb742a9d,2014-10-19 12:55:00,47.493107,-122.253567
3,9f44a7f18e,2014-10-13 16:08:00,47.493107,-122.253567
4,c0ff0d932c,2014-10-20 04:23:00,47.493107,-122.253567
...,...,...,...,...
95,b74f2b059e,2014-10-19 08:26:00,47.492170,-122.245560
96,ea87f497f8,2014-10-16 17:45:00,47.507442,-122.257917
97,7cf144f061,2014-10-14 20:40:00,47.497270,-122.241220
98,bee3670abd,2014-10-19 15:04:00,47.493107,-122.253567


In [40]:
from zip import zip_csv_file
anonymized_data.to_csv('./k_anonymized_data.csv', index=False, sep='\t', header=False)
zip_csv_file('./k_anonymized_data.csv', './k_anonymized_data.zip')

Successfully created ./k_anonymized_data.zip


True