In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import hashlib


# Load data
columns_name = ['id', 'date', 'latitude', 'longitude']
# original_data = pd.read_csv('../file_origin/geo_data_format.csv', names=columns_name, sep='\t')
original_data = pd.read_csv('../file_origin/big_survey_results.csv', names=columns_name, sep='\t')
# Convert 'date' to datetime format and extract ISO week
original_data['date'] = pd.to_datetime(original_data['date'])
original_data['week'] = original_data['date'].dt.isocalendar().week

In [2]:
original_data

Unnamed: 0,id,date,latitude,longitude,week
0,1,2025-01-01 07:23:58,48.896039,2.209185,1
1,1,2025-01-01 13:29:59,48.895000,2.210958,1
2,1,2025-01-01 15:01:43,48.893924,2.211572,1
3,2,2025-01-01 11:23:57,48.805277,2.237711,1
4,2,2025-01-01 07:31:39,48.805277,2.237711,1
...,...,...,...,...,...
3283,46,2025-01-31 20:15:20,48.924107,2.363829,5
3284,46,2025-01-31 18:20:32,48.925316,2.365958,5
3285,48,2025-01-31 09:54:17,48.934848,2.450016,5
3286,49,2025-01-31 19:16:42,48.891441,2.297180,5


In [3]:
# Generate anonymized id
def anonymize_id(user_id, week): 
    """Generate an anonymized id from the user_id and week"""
    pseudo = f"{user_id}_{week}"
    hashed_id =  hashlib.sha256(pseudo.encode()).hexdigest()[:10]
    return hashed_id  

original_data['anonymized_id'] = original_data.apply(
    lambda row: anonymize_id(row['id'], row['week']), axis=1
)   

In [4]:
# Standardiz latitude and longitude for clustering
scaler = StandardScaler()
coords = original_data[['latitude', 'longitude']]
scaled_coords = scaler.fit_transform(coords)

In [5]:
# Apply KMeans clustering for K-Anonymity
k = 10
kmeans = KMeans(n_clusters=int(len(coords)/k), random_state=42)
original_data['cluster'] = kmeans.fit_predict(scaled_coords)
original_data

Unnamed: 0,id,date,latitude,longitude,week,anonymized_id,cluster
0,1,2025-01-01 07:23:58,48.896039,2.209185,1,551e8b15be,229
1,1,2025-01-01 13:29:59,48.895000,2.210958,1,551e8b15be,10
2,1,2025-01-01 15:01:43,48.893924,2.211572,1,551e8b15be,10
3,2,2025-01-01 11:23:57,48.805277,2.237711,1,420cf95170,202
4,2,2025-01-01 07:31:39,48.805277,2.237711,1,420cf95170,202
...,...,...,...,...,...,...,...
3283,46,2025-01-31 20:15:20,48.924107,2.363829,5,788bb84b31,17
3284,46,2025-01-31 18:20:32,48.925316,2.365958,5,788bb84b31,250
3285,48,2025-01-31 09:54:17,48.934848,2.450016,5,a5ae4a2e22,13
3286,49,2025-01-31 19:16:42,48.891441,2.297180,5,e9c255ad7f,283


In [6]:
# Replace latitude and longitude with cluster centroids
centroids = scaler.inverse_transform(kmeans.cluster_centers_)
original_data['anonymized_latitude'] = original_data['cluster'].apply(lambda x: centroids[x][0])
original_data['anonymized_longitude'] = original_data['cluster'].apply(lambda x: centroids[x][1])

In [7]:
# Save anonymized data
anonymized_data = original_data[['anonymized_id', 'date', 'anonymized_latitude', 'anonymized_longitude']]
anonymized_data.columns = ['id', 'date', 'latitude', 'longitude']
anonymized_data

Unnamed: 0,id,date,latitude,longitude
0,551e8b15be,2025-01-01 07:23:58,48.895989,2.209185
1,551e8b15be,2025-01-01 13:29:59,48.895186,2.210743
2,551e8b15be,2025-01-01 15:01:43,48.895186,2.210743
3,420cf95170,2025-01-01 11:23:57,48.805277,2.237711
4,420cf95170,2025-01-01 07:31:39,48.805277,2.237711
...,...,...,...,...
3283,788bb84b31,2025-01-31 20:15:20,48.924328,2.363939
3284,788bb84b31,2025-01-31 18:20:32,48.925316,2.365958
3285,a5ae4a2e22,2025-01-31 09:54:17,48.934117,2.450221
3286,e9c255ad7f,2025-01-31 19:16:42,48.891526,2.297208


In [8]:
from zip import zip_csv_file
# anonymized_data.to_csv('../file_ano/k_anonymized_data.csv', index=False, sep='\t', header=False)
# zip_csv_file('../file_ano/k_anonymized_data.csv', '../file_ano/k_anonymized_data.zip')
anonymized_data.to_csv('../file_ano/k_anonymized_big_survey_data.csv', index=False, sep='\t', header=False)
zip_csv_file('../file_ano/k_anonymized_big_survey_data.csv', '../file_ano/k_anonymized_big_survey_data.zip')

Successfully created D:/INSA/semetre 7/projet/Anonym/INSAnonym-master-serv/INSAnonym-master/scripts/metrics/anonymisation/file_origin/big_survey_results.zip
Successfully created ../file_ano/k_anonymized_big_survey_data.zip


True