In [11]:
import pandas as pd
import numpy as np
import folium

In [12]:
# Load the tab-separated trace file. Column names are supplied explicitly
# through `names`, so the first line of the file is read as data.
file_path = 'sample'
data = pd.read_csv(file_path, sep='\t', names = ["id", "timestamp", "long", "lat"])
# Show the first rows as a sanity check on the parsed columns.
data.head()

Unnamed: 0,id,timestamp,long,lat
0,1,1425425716072,4.870147,45.77214
1,1,1425425748063,4.870218,45.772095
2,1,1425425749622,4.87021,45.772072
3,1,1425425750644,4.87021,45.772072
4,1,1425425752621,4.87021,45.772072


### Looking at the data

In [13]:
def plot_map(data,lat_name='lat',long_name='long'):
    """
    Draw every trace of ``data`` as a polyline on a folium map.

    The map is centred on the mean of the latitude and longitude columns and
    one polyline is added per distinct value of the ``id`` column, following
    the row order inside that group.

    Args:
        data: DataFrame with an ``id`` column plus the two coordinate columns.
        lat_name: Name of the latitude column to plot.
        long_name: Name of the longitude column to plot.

    Returns:
        The folium map holding one polyline per id.
    """
    center = (data[lat_name].mean(), data[long_name].mean())
    trace_map = folium.Map(location=center, zoom_start=17)

    for _trace_id, trace in data.groupby('id'):
        # folium expects (lat, long) pairs, in that order.
        path = [(lat, lon) for lat, lon in zip(trace[lat_name], trace[long_name])]
        polyline = folium.PolyLine(path)
        polyline.add_to(trace_map)

    return trace_map

# Render the loaded traces using the default 'lat' / 'long' columns.
plot_map(data)

In [14]:
# Grid cell size used to bucket points before de-duplication. Same unit as the
# lat/long columns (presumably degrees -- TODO confirm).
square_quant_bucket_size = 0.000009
# Grid cell size used by obfuscate_traces to decide whether two traces share a cell.
square_quant_obfusc_size = 0.00001
# Standard deviation of the random perturbation, expressed as a fraction of
# each trace's latitude / longitude extent.
perturb_factor = 0.01

## Removing a third of the points
Since the points are very close together in time, we do not lose much information by deleting one point out of every three.

In [15]:
def remove_fraction(data, fraction_amount):
    """
    Drop one row out of every ``fraction_amount`` rows.

    Rows are selected by position (the 1st, the ``fraction_amount + 1``-th,
    ... are removed), so the thinning stays regular even when the index
    labels have gaps, e.g. after de-duplication. For a default RangeIndex this
    is the same selection as ``index % fraction_amount != 0``.

    Args:
        data: DataFrame to thin out. It is not modified.
        fraction_amount: Positive integer; one row in every
            ``fraction_amount`` is removed.

    Returns:
        A new DataFrame holding the remaining rows, with their original
        index labels.

    Raises:
        ValueError: If ``fraction_amount`` is not strictly positive.
    """
    if fraction_amount <= 0:
        raise ValueError("fraction_amount must be a positive integer")

    # Rebinding the parameter would be invisible to the caller, so the
    # filtered frame has to be returned.
    keep = np.arange(len(data)) % fraction_amount != 0
    return data[keep]

# Optional thinning step, left disabled:
#data = remove_fraction(data, 3)
#plot_map(data)

## Quantize the points (buckets)

In [16]:
# Quantize the latitude and longitude
def quantize (df, square_size):
    """
    Snap every point to the lower corner of its grid cell.

    Args:
        df: DataFrame with 'lat' and 'long' columns. It is not modified.
        square_size: Side length of a grid cell, in the same unit as the
            coordinates.

    Returns:
        A new DataFrame equal to ``df`` plus two columns, 'lat_q' and
        'long_q', holding the coordinates rounded down to a multiple of
        ``square_size``. Existing 'lat_q' / 'long_q' columns are replaced.
    """
    # assign() builds a new frame, so the caller's frame (which may itself be
    # a slice of another frame) is never written to.
    return df.assign(
        lat_q=np.floor(df['lat'] / square_size) * square_size,
        long_q=np.floor(df['long'] / square_size) * square_size,
    )

# Snap every point to the bucket grid; this adds the lat_q / long_q columns.
data = quantize(data,square_quant_bucket_size)
# NOTE: shape[0] is the number of rows (points), even though the label says "colums".
print(f"number of colums: {data.shape[0]}")

number of colums: 4000


### Remove points in the same bucket

In [17]:
# Within each trace keep only the first point that falls in a given bucket.
# drop_duplicates keeps the first occurrence, so any later visit to the same
# bucket is removed too, not only consecutive repeats.
data = data.groupby('id',group_keys=False).apply(lambda x: x.drop_duplicates(subset=['lat_q', 'long_q']))
#print(data.head())
# NOTE: shape[0] is the number of rows (points), even though the label says "colums".
print(f"number of colums: {data.shape[0]}")
# Plot the quantized coordinates rather than the raw ones.
plot_map(data,"lat_q","long_q")

number of colums: 678


In [18]:
def obfuscate_traces(data: pd.DataFrame, id1, id2, square_size):
    """
    Swap the remainders of two traces from the first grid cell they share.

    Both traces are snapped to a grid of ``square_size`` cells. Walking the
    ``id1`` trace in row order, the first point whose cell is also visited by
    ``id2`` is taken as the crossing, paired with the earliest ``id2`` point
    in that cell. From those two points onwards the traces trade identities:
    the rest of the ``id1`` path is relabelled ``id2`` and the rest of the
    ``id2`` path is relabelled ``id1``. Rows of any other id are never
    relabelled.

    note: assumes the rows of each trace appear in temporal order

    Args:
        data (pd.DataFrame): The DataFrame containing the traces; needs the
            'id', 'lat' and 'long' columns. It is not modified.
        id1: The first ID to swap.
        id2: The second ID to swap.
        square_size: The size of the square used for quantization.

    Returns:
        pd.DataFrame: A new DataFrame without 'lat_q' / 'long_q' columns.
        If the traces never share a cell, rows and row order are those of
        ``data``. Otherwise rows are sorted by id and, inside each swapped
        trace, the kept head comes before the received tail. Index labels
        are preserved.
    """
    # drop() returns a new frame, so nothing below can leak into the caller's
    # data; stale quantization columns from earlier steps are stripped here.
    result = data.drop(columns=['lat_q', 'long_q'], errors='ignore')
    if id1 == id2:
        return result

    ids = result['id'].to_numpy().copy()
    positions_1 = np.flatnonzero(ids == id1)
    positions_2 = np.flatnonzero(ids == id2)
    if positions_1.size == 0 or positions_2.size == 0:
        return result

    # Integer grid coordinates of every point; two points are in the same
    # square exactly when both coordinates match.
    lat_cell = np.floor(result['lat'].to_numpy() / square_size)
    long_cell = np.floor(result['long'].to_numpy() / square_size)

    # Earliest position at which the id2 trace visits each cell. A lookup
    # table replaces a row-by-row comparison of the two traces.
    first_visit_2 = {}
    for pos in positions_2:
        first_visit_2.setdefault((lat_cell[pos], long_cell[pos]), pos)

    for pos_1 in positions_1:
        pos_2 = first_visit_2.get((lat_cell[pos_1], long_cell[pos_1]))
        if pos_2 is None:
            continue

        # Both tails are computed before any label is written and are limited
        # to the rows of their own trace, so other ids are left alone.
        tail_1 = positions_1[positions_1 >= pos_1]
        tail_2 = positions_2[positions_2 >= pos_2]
        ids[tail_1] = id2
        ids[tail_2] = id1

        # Order rows by id; inside an id, rows that kept their label come
        # first and swapped-in tail rows follow, each in their original order.
        sequence = np.arange(len(result))
        sequence[tail_1] += len(result)
        sequence[tail_2] += len(result)
        order = (
            pd.DataFrame({'id': ids, 'sequence': sequence})
            .sort_values(['id', 'sequence'])
            .index.to_numpy()
        )

        result['id'] = ids
        return result.iloc[order]

    return result

# Obfuscate the traces for every unordered pair of ids (i < j, each pair once).
# The id list is taken once, before any swap; each call receives the frame
# returned by the previous call, so earlier swaps affect later pairs.
id_list = data['id'].unique().tolist()
for i in range(len(id_list)):
    for j in range(i+1, len(id_list)):
        data = obfuscate_traces(data, id_list[i], id_list[j], square_quant_obfusc_size)


## Add random perturbation

In [19]:
def perturbe(group, factor=None, rng=None):
    """
    Add zero-mean Gaussian noise to the 'lat' and 'long' columns of one trace.

    The noise standard deviation is ``factor`` times the extent (max - min)
    of the respective coordinate inside ``group``, so a trace made of a
    single point, or of identical points, is returned unperturbed.

    Args:
        group: DataFrame with 'lat' and 'long' columns, typically the rows
            of one id as handed over by ``groupby(...).apply``. It is not
            modified.
        factor: Noise scale relative to the coordinate extent. Defaults to
            the notebook-level ``perturb_factor``.
        rng: Optional ``numpy.random.Generator`` (or any object with a
            compatible ``normal`` method) for reproducible noise. Defaults
            to the global ``np.random`` state.

    Returns:
        A new DataFrame with perturbed 'lat' and 'long' columns; all other
        columns and the index are unchanged.
    """
    if factor is None:
        factor = perturb_factor
    noise_source = np.random if rng is None else rng

    lat_range = group['lat'].max() - group['lat'].min()
    long_range = group['long'].max() - group['long'].min()

    # Latitude noise is drawn before longitude noise, so a given seed always
    # produces the same pair of noise vectors.
    lat_noise = noise_source.normal(0, lat_range * factor, len(group))
    long_noise = noise_source.normal(0, long_range * factor, len(group))

    # assign() returns a new frame instead of writing into the groupby chunk.
    return group.assign(lat=group['lat'] + lat_noise,
                        long=group['long'] + long_noise)

# Perturb each trace separately so the noise scale follows that trace's own extent.
# NOTE(review): no random seed is set in the visible cells, so the result
# presumably differs between runs -- confirm whether that is intended.
data = data.groupby('id',group_keys=False).apply(perturbe)

# Text preview of the perturbed rows, followed by the final map.
print(data.head())
plot_map(data,"lat","long")

    id      timestamp      long        lat
0    1  1425425716072  4.870241  45.772131
1    1  1425425748063  4.870174  45.772076
2    1  1425425749622  4.870192  45.772074
29   1  1425425786648  4.870162  45.772159
30   1  1425425788621  4.870074  45.772183
