In [1]:
import re

import pandas as pd

In [2]:
# Load the precomputed user/cell count table; later cells rely on its
# uid, x, y and count columns.
ucc = pd.read_csv(
    "../output/user_cell_count.csv",
    engine="pyarrow",
)

In [3]:
def top4(data: pd.DataFrame) -> tuple[tuple[int, int], ...]:
    """Return the ``(x, y)`` cells with the highest ``count``, at most four.

    Args:
        data: Frame with ``x``, ``y`` and ``count`` columns.

    Returns:
        A tuple of ``(x, y)`` pairs ordered by descending ``count``. It holds
        fewer than four pairs when ``data`` has fewer than four rows.
    """
    # Named differently from the function so the function name is not shadowed.
    busiest = data.sort_values("count", ascending=False).head(4)
    return tuple(zip(busiest["x"], busiest["y"]))


# One tuple of top cells per user, indexed by uid.
# (For a quick trial run, apply the same call to a slice such as ucc[:10000].)
top4_lists = (
    ucc.groupby("uid")
    .apply(top4, include_groups=False)
)

In [4]:
# Raw records; the preview below shows the uid, d, t, x and y columns.
data = pd.read_csv(
    "../data/yjmob100k/yjmob100k-dataset1.csv",
    engine="pyarrow",
)
data.head(3)

Unnamed: 0,uid,d,t,x,y
0,0,0,1,79,86
1,0,0,2,79,86
2,0,0,8,77,86


In [5]:
def create_rescale_lookup(
    k: int = 2, grid_size: int = 200
) -> dict[tuple[int, int], tuple[int, int]]:
    """Map every cell of a 1-based square grid to its cell on a coarser grid.

    Args:
        k: Coarsening factor; ``k`` consecutive coordinates share one coarse
            coordinate. Must be a positive integer (``0`` raises
            ``ZeroDivisionError``).
        grid_size: Number of cells along each axis of the fine grid. The
            default of 200 matches the previously hard-coded size.

    Returns:
        A dict whose keys are ``(x, y)`` with ``x`` and ``y`` in
        ``1..grid_size`` and whose values are the matching 1-based coarse
        cell ``(((x - 1) // k) + 1, ((y - 1) // k) + 1)``.
    """
    fine_axis = range(1, grid_size + 1)
    # Both axes use the same mapping, so compute it once per coordinate.
    coarse = {coord: ((coord - 1) // k) + 1 for coord in fine_axis}
    return {(x, y): (coarse[x], coarse[y]) for x in fine_axis for y in fine_axis}


# One lookup table per coarsening factor; the number in each name is the factor.
r2, r4, r8, r16, r32 = (
    create_rescale_lookup(factor) for factor in (2, 4, 8, 16, 32)
)

In [6]:
# Use the cached per-user cell counts for the factor-2 grid when present;
# otherwise rebuild them from the raw records and write the cache.
# NOTE: a cache file written with the earlier `(x // 2) + 1` binning holds
# misaligned cells and must be deleted to be regenerated.
try:
    ucc_r2 = pd.read_csv("../output/user_cell_count_r2.csv", engine="pyarrow")
except FileNotFoundError:
    data_r2 = data.copy()

    # Shift to 0-based before the floor division so the bins agree with
    # create_rescale_lookup, whose keys start at 1. Without the shift the
    # first coarse cell is one fine cell wide and the top edge gets an extra bin.
    # NOTE(review): assumes x and y start at 1 — confirm against the dataset.
    data_r2["x"] = ((data_r2["x"] - 1) // 2) + 1
    data_r2["y"] = ((data_r2["y"] - 1) // 2) + 1

    # Number of non-null `t` entries per (uid, coarse x, coarse y).
    ucc_r2 = data_r2.groupby(["uid", "x", "y"])["t"].count().reset_index()
    ucc_r2.columns = ["uid", "x", "y", "count"]
    ucc_r2.to_csv("../output/user_cell_count_r2.csv", index=False)

In [7]:
# Use the cached per-user cell counts for the factor-4 grid when present;
# otherwise rebuild them from the raw records and write the cache.
# NOTE: a cache file written with the earlier `(x // 4) + 1` binning holds
# misaligned cells and must be deleted to be regenerated.
try:
    ucc_r4 = pd.read_csv("../output/user_cell_count_r4.csv", engine="pyarrow")
except FileNotFoundError:
    data_r4 = data.copy()

    # Shift to 0-based before the floor division so the bins agree with
    # create_rescale_lookup, whose keys start at 1. Without the shift the
    # first coarse cell is three fine cells wide and the top edge gets an
    # extra bin.
    # NOTE(review): assumes x and y start at 1 — confirm against the dataset.
    data_r4["x"] = ((data_r4["x"] - 1) // 4) + 1
    data_r4["y"] = ((data_r4["y"] - 1) // 4) + 1

    # Number of non-null `t` entries per (uid, coarse x, coarse y).
    ucc_r4 = data_r4.groupby(["uid", "x", "y"])["t"].count().reset_index()
    ucc_r4.columns = ["uid", "x", "y", "count"]
    ucc_r4.to_csv("../output/user_cell_count_r4.csv", index=False)

In [8]:
# Use the cached per-user cell counts for the factor-8 grid when present;
# otherwise rebuild them from the raw records and write the cache.
# NOTE: a cache file written with the earlier `(x // 8) + 1` binning holds
# misaligned cells and must be deleted to be regenerated.
try:
    ucc_r8 = pd.read_csv("../output/user_cell_count_r8.csv", engine="pyarrow")
except FileNotFoundError:
    data_r8 = data.copy()

    # Shift to 0-based before the floor division so the bins agree with
    # create_rescale_lookup, whose keys start at 1. Without the shift the
    # first coarse cell is seven fine cells wide and the top edge gets an
    # extra bin.
    # NOTE(review): assumes x and y start at 1 — confirm against the dataset.
    data_r8["x"] = ((data_r8["x"] - 1) // 8) + 1
    data_r8["y"] = ((data_r8["y"] - 1) // 8) + 1

    # Number of non-null `t` entries per (uid, coarse x, coarse y).
    ucc_r8 = data_r8.groupby(["uid", "x", "y"])["t"].count().reset_index()
    ucc_r8.columns = ["uid", "x", "y", "count"]
    ucc_r8.to_csv("../output/user_cell_count_r8.csv", index=False)

In [9]:
def foo(x: list, lookup: dict[tuple[int, int], tuple[int, int]]) -> int:
    """Count the distinct values the entries of ``x`` map to under ``lookup``.

    Args:
        x: Iterable of keys present in ``lookup``.
        lookup: Mapping from each key to its rescaled value.

    Returns:
        The number of different rescaled values; ``0`` for an empty ``x``.

    Raises:
        KeyError: If an entry of ``x`` is not a key of ``lookup``.
    """
    rescaled = {lookup[cell] for cell in x}
    return len(rescaled)


# For each coarsening factor, count how many distinct coarse cells every
# user's top cells fall into. Each frame has the columns
# ["uid", "r<factor>_identifiable_cell_count"].
r2_idc, r4_idc, r8_idc, r16_idc, r32_idc = (
    top4_lists.apply(foo, lookup=table)
    .reset_index()
    .set_axis(["uid", f"r{factor}_identifiable_cell_count"], axis=1)
    for factor, table in ((2, r2), (4, r4), (8, r8), (16, r16), (32, r32))
)

In [10]:
# Join the per-factor counts into one row per uid (inner joins on "uid").
m = r2_idc
for coarser in (r4_idc, r8_idc, r16_idc, r32_idc):
    m = m.merge(coarser, on="uid")
m.head(3)

Unnamed: 0,uid,r2_identifiable_cell_count,r4_identifiable_cell_count,r8_identifiable_cell_count,r16_identifiable_cell_count,r32_identifiable_cell_count
0,0,4,3,3,3,2
1,1,3,3,2,1,1
2,2,4,3,2,1,1


In [11]:
# Number of users per distinct-cell count on the factor-2 grid.
m.groupby("r2_identifiable_cell_count")["uid"].count()

r2_identifiable_cell_count
1      721
2    15582
3    48228
4    35469
Name: uid, dtype: int64

In [12]:
# Number of users per distinct-cell count on the factor-4 grid.
m.groupby("r4_identifiable_cell_count")["uid"].count()

r4_identifiable_cell_count
1     6247
2    38548
3    42323
4    12882
Name: uid, dtype: int64

In [13]:
# Number of users per distinct-cell count on the factor-8 grid.
m.groupby("r8_identifiable_cell_count")["uid"].count()

r8_identifiable_cell_count
1    15466
2    50987
3    28457
4     5090
Name: uid, dtype: int64

In [14]:
# Summary table: one row per number of distinct cells (1-4), one column per
# coarsening factor, each value being the number of users in that bucket.
cell_counts = range(1, 5)
result = pd.DataFrame(
    {
        "different_cells": cell_counts,
        **{
            # Reindex on the expected buckets so a bucket with no users becomes
            # 0. A bare .tolist() would be shorter than `cell_counts` in that
            # case and make the DataFrame constructor fail.
            f"r{factor}": m.groupby(f"r{factor}_identifiable_cell_count")["uid"]
            .count()
            .reindex(cell_counts, fill_value=0)
            .tolist()
            for factor in (2, 4, 8, 16, 32)
        },
    }
).sort_values("different_cells", ascending=False)
result

Unnamed: 0,different_cells,r2,r4,r8,r16,r32
3,4,35469,12882,5090,1810,470
2,3,48228,42323,28457,16752,7438
1,2,15582,38548,50987,52608,44939
0,1,721,6247,15466,28830,47153


In [55]:
# Persist the summary table without the row index.
result.to_csv(
    "../output/identifiable_users.csv",
    index=False,
)

In [104]:
# Render the table before opening the output file, so a failure in to_latex
# cannot leave behind an empty, freshly truncated .tex file.
t = result.to_latex(
    index=False,
    header=[
        "distinguishable cells",
        "1 km x 1 km",
        "2 km x 2 km",
        "4 km x 4 km",
        "8 km x 8 km",
        "16 km x 16 km",
    ],
    position="t",
    label="tab:top4",
    caption="Comparison of the top-four-location identifiable users by upscaled grids.",
)
# Add \centering directly after the table opener. The target is a fixed
# string, so a plain str.replace is sufficient (no regex escaping needed).
t = t.replace("\\begin{table}[t]", "\\begin{table}[t]\n\\centering")
# Explicit encoding keeps the written file independent of the platform default.
with open("../paper/top4_table.tex", "w", encoding="utf-8") as fp:
    print(t, file=fp)

In [22]:
# Same summary table with human-readable column headers, printed as Markdown.
mdt = result.set_axis(
    [
        "distinguishable cells",
        "1 km x 1 km",
        "2 km x 2 km",
        "4 km x 4 km",
        "8 km x 8 km",
        "16 km x 16 km",
    ],
    axis=1,
)
print(mdt.to_markdown(index=False))

|   distinguishable cells |   1 km x 1 km |   2 km x 2 km |   4 km x 4 km |   8 km x 8 km |   16 km x 16 km |
|------------------------:|--------------:|--------------:|--------------:|--------------:|----------------:|
|                       4 |         35469 |         12882 |          5090 |          1810 |             470 |
|                       3 |         48228 |         42323 |         28457 |         16752 |            7438 |
|                       2 |         15582 |         38548 |         50987 |         52608 |           44939 |
|                       1 |           721 |          6247 |         15466 |         28830 |           47153 |
