# Home detection

Select the evening/nighttime interval that is considered home-time.
The time dimension is discretized into 30-minute intervals, indexed from 0 to 47.
The home location (cell) is defined as the cell where a user appears the most between 20:00 (40) and 08:00 (16).  

In [1]:
import os

import pandas as pd


def determine_most_frequent_cell(x: pd.DataFrame) -> tuple[int, int]:
    """Return the ``(x, y)`` cell with the largest summed ``count`` in *x*.

    Rows are grouped by their ``x``/``y`` coordinates and the ``count``
    column is summed per cell.  When several cells share the maximum, the
    one that comes first in ascending ``(x, y)`` order is returned, so the
    result does not depend on the behaviour of a sort algorithm.

    Args:
        x: Frame with at least the columns ``x``, ``y`` and ``count``.

    Returns:
        The coordinates of the cell with the highest total count.

    Raises:
        ValueError: If *x* yields no cells to choose from.
    """
    totals = x.groupby(["x", "y"])["count"].sum()
    if totals.empty:
        raise ValueError("cannot determine the most frequent cell of an empty frame")
    # groupby orders the (x, y) keys and idxmax reports the first maximum,
    # so there is no need to sort the whole table just to read its top row.
    cell_x, cell_y = totals.idxmax()
    return cell_x, cell_y


def detemmine_homes(data: pd.DataFrame) -> pd.DataFrame:
    """Find, for every user, the cell with the largest summed ``count``.

    The (misspelled) function name is kept as is because the notebook cells
    that compute homes and workplaces call it under this name.

    When a user has several cells with the same maximal total, the one that
    comes first in ascending ``(x, y)`` order is chosen.

    Args:
        data: Frame with at least the columns ``uid``, ``x``, ``y`` and
            ``count``.

    Returns:
        A frame with the columns ``uid``, ``x`` and ``y`` holding one row per
        user, ordered by ascending ``uid``.
    """
    columns = ["uid", "x", "y"]
    # One aggregation over the whole frame instead of a Python-level loop
    # that runs a separate groupby for every single user.
    totals = data.groupby(columns, as_index=False)["count"].sum()
    if totals.empty:
        return pd.DataFrame(columns=columns)
    # Per user, idxmax gives the row label of the first maximum; because
    # ``totals`` is ordered by (uid, x, y), ties are resolved deterministically.
    best_rows = totals.groupby("uid")["count"].idxmax()
    return totals.loc[best_rows, columns].reset_index(drop=True)

In [2]:
# Directory that holds the raw dataset CSV read by the loading cell.
mobility_data = "../data/yjmob100k"
# Directory where intermediate and result CSV files are cached.
output = "../output"

In [3]:
# Per-user visit counts for every (time slot, cell) combination.  The result
# is cached as CSV so the raw dataset only has to be aggregated once.
try:
    loc_count = pd.read_csv(f"{output}/loc_count.csv", engine="pyarrow")
except FileNotFoundError:
    data = pd.read_csv(
        f"{mobility_data}/yjmob100k-dataset1.csv",
        engine="pyarrow",
    )
    # Number of non-null ``d`` entries per (uid, t, x, y), stored as "count".
    loc_count = (
        data.groupby(["uid", "t", "x", "y"])["d"].count().reset_index(name="count")
    )
    # The output directory may not exist yet (e.g. on a fresh checkout);
    # create it so the aggregation is not lost to a failing write.
    os.makedirs(output, exist_ok=True)
    loc_count.to_csv(f"{output}/loc_count.csv", index=False)

In [4]:
# Keep the records from the night-time slots: 0-15 (before 08:00) and
# 43-47 (from 21:30 on, given 30-minute slots).
# NOTE(review): the notebook header defines home-time as starting at 20:00
# (slot 40), while this filter starts after slot 42 - confirm which is intended.
evening = loc_count[(loc_count["t"] < 16) | (loc_count["t"] > 42)].copy()

In [5]:
# Home cell per user, taken from the evening records.  A previously written
# result is reused when present; otherwise it is computed and cached.
homes_path = f"{output}/homes.csv"
try:
    homes = pd.read_csv(homes_path, engine="pyarrow")
except FileNotFoundError:
    homes = detemmine_homes(evening)
    homes.to_csv(homes_path, index=False)

## Optionally estimate workplaces

In [6]:
# Keep the records from slots 19-35, i.e. 09:30 up to 18:00 with
# 30-minute slots.
work_hours = loc_count[(loc_count["t"] > 18) & (loc_count["t"] < 36)].copy()

In [7]:
# Workplace cell per user: the same most-frequent-cell procedure used for
# homes, applied to the work-hour records.  A previously written result is
# reused when present; otherwise it is computed and cached.
workplaces_path = f"{output}/workplaces.csv"
try:
    workplaces = pd.read_csv(workplaces_path, engine="pyarrow")
except FileNotFoundError:
    workplaces = detemmine_homes(work_hours)
    workplaces.to_csv(workplaces_path, index=False)