In [2]:
import pandas as pd
import geopandas as gpd
import pickle
from tqdm import tqdm
from src.data.hex_utils import hexagonize, interpolate_cell_jumps

In [2]:
with open("../data/processed/geolife.pkl", "rb") as f:
    gdf = pickle.load(f)
gdf

Unnamed: 0,lat,lon,datetime,trajectory,user,t_idx,timediff,x,y,dist,speed
312042,39.984702,116.318417,2008-10-23 02:53:04,0,0,1,NaT,1.995438e+07,4.441405e+06,0.000000,
312043,39.984683,116.318450,2008-10-23 02:53:10,0,0,1,0 days 00:00:06,1.995438e+07,4.441403e+06,3.529644,0.588274
312044,39.984686,116.318417,2008-10-23 02:53:15,0,0,1,0 days 00:00:05,1.995438e+07,4.441403e+06,2.845456,0.569091
312045,39.984688,116.318385,2008-10-23 02:53:20,0,0,1,0 days 00:00:05,1.995437e+07,4.441403e+06,2.749191,0.549838
312046,39.984655,116.318263,2008-10-23 02:53:25,0,0,1,0 days 00:00:05,1.995436e+07,4.441399e+06,11.073900,2.214780
...,...,...,...,...,...,...,...,...,...,...,...
12420282,39.866483,116.415383,2007-12-08 07:09:44,12,181,17118,0 days 00:01:05,1.996346e+07,4.428769e+06,53.807437,0.827807
12420283,39.866033,116.415683,2007-12-08 07:11:20,12,181,17118,0 days 00:01:36,1.996349e+07,4.428720e+06,56.320576,0.586673
12420284,39.865550,116.415733,2007-12-08 07:12:42,12,181,17118,0 days 00:01:22,1.996350e+07,4.428667e+06,53.978145,0.658270
12420285,39.864883,116.415750,2007-12-08 07:14:02,12,181,17118,0 days 00:01:20,1.996351e+07,4.428593e+06,74.230941,0.927887


In [17]:
crs = "epsg:2333"
lonlimits = [116.1, 116.7]
latlimits = [39.7, 40.1]
n_rows = 50

limits = gpd.points_from_xy(x=lonlimits, y=latlimits, crs="epsg:4326")
limits = limits.to_crs(crs)

In [34]:
hdf = hexagonize(gdf, n_rows=n_rows, limits=limits)

In [36]:
import numpy as np
from src.data.hex_utils import cell_distance, cell_round


def interpolate_cell_jumps(df):
    qr1 = df[["q", "r"]].shift(1).values
    qr2 = df[["q", "r"]].values

    df["cell_dist"] = cell_distance(qr1, qr2)
    df["cell_dist"] = df["cell_dist"].fillna(1)
    if not (df["cell_dist"] > 1).any():
        return df[df["cell_dist"] >= 1]
    else:
        t1 = df[["datetime"]].shift(1).values
        t2 = df[["datetime"]].values

        qri = []
        ti = []

        for i, ni in enumerate(df["cell_dist"]):
            if not np.isnan(ni) and ni > 1:
                t = np.arange(1, ni) / ni
                ti.append(t1[i] + (t2[i] - t1[i]) * t[:, None])
                qri.append(qr1[i] + (qr2[i] - qr1[i]) * t[:, None])
        new_rows = pd.DataFrame(cell_round(np.concatenate(qri)), columns=["q", "r"])
        new_rows["datetime"] = np.concatenate(ti)
        new_rows["t_idx"] = df.iloc[0]["t_idx"]
        new_rows["user"] = df.iloc[0]["user"]
        df = df[df["cell_dist"] >= 1]
        df_new = pd.concat([df, new_rows]).sort_values("datetime")
        qr1 = df_new[["q", "r"]].shift(1).values
        qr2 = df_new[["q", "r"]].values
        df_new["cell_dist"] = cell_distance(qr1, qr2)
        df_new["cell_dist"] = df_new["cell_dist"].fillna(1)
        if (df_new["cell_dist"] != 1).any():
            print("Cell dist != 1")
        return df_new

In [37]:
tmp = []
for t_idx, dft in tqdm(hdf.groupby("t_idx")):
    tmp.append(interpolate_cell_jumps(dft))

tdf = pd.concat(tmp)
tdf

100%|██████████| 16829/16829 [02:28<00:00, 112.98it/s]


Unnamed: 0,q,r,datetime,user,t_idx,cell_dist
312042,21,24,2008-10-23 02:53:04.000000000,0,1,1.0
312054,20,25,2008-10-23 02:54:05.000000000,0,1,1.0
312109,19,25,2008-10-23 02:58:40.000000000,0,1,1.0
0,18,26,2008-10-23 03:36:41.000000000,0,1,1.0
312190,17,27,2008-10-23 04:08:07.000000000,0,1,1.0
...,...,...,...,...,...,...
16,31,7,2007-12-08 05:51:36.153846153,181,17118,1.0
17,31,6,2007-12-08 05:55:15.076923076,181,17118,1.0
12420255,32,5,2007-12-08 05:58:54.000000000,181,17118,1.0
12420265,32,6,2007-12-08 06:10:43.000000000,181,17118,1.0


In [38]:
tdf[tdf['cell_dist'] == 1]

Unnamed: 0,q,r,datetime,user,t_idx,cell_dist
312042,21,24,2008-10-23 02:53:04.000000000,0,1,1.0
312054,20,25,2008-10-23 02:54:05.000000000,0,1,1.0
312109,19,25,2008-10-23 02:58:40.000000000,0,1,1.0
0,18,26,2008-10-23 03:36:41.000000000,0,1,1.0
312190,17,27,2008-10-23 04:08:07.000000000,0,1,1.0
...,...,...,...,...,...,...
16,31,7,2007-12-08 05:51:36.153846153,181,17118,1.0
17,31,6,2007-12-08 05:55:15.076923076,181,17118,1.0
12420255,32,5,2007-12-08 05:58:54.000000000,181,17118,1.0
12420265,32,6,2007-12-08 06:10:43.000000000,181,17118,1.0


In [41]:
tdf.groupby("t_idx").size().describe()

count    16829.000000
mean        21.916929
std         27.879710
min          1.000000
25%          4.000000
50%         13.000000
75%         27.000000
max        438.000000
dtype: float64

In [15]:
qr1 = tdf[["q", "r"]].shift(1).values
qr2 = tdf[["q", "r"]].values
np.unique(np.nan_to_num(cell_distance(qr1, qr2), nan=1))

array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
       13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25.,
       26., 27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38.,
       39., 40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51.,
       52., 53., 55., 57., 58., 62., 66., 67.])

In [44]:
hdf = tdf
hdf = hdf.groupby("t_idx").filter(lambda x: len(x) >= 3)

In [45]:
hdf['is_workday'] = hdf['datetime'].apply(lambda x: x.weekday() < 5)
hour_thresholds = range(0, 25, 6)

In [46]:
for idx in range(len(hour_thresholds) - 1):
    hdf[f"is_in_time_{idx}"] = hdf["datetime"].apply(
        lambda x: hour_thresholds[idx] < x.hour <= hour_thresholds[idx + 1]
    )

In [10]:
hdf.to_pickle(f"../data/processed/geolife_hex_50.pkl")

In [5]:
with open("../data/processed/geolife_hex_50.pkl", 'rb') as f:
    hdf = pickle.load(f)

In [7]:
q_max = hdf["q"].max()
tmp = []
r_max = hdf["r"].max()
for t_idx, dfi in hdf.groupby("t_idx"):
    r_idx = dfi.columns.get_loc("r")
    dfi.iloc[0, r_idx] = r_max + 1
    dfi.iloc[-1, r_idx] = r_max + 2
    tmp.append(dfi)
hdf = pd.concat(tmp)

In [4]:
hdf.groupby("user").size()

user
0      3902
1      1555
2      4447
3      9294
4      9451
       ... 
176      61
177      38
179    1988
180      25
181     166
Length: 175, dtype: int64

In [38]:
with open("../data/processed/geolife_hex_50.pkl", "rb") as f:
    hdf = pickle.load(f)

# Add start and end coordinates to each trajectory
tmp = []
q_max = hdf["q"].max()
r_max = hdf["r"].max()
for t_idx, dfi in hdf.groupby("t_idx"):
    dfnew = pd.concat([dfi.head(1), dfi, dfi.tail(1)])
    q_idx = dfnew.columns.get_loc("q")
    r_idx = dfnew.columns.get_loc("r")
    dfnew.iloc[0, q_idx] = q_max + 1
    dfnew.iloc[-1, q_idx] = q_max + 2
    dfnew.iloc[0, r_idx] = r_max + 1
    dfnew.iloc[-1, r_idx] = r_max + 2
    tmp.append(dfnew)
hdf = pd.concat(tmp)

hdf

Unnamed: 0,q,r,datetime,user,t_idx,cell_dist,is_workday,is_in_time_0,is_in_time_1,is_in_time_2,is_in_time_3
312042,-1,24,2008-10-23 02:53:04.000000000,0,1,1.0,True,True,False,False,False
312042,21,24,2008-10-23 02:53:04.000000000,0,1,1.0,True,True,False,False,False
312054,20,25,2008-10-23 02:54:05.000000000,0,1,1.0,True,True,False,False,False
312109,19,25,2008-10-23 02:58:40.000000000,0,1,1.0,True,True,False,False,False
0,18,26,2008-10-23 03:36:41.000000000,0,1,1.0,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
17,31,6,2007-12-08 05:55:15.076923076,181,17118,1.0,False,True,False,False,False
12420255,32,5,2007-12-08 05:58:54.000000000,181,17118,1.0,False,True,False,False,False
12420265,32,6,2007-12-08 06:10:43.000000000,181,17118,1.0,False,True,False,False,False
12420277,32,5,2007-12-08 07:03:03.000000000,181,17118,1.0,False,False,True,False,False


In [11]:
import torch

weights = torch.tensor(
    [[0, 10], [3, 0]], dtype=torch.float
)  # create a tensor of weights
torch.multinomial(weights, 1)

tensor([[1],
        [0]])

In [None]:
with open