In [1]:
import torch 
import numpy as np

import math

import os
import pandas as pd
import geopandas as gpd
import pickle as pickle

import shapely
from shapely import wkt
from tqdm import tqdm

from joblib import Parallel, delayed

import trackintel as ti
from trackintel.geogr.distances import calculate_distance_matrix
from sklearn.preprocessing import OrdinalEncoder

In [2]:
ti.__version__

'1.2.4'

In [3]:
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from loc_predict.models.markov import markov_transition_prob
from utils.utils import load_data

def save_pk_file(save_path, data):
    """Function to save data to pickle format given data and path."""
    with open(save_path, "wb") as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Generate datasets

In [5]:
src_min_days = 7 * 2
src_max_days = 7 * 3

tgt_min_days = 7 * 1
tgt_max_days = 7 * 2

type = "small"

In [6]:
sp = pd.read_csv(os.path.join(f"../data/sp_{type}.csv"), index_col="id")
loc = pd.read_csv(os.path.join("../data/loc_s2_level10_13.csv"), index_col="id")

sp = load_data(sp, loc)

# get all possible locations
all_locs = pd.read_csv("../data/s2_loc_visited_level10_13.csv", index_col="id")
all_locs["geometry"] = all_locs["geometry"].apply(wkt.loads)
all_locs = gpd.GeoDataFrame(all_locs, geometry="geometry", crs="EPSG:4326")
# transform to projected coordinate systems
all_locs = all_locs.to_crs("EPSG:2056")



In [7]:
def get_train_test(sp, all_locs):
    sp.sort_values(by=["user_id", "start_day", "start_min"], inplace=True)
    sp.drop(columns={"started_at", "finished_at"}, inplace=True)

    # encoder user, 0 reserved for padding
    enc = OrdinalEncoder(dtype=np.int64)
    sp["user_id"] = enc.fit_transform(sp["user_id"].values.reshape(-1, 1)) + 1

    # truncate too long duration, >2 days to 2 days
    sp.loc[sp["act_duration"] > 60 * 24 * 2 - 1, "act_duration"] = 60 * 24 * 2 - 1

    # split the datasets, user dependent 0.6, 0.2, 0.2
    train_data, vali_data, test_data = _split_dataset(sp)

    # encode unseen locations in validation and test into 0
    enc = OrdinalEncoder(dtype=np.int64, handle_unknown="use_encoded_value", unknown_value=-1).fit(
        all_locs["loc_id"].values.reshape(-1, 1)
    )
    # add 2 to account for 0 padding and seperation (1)
    all_locs["loc_id"] = enc.transform(all_locs["loc_id"].values.reshape(-1, 1)) + 2

    train_data["location_id"] = enc.transform(train_data["location_id"].values.reshape(-1, 1)) + 2
    vali_data["location_id"] = enc.transform(vali_data["location_id"].values.reshape(-1, 1)) + 2
    test_data["location_id"] = enc.transform(test_data["location_id"].values.reshape(-1, 1)) + 2

    return train_data, vali_data, test_data, all_locs


def _split_dataset(totalData):
    """Split dataset into train, vali and test."""

    def getSplitDaysUser(df):
        """Split the dataset according to the tracked day of each user."""
        maxDay = df["start_day"].max()
        train_split = maxDay * 0.6
        vali_split = maxDay * 0.8

        df["Dataset"] = "test"
        df.loc[df["start_day"] < train_split, "Dataset"] = "train"
        df.loc[
            (df["start_day"] >= train_split) & (df["start_day"] < vali_split),
            "Dataset",
        ] = "vali"

        return df

    totalData = totalData.groupby("user_id", group_keys=False).apply(getSplitDaysUser)

    train_data = totalData.loc[totalData["Dataset"] == "train"].copy()
    vali_data = totalData.loc[totalData["Dataset"] == "vali"].copy()
    test_data = totalData.loc[totalData["Dataset"] == "test"].copy()

    # final cleaning
    train_data.drop(columns={"Dataset"}, inplace=True)
    vali_data.drop(columns={"Dataset"}, inplace=True)
    test_data.drop(columns={"Dataset"}, inplace=True)

    return train_data, vali_data, test_data

train_data, vali_data, test_data, all_locs = get_train_test(sp, all_locs=all_locs)

In [8]:
train_data["id"] = np.arange(len(train_data))
vali_data["id"] = np.arange(len(vali_data))
test_data["id"] = np.arange(len(test_data))

In [18]:
f"Max loc id {all_locs.loc_id.max()}, min loc id {all_locs.loc_id.min()}, unique loc id:{all_locs.loc_id.unique().shape[0]}"

'Max loc id 14882, min loc id 2, unique loc id:14881'

In [10]:
def getValidSequenceUser(df):

    data_ls = []
    df.reset_index(drop=True, inplace=True)

    min_days = df["start_day"].min()
    df["diff_day"] = df["start_day"] - min_days

    for index, row in df.iterrows():
        # exclude the first records
        if row["diff_day"] < src_min_days:
            continue

        src_trace = df.iloc[: index + 1]
        src_trace = src_trace.loc[(src_trace["start_day"] >= (row["start_day"] - src_max_days))]

        tgt_trace = df.iloc[index + 1: ]
        tgt_trace = tgt_trace.loc[(tgt_trace["start_day"] <= (row["start_day"] + tgt_max_days))]

        if ((tgt_trace["start_day"].max() - tgt_trace["start_day"].min()) < tgt_min_days) or len(tgt_trace) == 0:
            continue

        curr_dict = {}
        curr_dict["src"] = src_trace["location_id"].values
        curr_dict["src_duration"] = src_trace["act_duration"].values.astype(int)
        curr_dict["src_user"] = src_trace["user_id"].values[0]
        curr_dict["src_weekday"] = src_trace["weekday"].values
        curr_dict["src_startmin"] = src_trace["start_min"].values


        curr_dict["tgt"] = tgt_trace["location_id"].values
        curr_dict["tgt_duration"] = tgt_trace["act_duration"].values.astype(int)
        curr_dict["tgt_weekday"] = tgt_trace["weekday"].values
        curr_dict["tgt_startmin"] = tgt_trace["start_min"].values

        data_ls.append(curr_dict)

    return data_ls

def applyParallel(dfGrouped, func, n_jobs, print_progress=True, **kwargs):
    return Parallel(n_jobs=n_jobs)(
        delayed(func)(group, **kwargs) for _, group in tqdm(dfGrouped, disable=not print_progress)
    )

def get_valid_data(df):
    valid_data = applyParallel(df.groupby("user_id"),getValidSequenceUser, n_jobs=-1)
    return [item for sublist in valid_data for item in sublist]

In [11]:
valid_train_data = get_valid_data(train_data)
valid_validation_data = get_valid_data(vali_data)
valid_test_data = get_valid_data(test_data)


100%|██████████| 500/500 [00:17<00:00, 29.04it/s]
100%|██████████| 498/498 [00:02<00:00, 218.07it/s] 
100%|██████████| 500/500 [00:02<00:00, 197.92it/s] 


In [12]:
len(valid_train_data), len(valid_validation_data), len(valid_test_data)

(125954, 28603, 29971)

In [13]:
def get_max_len(ls):

    max_src_len = 0
    max_tgt_len = 0
    for seq in ls:
        seq_len = len(seq["src"])
        tgt_len = len(seq["tgt"])

        if seq_len > max_src_len:
            max_src_len = seq_len
        if tgt_len > max_tgt_len:
            max_tgt_len = tgt_len

    return max_src_len, max_tgt_len

get_max_len(valid_train_data), get_max_len(valid_validation_data), get_max_len(valid_test_data)

((200, 156), (215, 160), (211, 132))

In [14]:
save_pk_file(os.path.join("..", "data", "diff", f"train_level10_13_{src_min_days}_{tgt_min_days}_{type}.pk"), valid_train_data)

save_pk_file(os.path.join("..", "data", "diff", f"valid_level10_13_{src_min_days}_{tgt_min_days}_{type}.pk"), valid_validation_data)

save_pk_file(os.path.join("..", "data", "diff", f"test_level10_13_{src_min_days}_{tgt_min_days}_{type}.pk"), valid_test_data)


## For test reading

In [16]:
loaded = pickle.load(open(os.path.join("..", "data", "diff", f"train_level10_13_{src_min_days}_{tgt_min_days}_tiny.pk"), "rb"))
loaded[0]


{'src': array([10700, 10700, 10516, 10516, 10516, 10700, 11214, 10700, 10700,
        10516, 10700, 10516, 10700, 10700, 10700, 10516, 10700, 11208,
        10516, 11208, 10516, 10517, 10700, 10516, 10700, 10516, 10700,
        10700, 10516], dtype=int64),
 'src_duration': array([ 692,   74,   69,  100,   39,  517,  665, 2324,  455,  626,  823,
         599, 2603,  216,  891,  624,  215,  548,  603,  837,  411,  169,
         918,  616,  811,  626,  910, 1436,  680]),
 'src_user': 1,
 'src_weekday': array([3, 4, 4, 4, 4, 4, 4, 5, 6, 0, 0, 1, 1, 3, 3, 4, 4, 4, 5, 5, 6, 6,
        6, 0, 0, 1, 1, 2, 3]),
 'src_startmin': array([1147,  411,  519,  563,  647,  728, 1224,  448, 1329,  400, 1016,
         403, 1016,  817, 1184,  402, 1017, 1189,  324,  920,  319,  693,
         924,  403, 1015,  405, 1015,  757,  525]),
 'tgt': array([10700, 10516, 10686, 10700, 11208, 10700, 10516, 10516, 10700,
        10700, 12615, 10700, 10516, 10516, 10700, 10516, 10700, 11208,
        10516, 11541, 1120

# Pairwise distance matrix

In [14]:
visited_locs = pd.read_csv("../data/s2_loc_visited_level10_13.csv", index_col="id").sort_values(by="loc_id")
visited_locs["geometry"] = visited_locs["geometry"].apply(wkt.loads)
visited_locs = gpd.GeoDataFrame(visited_locs, geometry="geometry", crs="EPSG:4326")

visited_locs = visited_locs.to_crs("EPSG:2056")

In [15]:
visited_locs.head()

Unnamed: 0_level_0,loc_id,level,geometry,freq,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5152981090339651584,10,POINT (2824875.045 1159436.528),1,POLYGON ((10.405097146225662 46.50637570678295...
1,5152984577853095936,13,POINT (2822437.857 1167754.527),38,POLYGON ((10.347820052132302 46.61871685680636...
2,5152985265047863296,13,POINT (2815796.126 1167304.569),1,POLYGON ((10.260953592346382 46.61685460820135...
3,5152985299407601664,13,POINT (2816020.303 1168362.492),1,POLYGON ((10.264376293749178 46.62629143145975...
4,5152985505566031872,13,POINT (2818994.419 1169160.037),1,POLYGON ((10.303565033130825 46.63249007077394...


In [16]:
def calculate_distance_matrix(X):
    X = shapely.get_coordinates(X.geometry)

    distance = pdist(X, 'euclidean')
    dist_matrix = squareform(distance)
    return dist_matrix
    
dist_matrix = calculate_distance_matrix(visited_locs)

In [17]:
dist_matrix.shape

(14881, 14881)

In [18]:
save_pk_file("../data/matrix/distance_13.pk", dist_matrix)