In [None]:
import torch 
import numpy as np

import os
import pandas as pd
import geopandas as gpd
import pickle as pickle

# from loc_predict.processing import _split_train_test
from utils.utils import load_data
from shapely import wkt

import powerlaw
import matplotlib.pyplot as plt

from metrics.metrics import radius_gyration, jump_length, location_frquency, wait_time

# folder with the validation inputs read further down (sp.csv, locs_s2.csv, mobis_mhsa_generation.csv)
data_dir = os.path.join("data", "validation")

def save_pk_file(save_path, data):
    """Serialize ``data`` to ``save_path`` in pickle format.

    Parameters
    ----------
    save_path : str or path-like
        Destination file. Missing parent directories are created first.
    data : object
        Any picklable object.
    """
    # open(..., "wb") raises FileNotFoundError when the parent folder is missing
    # (e.g. "./data/temp"), so create it up front. dirname is "" for a bare file
    # name, in which case there is nothing to create.
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(save_path, "wb") as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Toy 4 x 3 float tensor used by the sampling cells that follow.
# Use the GPU when one is present (as before), otherwise fall back to the CPU so the
# cell does not fail on CUDA-less machines.
scratch_device = "cuda:0" if torch.cuda.is_available() else "cpu"
x = torch.tensor([[12.9382, 11.2081, 11.0942],
        [12.9211, 11.1199, 11.0261],
        [14.3441, 13.3287, 12.6234],
        [13.7739, 13.2769, 12.5926]], device=scratch_device)

In [None]:
# row-wise cumulative distribution: scale every row to sum to 1, then accumulate along it
p = (x / x.sum(dim=-1, keepdim=True)).cumsum(dim=-1)

In [None]:
# Inverse-CDF sampling: draw one uniform number per row and look up the first column whose
# cumulative probability reaches it. The draws are created directly on p's device instead
# of on the CPU followed by a copy to a hard-coded "cuda:0".
uniform_draws = torch.rand(p.shape[0], 1, device=p.device)
# Rounding in the cumulative sum can leave p[:, -1] slightly below 1; a draw above it makes
# searchsorted return p.shape[-1], which is not a valid column. Clamp to the last column.
idx = torch.searchsorted(p, uniform_draws).clamp(max=p.shape[-1] - 1)

In [None]:
idx

In [None]:
idx.shape

In [None]:
x.shape

In [None]:
# for every row of x, pick the entry at the column index stored in idx
x.gather(dim=1, index=idx)

In [None]:
# NOTE(review): `bp` and `e` are not defined anywhere in the visible notebook, so this cell
# relies on leftover kernel state -- TODO define them here or drop the cell.
# For each row xi and its position bpi, splice `e` into the row at bpi, then stack the rows.
torch.stack([torch.cat([xi[:bpi], e, xi[bpi:]]) for xi, bpi in zip(x, bp)])

In [None]:
# Scratch inputs for the sequence experiments. Use the GPU when one is present (as before),
# otherwise fall back to the CPU so the cell does not fail on CUDA-less machines.
scratch_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# 12 x 4 integer tensor; the zeros look like padding -- TODO confirm
x = torch.tensor([[6031, 6031, 6031, 6047],
        [6031, 6031, 6031, 6046],
        [6047, 6047, 6047, 6046],
        [6046, 6046, 6046, 6012],
        [6046, 6046, 6046, 6013],
        [6012, 6012, 6012, 6008],
        [6013, 6013, 6013, 6031],
        [6008, 6008, 6008, 6031],
        [6031, 6031, 6031, 6031],
        [6031, 6031, 6031, 6046],
        [   0, 6031, 6031, 6046],
        [   0,    0, 6046,    0]], device=scratch_device)

# 4 x 1 integer tensor, one value per column of x -- presumably predicted location ids; verify
pred_loc = torch.tensor([[6031],
        [6033],
        [6031],
        [6031]], device=scratch_device)

In [None]:
x.shape

In [None]:
# Scratch tensors for the slicing experiment below. Use the GPU when one is present (as
# before), otherwise fall back to the CPU so the cell does not fail on CUDA-less machines.
scratch_device = "cuda:0" if torch.cuda.is_available() else "cpu"

# 13 x 4 integer tensor: one sequence per column
x_after = torch.tensor([[6031, 6031, 6031, 6047],
        [6031, 6031, 6031, 6046],
        [6047, 6047, 6047, 6046],
        [6046, 6046, 6046, 6012],
        [6046, 6046, 6046, 6013],
        [6012, 6012, 6012, 6008],
        [6013, 6013, 6013, 6031],
        [6008, 6008, 6008, 6031],
        [6031, 6031, 6031, 6031],
        [6031, 6031, 6031, 6046],
        [6031, 6031, 6031, 6046],
        [   0, 6033, 6046, 6031],
        [   0,    0, 6031,    0]], device=scratch_device)
# per-column start / stop positions used to slice x_after in the following cells
x_len_before = torch.tensor([9, 10, 11, 10], device=scratch_device)
x_len_after = torch.tensor([11, 12, 13, 12], device=scratch_device)

In [None]:
x_after

In [None]:
# for every sequence (a column of x_after) keep the slice [x_len_before, x_len_after)
gen = torch.stack(
    [
        sequence[start:stop]
        for sequence, start, stop in zip(x_after.transpose(1, 0), x_len_before, x_len_after)
    ]
)

In [None]:
gen

In [None]:
torch.cat([gen, gen], dim=0)

In [None]:
# debug view of the first sequence only: the full column with its bounds, then the slice
first_only = list(zip(x_after.transpose(1, 0), x_len_before, x_len_after))[:1]
for xi, x_len_beforei, x_len_afteri in first_only:
    print(xi, x_len_beforei, x_len_afteri)
    print(xi[x_len_beforei:x_len_afteri])

## Validation


In [None]:

# read and preprocess
sp = pd.read_csv(os.path.join(data_dir, "sp.csv"), index_col="id")
loc = pd.read_csv(os.path.join(data_dir, "locs_s2.csv"), index_col="id")
sp = load_data(sp, loc)

# NOTE(review): `_split_train_test` is not imported in the visible notebook (its import in
# the first cell is commented out), so the name must come from elsewhere -- TODO confirm.
train_data, vali_data, test_data = _split_train_test(sp)

# attach every location's "center" (WKT text) to the test rows, matched on location_id
loc_centers = loc.reset_index()[["id", "center"]].rename(columns={"id": "location_id"})
test_data = test_data.merge(loc_centers, how="left", on="location_id")

# parse the WKT text and promote the table to a GeoDataFrame in EPSG:4326
test_data = test_data.rename(columns={"center": "geometry"})
test_data["geometry"] = test_data["geometry"].apply(wkt.loads)
test_data = gpd.GeoDataFrame(test_data, geometry="geometry", crs="EPSG:4326")


In [None]:
test_data

In [None]:
# jump lengths of the real test data; plotted with the label "data" in the figures below
metric = jump_length(test_data)

In [None]:
# Axis labels contain LaTeX commands. Raw strings keep "\D" and "\," literal instead of
# relying on Python passing unknown escape sequences through (which triggers an
# invalid-escape warning on recent Python versions). The string values are unchanged.
xlabel = r"$\Delta r\,(m)$"
ylabel = r"$P(\Delta r)$"
# lower cut-off: used to filter the jump lengths and as xmin of the power-law fit
xmin = 1

In [None]:
# discard jumps that are not strictly longer than the cut-off
above_cutoff = metric > xmin
metric = metric[above_cutoff]

In [None]:
# fit power law
fit = powerlaw.Fit(metric, xmin=xmin)

# plotting: empirical PDF first, then one dashed curve per candidate distribution
powerlaw.plot_pdf(metric, label="data")
candidate_curves = (
    ("power_law", "powerlaw fit"),
    ("truncated_power_law", "truncated power law"),
    ("lognormal", "lognormal fit"),
)
for distribution_name, curve_label in candidate_curves:
    # attribute lookup happens inside the loop, so the curves are produced in the listed order
    getattr(fit, distribution_name).plot_pdf(linestyle="--", label=curve_label)

legend_font = {"size": 13}
plt.legend(prop=legend_font)
plt.xlabel(xlabel, fontsize=16)
plt.ylabel(ylabel, fontsize=16)

plt.show()

In [None]:
# load the generated sequences; later cells read its "generated_ls", "seq_id" and "user_id" columns
simulated_sp = pd.read_csv(os.path.join(data_dir, "mobis_mhsa_generation.csv"))
simulated_sp

In [None]:
# look up the "center" of every generated location id (generated_ls) in the location table
loc_centers = loc.reset_index()[["id", "center"]].rename(columns={"id": "location_id"})
simulated_sp = pd.merge(
    simulated_sp,
    loc_centers,
    how="left",
    left_on="generated_ls",
    right_on="location_id",
)

In [None]:
# parse the WKT centers into geometries and promote the table to a GeoDataFrame (EPSG:4326)
simulated_sp = simulated_sp.rename(columns={"center": "geometry"})
simulated_sp = simulated_sp.assign(geometry=simulated_sp["geometry"].apply(wkt.loads))
simulated_sp = gpd.GeoDataFrame(simulated_sp, geometry="geometry", crs="EPSG:4326")

In [None]:
# drop the original user and raw location columns, then let each generated sequence
# (seq_id) take over the "user_id" column
simulated_sp = simulated_sp.drop(columns=["user_id", "generated_ls"])
simulated_sp = simulated_sp.rename(columns={"seq_id": "user_id"})

In [None]:
# jump lengths of the generated data; plotted with the label "simulated" below
simulated_jl = jump_length(simulated_sp)

In [None]:
# Axis labels contain LaTeX commands. Raw strings keep "\D" and "\," literal instead of
# relying on Python passing unknown escape sequences through (which triggers an
# invalid-escape warning on recent Python versions). The string values are unchanged.
xlabel = r"$\Delta r\,(m)$"
ylabel = r"$P(\Delta r)$"
# lower cut-off applied to the jump lengths
xmin = 1
# discard simulated jumps that are not strictly longer than the cut-off
simulated_above_cutoff = simulated_jl > xmin
simulated_jl = simulated_jl[simulated_above_cutoff]

In [None]:
# fit power law
# fit = powerlaw.Fit(simulated_jl, xmin=xmin)

# plotting: overlay the empirical PDFs of the real and the generated jump lengths
for jump_lengths, curve_label in ((metric, "data"), (simulated_jl, "simulated")):
    powerlaw.plot_pdf(jump_lengths, label=curve_label)

# fit.power_law.plot_pdf(linestyle="--", label="powerlaw fit")
# fit.truncated_power_law.plot_pdf(linestyle="--", label="truncated power law")
# fit.lognormal.plot_pdf(linestyle="--", label="lognormal fit")

legend_font = {"size": 13}
plt.legend(prop=legend_font)
plt.xlabel(xlabel, fontsize=16)
plt.ylabel(ylabel, fontsize=16)

plt.show()

### Generate small dataset

In [None]:
# reload the sp table straight from disk; the load_data preprocessing is left disabled here
sp = pd.read_csv(os.path.join(data_dir, "sp.csv"), index_col="id")
# loc = pd.read_csv(os.path.join(data_dir, "locs_s2.csv"), index_col="id")
# sp = load_data(sp, loc)

In [None]:
sp

In [None]:
# Draw a reproducible random subset of users for the small dataset.
N_SMALL_USERS = 100
SMALL_SAMPLE_SEED = 42  # fixed seed so a re-run of the notebook selects the same users

unique_users = sp["user_id"].unique()
selected_user = np.random.default_rng(SMALL_SAMPLE_SEED).choice(
    unique_users,
    # sampling without replacement raises when more users are requested than exist,
    # so cap the sample size at the number of available users
    size=min(N_SMALL_USERS, len(unique_users)),
    replace=False,
)

In [None]:
selected_user

In [None]:
# keep only the rows of the sampled users and write them out as the small dataset
belongs_to_selected = sp["user_id"].isin(selected_user)
sp.loc[belongs_to_selected].to_csv("./data/sp_small.csv")

## Calculate pairwise distances

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from trackintel.geogr.distances import calculate_distance_matrix
import pickle as pickle

In [None]:
# read the location table and turn its WKT "geometry" column into real geometries (EPSG:4326)
all_locs = pd.read_csv("./data/all_locations.csv", index_col="id")
all_locs = gpd.GeoDataFrame(
    all_locs.assign(geometry=all_locs["geometry"].apply(wkt.loads)),
    geometry="geometry",
    crs="EPSG:4326",
)

In [None]:
# fit an ordinal encoder on the raw location ids; ids unknown to the encoder map to -1
raw_loc_ids = all_locs["loc_id"].values.reshape(-1, 1)
enc = OrdinalEncoder(dtype=np.int64, handle_unknown="use_encoded_value", unknown_value=-1)
enc = enc.fit(raw_loc_ids)
# add 1 to account for 0 padding
all_locs["loc_id"] = enc.transform(raw_loc_ids) + 1

In [None]:
all_locs

In [None]:
# %%time

# pairwise haversine distances between all locations
# (n_jobs=-1 presumably requests all available cores -- verify against trackintel)
# calculate_distance_matrix(all_locs, all_locs[:256], dist_metric="haversine", n_jobs=-1)
dist_matrix = calculate_distance_matrix(all_locs, dist_metric="haversine", n_jobs=-1)

In [None]:
dist_matrix.shape

In [None]:

# persist the pairwise distance matrix as a pickle file
save_pk_file("./data/temp/dist_matrix.pk", dist_matrix)

## Calculate empirical matrix

In [None]:
from loc_predict.models.markov import markov_transition_prob
from utils.utils import load_data
from utils.dataloader import get_train_test
from tqdm import tqdm

In [None]:
# read the sp and location tables, then run the shared preprocessing
sp = pd.read_csv(os.path.join(".", "data", "sp.csv"), index_col="id")
loc = pd.read_csv(os.path.join(".", "data", "locs_s2.csv"), index_col="id")
sp = load_data(sp, loc)

# location table with its WKT "geometry" column parsed into geometries (EPSG:4326)
all_locs = pd.read_csv(os.path.join(".", "data", "all_locations.csv"), index_col="id")
all_locs = gpd.GeoDataFrame(
    all_locs.assign(geometry=all_locs["geometry"].apply(wkt.loads)),
    geometry="geometry",
    crs="EPSG:4326",
)

In [None]:
# split into train / validation / test; get_train_test also hands back a location table,
# which replaces all_locs from here on
train_data, vali_data, test_data, all_locs = get_train_test(sp, all_locs=all_locs)

In [None]:
# run markov_transition_prob (with n=1) on the training rows of every user separately
per_user_transitions = train_data.groupby("user_id").apply(markov_transition_prob, n=1)
# move the group index back into columns and discard the leftover "level_1" column
transit_df = per_user_transitions.reset_index().drop(columns="level_1")

In [None]:
# Count transitions between locations: emp_matrix[i, j] is the number of rows in transit_df
# whose ("loc_1", "toLoc") pair equals (i, j).
# NOTE(review): if location ids are 1-based (0 reserved for padding, as in the encoding cell
# earlier in this notebook), an id equal to len(all_locs) would fall outside this matrix --
# TODO confirm whether the size should be all_locs.loc_id.max() + 1.
emp_matrix = np.zeros((len(all_locs), len(all_locs)))

transition_pairs = transit_df[["loc_1", "toLoc"]].values
# np.add.at is unbuffered, so repeated (i, j) pairs are each counted; this replaces the
# per-row Python loop with a single vectorized call.
np.add.at(emp_matrix, (transition_pairs[:, 0], transition_pairs[:, 1]), 1)

In [None]:
# persist the empirical transition-count matrix as a pickle file
save_pk_file("./data/temp/emp_matrix.pk", emp_matrix)

## Check metric calculation

In [None]:
from metrics.evaluations import Metric
from utils.utils import load_data, setup_seed, load_config, init_save_path
from easydict import EasyDict as edict
from utils.dataloader import get_train_test, _get_valid_sequence

import pandas as pd
import numpy as np
import geopandas as gpd
from shapely import wkt

In [None]:
# initialization
config = load_config("./config/movesim.yml")
config = edict(config)

# read and preprocess
sp = pd.read_csv(os.path.join(config.temp_save_root, "sp.csv"), index_col="id")
loc = pd.read_csv(os.path.join(config.temp_save_root, "locs_s2.csv"), index_col="id")
sp = load_data(sp, loc)

all_locs = pd.read_csv(os.path.join(config.temp_save_root, "all_locations.csv"), index_col="id")
all_locs["geometry"] = all_locs["geometry"].apply(wkt.loads)
all_locs = gpd.GeoDataFrame(all_locs, geometry="geometry", crs="EPSG:4326")
# transform to projected coordinate systems
all_locs = all_locs.to_crs("EPSG:2056")

train_data, vali_data, test_data, all_locs = get_train_test(sp, all_locs=all_locs)

config["total_loc_num"] = int(all_locs.loc_id.max() + 1)
config["total_user_num"] = int(train_data.user_id.max() + 1)

In [None]:
# give every split a fresh 0..n-1 running "id" column
for split_frame in (train_data, vali_data, test_data):
    split_frame["id"] = np.arange(len(split_frame))

# the same options are used for all three splits
sequence_options = dict(print_progress=config.verbose, previous_day=config.previous_day)
train_idx = _get_valid_sequence(train_data, **sequence_options)
vali_idx = _get_valid_sequence(vali_data, **sequence_options)
test_idx = _get_valid_sequence(test_data, **sequence_options)


In [None]:
# build the evaluator on the validation split and its valid sequence indices
metrics = Metric(config, locations=all_locs, input_data=vali_data, valid_start_end_idx=vali_idx)

In [None]:
# location-id array of every valid training sequence (rows bounds[0] .. bounds[1]-1)
train_seq = [
    train_data.iloc[bounds[0] : bounds[1]]["location_id"].values
    for bounds in train_idx
]

In [None]:
# location-id array of every valid test sequence (rows bounds[0] .. bounds[1]-1)
test_seq = [
    test_data.iloc[bounds[0] : bounds[1]]["location_id"].values
    for bounds in test_idx
]

In [None]:
# evaluate the test sequences with the Metric object and print the first five scores
jsds = metrics.get_individual_jsds(gene_data=test_seq)
report_template = "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}"
print(report_template.format(*(jsds[position] for position in range(5))))

In [None]:
# evaluate the training sequences with the Metric object and print the first five scores
jsds = metrics.get_individual_jsds(gene_data=train_seq)
report_template = "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}"
print(report_template.format(*(jsds[position] for position in range(5))))