In [1]:
import torch 
import numpy as np

import os
import pandas as pd
import geopandas as gpd
import pickle as pickle

# from loc_predict.processing import _split_train_test

from shapely import wkt

import powerlaw
import matplotlib.pyplot as plt

# from metrics.metrics import radius_gyration, jump_length, location_frquency, wait_time
from utils.utils import load_data

# Directory (relative to the working directory) from which the validation CSV is read below.
data_dir = os.path.join("data", "validation")

def save_pk_file(save_path, data):
    """Serialize ``data`` into a pickle file located at ``save_path``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten. The object is written with the highest pickle
    protocol supported by the running interpreter.
    """
    with open(save_path, mode="wb") as out_file:
        writer = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(data)


# Validation


In [None]:
# Read mobis_mhsa_generation.csv from data_dir into a DataFrame and display it.
simulated_sp = pd.read_csv(os.path.join(data_dir, "mobis_mhsa_generation.csv"))
simulated_sp

In [None]:
# Attach each location's "center" to the simulated records by matching
# generated_ls against the location id (exposed as "location_id" for the join).
# how="left" keeps every simulated row; rows without a matching location get a
# missing "center".
# NOTE(review): `loc` is only assigned in a later cell (the locs_s2.csv read), so
# this cell fails on a fresh top-to-bottom run — confirm the intended cell order.
simulated_sp = simulated_sp.merge(
    loc.reset_index()[["id", "center"]].rename(columns={"id": "location_id"}), how="left", left_on="generated_ls", right_on="location_id"
)

In [None]:
# Expose "center" under the name "geometry", parse its WKT strings into shapely
# geometries, and wrap the table as a GeoDataFrame declared in EPSG:4326.
simulated_sp = gpd.GeoDataFrame(
    simulated_sp.rename(columns={"center": "geometry"}).assign(
        geometry=lambda frame: frame["geometry"].apply(wkt.loads)
    ),
    geometry="geometry",
    crs="EPSG:4326",
)

In [None]:
# Discard the original user_id and generated_ls columns, then expose seq_id
# under the name user_id.
simulated_sp = simulated_sp.drop(columns=["user_id", "generated_ls"]).rename(
    columns={"seq_id": "user_id"}
)

# Validate metric calculation

In [2]:
from metrics.evaluations import Metric
from utils.utils import load_data, setup_seed, load_config
from easydict import EasyDict as edict
# from utils.dataloader import get_train_test, _get_valid_sequence

import pandas as pd
import numpy as np
import os
import geopandas as gpd
from shapely import wkt

In [52]:
# initialization: load the movesim YAML config and wrap it in an EasyDict so
# entries can be read as attributes (config.temp_save_root, config.verbose, ...)
config = load_config("./config/movesim.yml")
config = edict(config)

# read and preprocess: both tables come from config.temp_save_root and are
# combined by the project helper load_data
sp = pd.read_csv(os.path.join(config.temp_save_root, "sp.csv"), index_col="id")
loc = pd.read_csv(os.path.join(config.temp_save_root, "locs_s2.csv"), index_col="id")
sp = load_data(sp, loc)

# parse the WKT "geometry" column and build a GeoDataFrame declared in EPSG:4326
all_locs = pd.read_csv(os.path.join(config.temp_save_root, "all_locations.csv"), index_col="id")
all_locs["geometry"] = all_locs["geometry"].apply(wkt.loads)
all_locs = gpd.GeoDataFrame(all_locs, geometry="geometry", crs="EPSG:4326")
# transform to projected coordinate systems
all_locs = all_locs.to_crs("EPSG:2056")

# NOTE(review): get_train_test is not imported in any visible cell — the line
# `from utils.dataloader import get_train_test, _get_valid_sequence` is commented
# out in the import cell above, so this raises NameError on a fresh kernel.
# Confirm the module path and restore the import.
# all_locs is rebound to the version returned by the helper.
train_data, vali_data, test_data, all_locs = get_train_test(sp, all_locs=all_locs)

# sizes are taken as (largest id + 1); assumes ids are 0-based — TODO confirm
config["total_loc_num"] = int(all_locs.loc_id.max() + 1)
config["total_user_num"] = int(train_data.user_id.max() + 1)

In [53]:
# Give every split a fresh "id" column counting 0..len-1.
train_data["id"] = np.arange(len(train_data))
vali_data["id"] = np.arange(len(vali_data))
test_data["id"] = np.arange(len(test_data))

# NOTE(review): _get_valid_sequence is not imported in any visible cell (its
# import is commented out in the import cell above); restore it before running
# on a fresh kernel.
# Later cells slice with idx[0]:idx[1], so each returned entry is presumably a
# (start, end) pair — confirm against the helper.
train_idx = _get_valid_sequence(train_data, print_progress=config.verbose, previous_day=config.previous_day)
vali_idx = _get_valid_sequence(vali_data, print_progress=config.verbose, previous_day=config.previous_day)
test_idx = _get_valid_sequence(test_data, print_progress=config.verbose, previous_day=config.previous_day)


100%|██████████| 2094/2094 [00:41<00:00, 50.11it/s]
100%|██████████| 2088/2088 [00:11<00:00, 188.71it/s]
100%|██████████| 2094/2094 [00:10<00:00, 199.15it/s]


In [5]:
# Work on a separate frame indexed by "id" (train_data itself is left untouched),
# then collect the location_id values of every [idx[0], idx[1]) positional window.
temp_data = train_data.set_index("id")

train_seq = [temp_data["location_id"].iloc[idx[0] : idx[1]].values for idx in train_idx]

In [6]:
# Work on a separate frame indexed by "id" (test_data itself is left untouched),
# then collect the location_id values of every [idx[0], idx[1]) positional window.
temp_data = test_data.set_index("id")

test_seq = [temp_data["location_id"].iloc[idx[0] : idx[1]].values for idx in test_idx]

In [7]:
# Build the metric evaluator from the validation split and its valid-sequence indices.
# NOTE(review): this call passes config / input_data / valid_start_end_idx, while
# the later Metric(...) calls in this notebook pass max_locs / reference_data —
# confirm which form matches the current metrics.evaluations.Metric.
metrics = Metric(config, locations=all_locs, input_data=vali_data, valid_start_end_idx=vali_idx)

In [8]:
# Score the test sequences with the current `metrics` object and print the first
# five returned values, labelled distance / rg / period / topk all / topk
# (presumably Jensen-Shannon divergences, going by the method name — confirm).
jsds = metrics.get_individual_jsds(gene_data=test_seq)
print(
    "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}".format(
        jsds[0], jsds[1], jsds[2], jsds[3], jsds[4]
    )
)

Metric: distance 0.091, rg 0.081, period 0.024, topk all 0.005, topk 0.000


In [9]:
# Same scoring as for the test sequences, now for the training sequences; the
# first five returned values are printed with the same labels.
jsds = metrics.get_individual_jsds(gene_data=train_seq)
print(
    "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}".format(
        jsds[0], jsds[1], jsds[2], jsds[3], jsds[4]
    )
)

Metric: distance 0.092, rg 0.071, period 0.021, topk all 0.000, topk 0.288


## Diffusion sequence

In [3]:
from sklearn.preprocessing import OrdinalEncoder
from metrics.evaluations import Metric

In [4]:
# initialization: read the diffusion YAML settings and wrap them in an EasyDict in one step
config = edict(load_config("./config/diff.yml"))

In [12]:
# Load the pickled "tiny" test records; the file name is assembled from
# config.level, config.src_min_days and config.tgt_min_days.
path = f"{config.data_dir}/test_level{config.level}_{config.src_min_days}_{config.tgt_min_days}_tiny.pk"
# The context manager closes the file handle, which the bare
# pickle.load(open(...)) form left open.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
with open(path, "rb") as handle:
    sequence_ls = pickle.load(handle)

# Split every record into its target ("tgt") and source ("src") sequence.
tgt_tiny = [record["tgt"] for record in sequence_ls]
src_tiny = [record["src"] for record in sequence_ls]

In [6]:
# Load the pickled "tiny" training records; the file name is assembled from
# config.level, config.src_min_days and config.tgt_min_days.
path = f"{config.data_dir}/train_level{config.level}_{config.src_min_days}_{config.tgt_min_days}_tiny.pk"
# The context manager closes the file handle, which the bare
# pickle.load(open(...)) form left open.
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
with open(path, "rb") as handle:
    sequence_ls = pickle.load(handle)

# Only the target ("tgt") sequence of each training record is kept.
train_tiny = [record["tgt"] for record in sequence_ls]

In [7]:
# Read the location table, parse its WKT "geometry" column, declare the result as
# EPSG:4326 and immediately reproject it to the projected CRS EPSG:2056.
all_locs = pd.read_csv(os.path.join("data", "s2_loc_visited_level10_13.csv"), index_col="id")
all_locs = gpd.GeoDataFrame(
    all_locs.assign(geometry=all_locs["geometry"].apply(wkt.loads)),
    geometry="geometry",
    crs="EPSG:4326",
).to_crs("EPSG:2056")

In [8]:
# Fit an ordinal encoder on the loc_id column (values unseen at fit time would
# be encoded as -1).
enc = OrdinalEncoder(dtype=np.int64, handle_unknown="use_encoded_value", unknown_value=-1)
enc.fit(all_locs["loc_id"].values.reshape(-1, 1))

# Replace loc_id by its encoded value shifted up by 2, so the smallest id becomes 2
# (presumably 0 and 1 are reserved for special tokens — confirm).
all_locs["loc_id"] = enc.transform(all_locs["loc_id"].values.reshape(-1, 1)) + 2

In [9]:
# Display the location table after loc_id has been re-encoded.
all_locs

Unnamed: 0_level_0,loc_id,level,geometry,freq,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2,10,POINT (2824875.045 1159436.528),1,POLYGON ((10.405097146225662 46.50637570678295...
1,3,13,POINT (2822437.857 1167754.527),38,POLYGON ((10.347820052132302 46.61871685680636...
2,4,13,POINT (2815796.126 1167304.569),1,POLYGON ((10.260953592346382 46.61685460820135...
3,5,13,POINT (2816020.303 1168362.492),1,POLYGON ((10.264376293749178 46.62629143145975...
4,6,13,POINT (2818994.419 1169160.037),1,POLYGON ((10.303565033130825 46.63249007077394...
...,...,...,...,...,...
14876,14878,13,POINT (2767507.842 1213073.682),1,"POLYGON ((9.64763962539829 47.042237657717536,..."
14877,14879,13,POINT (2768419.907 1212986.551),1,"POLYGON ((9.65960383635977 47.04122268391671, ..."
14878,14880,13,POINT (2769332.126 1212899.406),1,"POLYGON ((9.671569607102107 47.04020629289218,..."
14879,14881,13,POINT (2770244.499 1212812.246),1,"POLYGON ((9.68353693615797 47.039188484166594,..."


In [10]:
# Build the metric evaluator with the training targets (train_tiny) as reference data.
metrics = Metric(max_locs=config.max_location, locations=all_locs, reference_data=train_tiny)

In [11]:
# Score the test targets (tgt_tiny) with the current `metrics` object and print
# the first five returned values, labelled distance / rg / period / topk all / topk.
jsds = metrics.get_individual_jsds(gene_data=tgt_tiny)
print(
    "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}".format(
        jsds[0], jsds[1], jsds[2], jsds[3], jsds[4]
    )
)

Metric: distance 0.252, rg 0.237, period 0.227, topk all 0.023, topk 0.790


In [13]:
# Score the test sources (src_tiny) with the current `metrics` object and print
# the first five returned values, labelled distance / rg / period / topk all / topk.
jsds = metrics.get_individual_jsds(gene_data=src_tiny)
print(
    "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}".format(
        jsds[0], jsds[1], jsds[2], jsds[3], jsds[4]
    )
)

Metric: distance 0.246, rg 0.283, period 0.333, topk all 0.027, topk 0.833


In [14]:
# Rebuild the metric evaluator, this time with the test sources (src_tiny) as reference data.
metrics = Metric(max_locs=config.max_location, locations=all_locs, reference_data=src_tiny)

In [15]:
# Score the test targets (tgt_tiny) with the current `metrics` object and print
# the first five returned values, labelled distance / rg / period / topk all / topk.
jsds = metrics.get_individual_jsds(gene_data=tgt_tiny)
print(
    "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}".format(
        jsds[0], jsds[1], jsds[2], jsds[3], jsds[4]
    )
)

Metric: distance 0.069, rg 0.294, period 0.300, topk all 0.018, topk 0.745


In [81]:
import json

# Length every generated ("recover") sequence is cut to; shorter ones are skipped.
RECOVER_LEN = 50

# Built with os.path.join instead of a backslash literal: "\m" and "\s" are
# invalid escape sequences (DeprecationWarning since Python 3.6, SyntaxWarning
# since 3.12) and hard-coded backslashes only resolve on Windows.
eval_path = os.path.join("runs", "mobis_diffseq_evaluate", "seed101_step500.json")

predict_ls = []
true_ls = []
src_ls = []
with open(eval_path, "r") as f_reader:
    # The file is consumed as JSON Lines: one JSON object per line.
    for row in f_reader:
        if not row.strip():
            # tolerate blank lines instead of failing inside json.loads
            continue
        content = json.loads(row)

        # keep the source sequence without its last element
        src_ls.append(np.array(content["source"])[:-1])

        # cut the reference at its first 0 (presumably padding / an end marker —
        # confirm); a reference that contains no 0 is kept whole
        reference_arr = np.array(content["reference"])
        zero_positions = np.where(reference_arr == 0)[0]
        if zero_positions.size > 0:
            reference_arr = reference_arr[: zero_positions[0]]
        true_ls.append(reference_arr)

        recovered = content["recover"]
        if len(recovered) >= RECOVER_LEN:
            predict_ls.append(np.array(recovered[:RECOVER_LEN]).squeeze())
        else:
            # Report the length of every generation that is too short.
            # NOTE: such records are only left out of predict_ls, so it ends up
            # shorter than src_ls / true_ls and is not index-aligned with them.
            print(len(recovered))

18
44
41
47
46


In [83]:
# Rebuild the metric evaluator with the source sequences read from the evaluation file as reference data.
metrics = Metric(max_locs=config.max_location, locations=all_locs, reference_data=src_ls)

In [85]:
# Score the (truncated) reference sequences with the current `metrics` object and
# print the first five returned values, labelled distance / rg / period / topk all / topk.
jsds = metrics.get_individual_jsds(gene_data=true_ls)
print(
    "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}".format(
        jsds[0], jsds[1], jsds[2], jsds[3], jsds[4]
    )
)

Metric: distance 0.046, rg 0.203, period 0.281, topk all 0.016, topk 0.745


In [86]:
# Score the generated ("recover") sequences with the current `metrics` object and
# print the first five returned values, labelled distance / rg / period / topk all / topk.
jsds = metrics.get_individual_jsds(gene_data=predict_ls)
print(
    "Metric: distance {:.3f}, rg {:.3f}, period {:.3f}, topk all {:.3f}, topk {:.3f}".format(
        jsds[0], jsds[1], jsds[2], jsds[3], jsds[4]
    )
)

Metric: distance 0.394, rg 0.462, period 0.804, topk all 0.030, topk 0.833
