In [165]:
import torch 
import numpy as np

import os
import pandas as pd
import geopandas as gpd
import pickle as pickle

# from loc_predict.processing import _split_train_test
from shapely import wkt
from tqdm import tqdm
from scipy.spatial.distance import pdist, squareform

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel
import gensim

import trackintel as ti
from trackintel.geogr.distances import calculate_distance_matrix

from loc_predict.models.markov import markov_transition_prob
from utils.utils import load_data
from utils.dataloader import get_train_test

def save_pk_file(save_path, data):
    """Serialize ``data`` to ``save_path`` with pickle.

    The target is opened in binary write mode, so an existing file is overwritten.
    The highest pickle protocol of the running interpreter is used.
    """
    with open(save_path, "wb") as out_file:
        pickler = pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(data)


# Pairwise distance matrix

In [205]:
# Load the location table and parse the WKT text in "geometry" into shapely
# objects, then wrap it as a GeoDataFrame in WGS84.
all_locs = pd.read_csv("./data/test/all_locations.csv", index_col="id").assign(
    geometry=lambda table: table["geometry"].apply(wkt.loads)
)
all_locs = gpd.GeoDataFrame(all_locs, geometry="geometry", crs="EPSG:4326")

In [206]:
# Preview the parsed locations table.
all_locs

Unnamed: 0_level_0,loc_id,level,geometry,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5152984577853095936,13,POINT (10.34358 46.62398),POLYGON ((10.347820052132302 46.61871685680636...
1,5152985557105639424,12,POINT (10.29508 46.64301),POLYGON ((10.303565033130825 46.63249007077394...
2,5152985677364723712,13,POINT (10.26355 46.64099),"POLYGON ((10.26780082511594 46.63572990632601,..."
3,5152985711724462080,13,POINT (10.26698 46.65043),POLYGON ((10.271227187809282 46.64517003239574...
4,5152988254345101312,13,POINT (10.44252 46.62467),POLYGON ((10.446753699334618 46.61940323409345...
...,...,...,...,...
8959,5159785125990367232,13,POINT (9.53321 47.34228),"POLYGON ((9.53761425405036 47.33697471681336, ..."
8960,5159785297789059072,13,POINT (9.55174 47.36047),POLYGON ((9.556149844349989 47.355166963815904...
8961,5159785486767620096,12,POINT (9.58027 47.35314),"POLYGON ((9.589080312384539 47.34252563756386,..."
8962,5159785761645527040,12,POINT (9.59788 47.33191),POLYGON ((9.606683428932946 47.321300042067506...


In [207]:
# %%time

# Haversine distance between every pair of locations in all_locs.
# Variant kept for reference: distances against only the first 256 locations.
# calculate_distance_matrix(all_locs, all_locs[:256], dist_metric="haversine", n_jobs=-1)
dist_matrix = calculate_distance_matrix(all_locs, dist_metric="haversine", n_jobs=-1)

In [208]:
# Sanity check: one row and one column per location.
dist_matrix.shape

(8964, 8964)

In [34]:

# Persist the pairwise distance matrix.
save_pk_file(save_path="./data/temp/dist_matrix_test.pk", data=dist_matrix)

# Empirical visit matrix

In [7]:
# Raw "sp" and "locs_s2" tables, combined by the project helper load_data.
data_dir = os.path.join(".", "data")
sp = pd.read_csv(os.path.join(data_dir, "sp.csv"), index_col="id")
loc = pd.read_csv(os.path.join(data_dir, "locs_s2.csv"), index_col="id")
sp = load_data(sp, loc)

# Full location table, WKT geometries parsed and wrapped as a WGS84 GeoDataFrame.
all_locs = pd.read_csv(os.path.join(data_dir, "test", "all_locations.csv"), index_col="id")
all_locs["geometry"] = all_locs["geometry"].map(wkt.loads)
all_locs = gpd.GeoDataFrame(all_locs, geometry="geometry", crs="EPSG:4326")

In [8]:
# get_train_test returns three splits plus a location table; the latter
# replaces the all_locs loaded above.
# NOTE(review): presumably the returned all_locs is aligned with the location
# ids used inside the splits -- confirm in utils.dataloader.
train_data, vali_data, test_data, all_locs = get_train_test(sp, all_locs=all_locs)

In [9]:
# Apply the project's markov_transition_prob (n=1) to each user's training
# records, then flatten: user_id becomes a column and the helper's inner index
# ("level_1") is discarded.
per_user_transitions = train_data.groupby("user_id").apply(markov_transition_prob, n=1)
transit_df = per_user_transitions.reset_index().drop(columns="level_1")

In [10]:
# Square count matrix: every row of transit_df adds 1 to the cell
# [loc_1, toLoc] (presumably origin -> destination; the values are used directly
# as row/column positions).
n_locs = len(all_locs)
emp_matrix = np.zeros((n_locs, n_locs))

transition_pairs = transit_df[["loc_1", "toLoc"]].values
for origin, destination in tqdm(transition_pairs):
    emp_matrix[origin, destination] += 1

100%|██████████| 179399/179399 [00:00<00:00, 1117262.49it/s]


In [11]:
# Sanity check: square, one row/column per entry of all_locs.
emp_matrix.shape

(8964, 8964)

In [42]:
# Persist the empirical transition-count matrix.
save_pk_file(save_path="./data/temp/emp_matrix_test.pk", data=emp_matrix)

# POI distribution
## Read location file

In [68]:
# Reload the locations, this time keeping the polygon ("area") instead of the
# point geometry, and reproject from WGS84 to EPSG:2056.
all_locs = (
    pd.read_csv("./data/test/all_locations.csv", index_col="id")
    .drop(columns="geometry")
    .assign(area=lambda table: table["area"].apply(wkt.loads))
)
all_locs = gpd.GeoDataFrame(all_locs, geometry="area", crs="EPSG:4326").to_crs("EPSG:2056")

## Read POI file and preprocess

In [60]:
def _read_poi_files():
    """Read the OSM shapefile layers in ``data/poi/ori`` and stack them into one table.

    Layers and filters:
      * pofw, transport, pois: every feature is kept.
      * natural (points): everything except fclass "tree".
      * natural_a (areas): only fclass "beach"; geometry replaced by the centroid.
      * traffic (points) and traffic_a (areas): only the four parking classes;
        the area geometries are replaced by their centroid.
      * buildings_a: geometry replaced by the centroid, and "code" is rebuilt
        from the "type" column: one number per distinct type starting at 1002,
        with rows whose code comes out as NaN set to 1001.

    Centroids are computed in EPSG:2056 and converted to EPSG:4326.

    Returns
    -------
    The ``pd.concat`` of all layers. The per-layer row indices are kept as-is,
    so the index of the result is not reset.
    """
    parking_classes = [
        "parking",
        "parking_bicycle",
        "parking_underground",
        "parking_multistorey",
    ]

    def _load_layer(file_name):
        # all raw layers live in the same folder
        return gpd.read_file(os.path.join("data", "poi", "ori", file_name))

    def _parking_only(layer):
        # re-indexed, independent copy restricted to the parking classes
        return layer.loc[layer["fclass"].isin(parking_classes)].reset_index(drop=True).copy()

    def _centroids(layer):
        # centroid in the projected CRS, expressed again in EPSG:4326
        return layer.to_crs("EPSG:2056").geometry.centroid.to_crs("EPSG:4326")

    # places of worship and transport stops: kept unfiltered
    pofw_file = _load_layer("gis_osm_pofw_free_1.shp")
    transport_file = _load_layer("gis_osm_transport_free_1.shp")

    # natural points without trees; natural areas reduced to beaches
    natural_file = _load_layer("gis_osm_natural_free_1.shp")
    natural_file = natural_file.loc[natural_file["fclass"] != "tree"]

    natural_a_file = _load_layer("gis_osm_natural_a_free_1.shp")
    natural_a_file = natural_a_file.loc[natural_a_file["fclass"].isin(["beach"])].reset_index(drop=True)
    natural_a_file["geometry"] = _centroids(natural_a_file)

    # general POIs: kept unfiltered
    poi_file = _load_layer("gis_osm_pois_free_1.shp")

    # parking facilities from the traffic point and traffic area layers
    traffic_file = _parking_only(_load_layer("gis_osm_traffic_free_1.shp"))
    traffic_a_file = _parking_only(_load_layer("gis_osm_traffic_a_free_1.shp"))
    traffic_a_file["geometry"] = _centroids(traffic_a_file)

    # buildings: synthesise a code per building type, 1001 where none results
    buildings_file = _load_layer("gis_osm_buildings_a_free_1.shp")
    buildings_file["code"] = buildings_file.groupby(["type"]).ngroup() + 1002
    buildings_file.loc[buildings_file["code"].isna(), "code"] = 1001
    buildings_file["geometry"] = _centroids(buildings_file)

    layers = [
        pofw_file,
        transport_file,
        natural_file,
        natural_a_file,
        poi_file,
        traffic_file,
        traffic_a_file,
        buildings_file,
    ]
    return pd.concat(layers)

# Load and stack all raw POI layers.
gdf = _read_poi_files()

In [61]:
def _assign_category(df):
    """Add a land-use "category" column derived from the numeric OSM "code".

    The mapping follows osm code -> Hong (2018) -> Yin (2021):
      public 20xx        -> Residence communities -> Residential
        university/school/kindergarten/college (208x)
                         -> Medical/Education     -> K-12 schools / University/college
      health 21xx        -> Medical/Education     -> Services
      leisure 22xx       -> Entertainment         -> Recreation/entertainment
      catering 23xx      -> Life services         -> Residential
      accommodation 24xx -> Entertainment         -> Hotel/resort
      shopping 25xx      -> Life services         -> Shopping/retail
      money 26xx         -> Financial services    -> Services
      tourism 27xx       -> Entertainment         -> Recreation/entertainment
      miscpoi 29xx       -> Other facilities      -> Others
      pofw 3xxx          -> Life services         -> Civic/religious
      natural 41xx       -> Entertainment         -> Recreation/entertainment
      transport 56xx     -> Other facilities      -> Transportation

    Note: miscpoi contains e.g. "bench" or "drinking_water", which might not
    reveal any land-use information.

    The frame is modified in place and also returned. Codes outside every range
    (including NaN) keep the label "Unknown". The resulting category counts are
    printed.
    """
    # (lower, upper, label); both bounds are exclusive. The rows are applied
    # top to bottom, so the 208x "Schools" row overrides the wider 20xx row.
    code_ranges = [
        (2000, 2100, "Residential"),
        (2080, 2090, "Schools"),
        (2100, 2200, "Services"),
        (2200, 2300, "Entertainment"),
        (2300, 2400, "Residential"),
        (2400, 2500, "Entertainment"),
        (2500, 2600, "Shopping"),
        (2600, 2700, "Services"),
        (2700, 2800, "Entertainment"),
        (2900, 3000, "Others"),
        (3000, 4000, "Civic"),
        (4000, 5000, "Entertainment"),
        (5600, 5700, "Transportation"),
    ]

    df["category"] = "Unknown"
    codes = df["code"]
    for lower, upper, label in code_ranges:
        in_range = (codes > lower) & (codes < upper)
        df.loc[in_range, "category"] = label

    print(df["category"].value_counts())
    return df

# Label every POI; the function edits gdf in place and prints the category counts.
gdf = _assign_category(gdf)

category
Unknown           2737932
Others             127119
Entertainment       93521
Shopping            48116
Residential         42271
Transportation      39290
Services             9010
Schools              2850
Civic                 765
Name: count, dtype: int64


In [62]:
# value_counts() ignores NaN, so equal numbers mean every POI row has a code.
len(gdf), gdf["code"].value_counts().sum()

(3100874, 3100874)

In [63]:
# Final cleaning: discard the unused OSM attributes and replace the stacked
# per-layer indices by a fresh running number, exposed as an explicit "id" column.
gdf = gdf.drop(columns=["osm_id", "fclass"]).reset_index(drop=True)
gdf.index.name = "id"
gdf = gdf.reset_index()

# Reproject to EPSG:2056 and write the cleaned POIs to disk.
gdf = gdf.to_crs("EPSG:2056")
gdf.to_file(os.path.join("data", "poi", "final_pois.shp"))

## Get location–POI pairs

In [64]:
# Reload the POIs written above and build their spatial index once, so it can
# be reused for every location lookup.
final_pois_path = os.path.join("data", "poi", "final_pois.shp")
poi = gpd.read_file(final_pois_path)
spatial_index = poi.sindex

In [65]:
# Number of POIs with a non-null code after the shapefile round trip.
poi["code"].value_counts().sum()

3100874

In [70]:
def _get_inside_pois(df, poi, spatial_index):
    """Return the "id" values of the POIs lying within one extent.

    Parameters
    ----------
    df
        The extent to search. Despite the name it is used as a single geometry
        (its ``.bounds`` are read and it is passed to ``within``).
    poi
        POI table with an "id" column; rows are addressed positionally.
    spatial_index
        ``poi.sindex``, used to pre-select candidates and speed up the lookup.
    """
    # coarse pass: POIs whose bounding box intersects the extent's bounding box
    candidate_positions = list(spatial_index.intersection(df.bounds))
    candidates = poi.iloc[candidate_positions]

    # exact pass: keep the candidates that really lie within the extent
    is_inside = candidates.within(df)
    return candidates.loc[is_inside, "id"].values

# For every location polygon, collect the ids of the POIs inside it
# (tqdm.pandas registers progress_apply so the lookup shows a progress bar).
tqdm.pandas(desc="Generating poi within")
location_polygons = all_locs["area"]
all_locs["poi_within"] = location_polygons.progress_apply(
    _get_inside_pois,
    poi=poi,
    spatial_index=spatial_index,
)

Generating poi within: 100%|██████████| 8964/8964 [00:54<00:00, 163.65it/s]


In [71]:
# Preview: each location now carries the array of POI ids found inside it.
all_locs

Unnamed: 0_level_0,loc_id,level,area,poi_within
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5152984577853095936,13,"POLYGON ((2822783.921 1167182.211, 2823009.157...","[1321249, 1321313, 1321341, 1321325, 1321343, ..."
1,5152985557105639424,12,"POLYGON ((2819340.510 1168587.568, 2819790.000...","[322841, 42965, 121298, 121299, 1404046, 16777..."
2,5152985677364723712,13,"POLYGON ((2816590.595 1168848.114, 2816814.904...","[95060, 1483539, 232216, 2886149, 95061, 99850..."
3,5152985711724462080,13,"POLYGON ((2816814.904 1169906.317, 2817039.255...","[408769, 49439, 390867, 51731, 998508, 95063, ..."
4,5152988254345101312,13,"POLYGON ((2830353.619 1167544.210, 2830580.004...","[1787820, 1787829, 1787830, 1787837, 1787827, ..."
...,...,...,...,...
8959,5159785125990367232,13,"POLYGON ((2758620.178 1245023.063, 2758836.180...","[3044777, 2334529, 189401, 2190426, 2206291, 2..."
8960,5159785297789059072,13,"POLYGON ((2759965.980 1247082.609, 2760182.201...","[1362222, 1362200, 1362221, 1362198, 16652, 13..."
8961,5159785486767620096,12,"POLYGON ((2762491.584 1245745.332, 2762924.822...","[680729, 680728, 680730, 2409037, 332162, 6807..."
8962,5159785761645527040,12,"POLYGON ((2763886.581 1243422.966, 2764320.214...","[89060, 329644, 314974, 763044, 444398, 151767..."


In [72]:
# Clean up and expand to one row per (location, POI id) pair.
# The polygon column is no longer needed. This mutates all_locs in place, so
# the cell cannot be re-run without reloading all_locs first.
all_locs.drop(columns="area", inplace=True)

# explode() turns an empty poi_within array into a single NaN row, so
# locations without any POI are preserved.
locs_poi = all_locs.explode(column="poi_within")

In [74]:
# Look up category and code of every POI id. The left join keeps the NaN rows
# of POI-less locations; the duplicated key column "id" is dropped afterwards.
poi_attributes = poi[["id", "category", "code"]]
locs_poi = locs_poi.merge(
    poi_attributes, left_on="poi_within", right_on="id", how="left"
).drop(columns=["id"])

In [75]:
# Preview the (location, POI) pairs with their category and code.
locs_poi

Unnamed: 0,loc_id,level,poi_within,category,code
0,5152984577853095936,13,1321249,Unknown,1001.0
1,5152984577853095936,13,1321313,Unknown,1001.0
2,5152984577853095936,13,1321341,Unknown,1001.0
3,5152984577853095936,13,1321325,Unknown,1001.0
4,5152984577853095936,13,1321343,Unknown,1001.0
...,...,...,...,...,...
2368642,5160198645441626112,13,907772,Unknown,1001.0
2368643,5160198645441626112,13,907659,Unknown,1001.0
2368644,5160198645441626112,13,907855,Unknown,1001.0
2368645,5160198645441626112,13,38599,Transportation,5655.0


In [76]:
# Keep only real (location, POI) pairs and turn the float codes (e.g. 1001.0)
# into string tokens ("1001").
valid_pairs = locs_poi.dropna(subset=["poi_within"]).assign(
    code=lambda pairs: pairs["code"].astype(int).astype(str)
)

In [79]:
# Number of locations with at least one POI; some locations contain none.
len(valid_pairs["loc_id"].unique())

8919

## Get LDA vector for each location

In [119]:
def _lda(df, categories=16, random_state=None):
    """Fit an LDA topic model on the POI codes found in each location.

    Every location is one document whose tokens are the values of the OSM
    assigned "code" field of its POIs.

    Parameters
    ----------
    df : pd.DataFrame
        One row per (location, POI) pair with the columns "loc_id" and "code".
        The codes are used as gensim tokens (the caller passes them as strings).
    categories : int, default 16
        Number of LDA topics, i.e. the length of every returned vector.
    random_state : int or np.random.RandomState, optional
        Forwarded to gensim's ``LdaModel``. LDA training is stochastic; pass a
        fixed seed to obtain the same vectors on every run. The default
        ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    pd.DataFrame
        Columns "loc_id" and "poiValues", one row per distinct ``loc_id`` of
        ``df``; "poiValues" holds an array of length ``categories``.

    Notes
    -----
    ``lda[corpus]`` omits topics whose probability falls below the model's
    ``minimum_probability``; ``corpus2dense`` fills those with 0, so a vector
    does not necessarily sum to 1.
    """
    # Group once: the document order and the loc_id labels then come from the
    # same object and cannot get out of step.
    codes_per_loc = df.groupby("loc_id")["code"].apply(list)
    texts = codes_per_loc.to_list()

    dct = Dictionary(texts)
    corpus = [dct.doc2bow(line) for line in texts]

    lda = LdaModel(corpus, num_topics=categories, random_state=random_state)
    vector = lda[corpus]

    # corpus2dense returns (topics, documents); transpose to one row per location
    dense_ldavector = gensim.matutils.corpus2dense(vector, num_terms=categories).T

    poiValues = pd.Series(list(dense_ldavector), index=codes_per_loc.index.values, name="poiValues")
    poiValues.index.name = "loc_id"

    return poiValues.reset_index()

# Fixed seed so that the topic vectors (and the distance matrix derived from
# them) are reproducible across kernel restarts.
poiValues = _lda(valid_pairs, categories=16, random_state=42)

In [127]:
# Left join keeps every location; those without POIs get NaN in "poiValues".
# Note that this rebinds locs_poi, which previously held the (location, POI) pairs.
locs_poi = all_locs.merge(poiValues, on="loc_id", how="left")

In [139]:
# Locations without POIs have no LDA vector: give each of them its own
# all-zero vector (length 16, the number of topics used above).
no_vector = locs_poi["poiValues"].isna()
locs_poi.loc[no_vector, "poiValues"] = locs_poi.loc[no_vector, "poiValues"].apply(lambda _: np.zeros(16))

In [144]:
# The raw POI id arrays are no longer needed (in-place: re-running this cell
# alone raises because the column is already gone).
locs_poi.drop(columns="poi_within", inplace=True)

## Calculate distance

In [186]:
# Stack the per-location vectors, in index order, into one 2-D array
# (rows = locations, columns = topics). Note: poiValues is rebound from the
# LDA result table to this array.
ordered_vectors = locs_poi.sort_index()["poiValues"]
poiValues = np.array([np.array(vec) for vec in ordered_vectors.values])

In [195]:
# Pairwise correlation distance between the topic vectors, as a square matrix.
corr_distance = squareform(pdist(poiValues, metric="correlation"))

# Pairs whose distance is undefined (NaN) receive the largest observed
# distance instead.
undefined = np.isnan(corr_distance)
corr_distance[undefined] = np.nanmax(corr_distance)

In [197]:
# Sanity check: square, one row/column per location.
corr_distance.shape

(8964, 8964)

## Save

In [198]:
# Persist the POI-based (functional) distance matrix.
save_pk_file(save_path="./data/temp/function_matrix_test.pk", data=corr_distance)