In [1]:
import torch 
import numpy as np

import os
import pandas as pd
import geopandas as gpd
import pickle as pickle

from shapely import wkt
from tqdm import tqdm
from scipy.spatial.distance import pdist, squareform

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel
import gensim

import trackintel as ti
from trackintel.geogr.distances import calculate_distance_matrix

In [2]:
ti.__version__

'1.2.4'

In [3]:
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from loc_predict.models.markov import markov_transition_prob
from utils.utils import load_data
from utils.dataloader import get_train_test

def save_pk_file(save_path, data):
    """Pickle ``data`` into the file at ``save_path``.

    The target is opened in binary write mode (so an existing file is
    overwritten) and the object is serialised with ``pickle.HIGHEST_PROTOCOL``.
    """
    with open(save_path, "wb") as out_file:
        # Pickler(...).dump(obj) is the object-oriented spelling of pickle.dump
        pickle.Pickler(out_file, protocol=pickle.HIGHEST_PROTOCOL).dump(data)

# Pairwise distance matrix

In [205]:
# The CSV keeps geometries as WKT text; parse them into shapely objects and
# wrap the table as a GeoDataFrame in EPSG:4326.
all_locs = pd.read_csv("./data/test/all_locations.csv", index_col="id")
all_locs = gpd.GeoDataFrame(
    all_locs.assign(geometry=all_locs["geometry"].map(wkt.loads)),
    geometry="geometry",
    crs="EPSG:4326",
)

In [206]:
all_locs

Unnamed: 0_level_0,loc_id,level,geometry,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5152984577853095936,13,POINT (10.34358 46.62398),POLYGON ((10.347820052132302 46.61871685680636...
1,5152985557105639424,12,POINT (10.29508 46.64301),POLYGON ((10.303565033130825 46.63249007077394...
2,5152985677364723712,13,POINT (10.26355 46.64099),"POLYGON ((10.26780082511594 46.63572990632601,..."
3,5152985711724462080,13,POINT (10.26698 46.65043),POLYGON ((10.271227187809282 46.64517003239574...
4,5152988254345101312,13,POINT (10.44252 46.62467),POLYGON ((10.446753699334618 46.61940323409345...
...,...,...,...,...
8959,5159785125990367232,13,POINT (9.53321 47.34228),"POLYGON ((9.53761425405036 47.33697471681336, ..."
8960,5159785297789059072,13,POINT (9.55174 47.36047),POLYGON ((9.556149844349989 47.355166963815904...
8961,5159785486767620096,12,POINT (9.58027 47.35314),"POLYGON ((9.589080312384539 47.34252563756386,..."
8962,5159785761645527040,12,POINT (9.59788 47.33191),POLYGON ((9.606683428932946 47.321300042067506...


In [207]:
# Pairwise haversine distance matrix between all locations, computed with the
# trackintel helper; n_jobs=-1 is passed through to parallelise the work.
dist_matrix = calculate_distance_matrix(all_locs, dist_metric="haversine", n_jobs=-1)

In [208]:
dist_matrix.shape

(8964, 8964)

In [34]:

# Persist the pairwise distance matrix as a pickle.
save_pk_file("./data/temp/dist_matrix_test.pk", dist_matrix)

# Empirical visit matrix

In [7]:
data_dir = os.path.join(".", "data")

# sp.csv and locs_s2.csv are read separately and combined by the project
# helper load_data.
sp_raw = pd.read_csv(os.path.join(data_dir, "sp.csv"), index_col="id")
loc = pd.read_csv(os.path.join(data_dir, "locs_s2.csv"), index_col="id")
sp = load_data(sp_raw, loc)

# All candidate locations: the geometry column holds WKT text that is parsed
# into shapely objects before the table becomes a GeoDataFrame.
all_locs = pd.read_csv(os.path.join(data_dir, "test", "all_locations.csv"), index_col="id")
all_locs = gpd.GeoDataFrame(
    all_locs.assign(geometry=all_locs["geometry"].map(wkt.loads)),
    geometry="geometry",
    crs="EPSG:4326",
)

In [8]:
# Project helper that yields the train / validation / test partitions.
# It also returns all_locs, which is rebound here, so later cells use the
# helper's version of the table (NOTE(review): confirm how it differs).
train_data, vali_data, test_data, all_locs = get_train_test(sp, all_locs=all_locs)

In [9]:
# Run markov_transition_prob (n=1) on each user's training records and flatten
# the result; "level_1" is the unnamed inner index level left by groupby-apply.
transit_df = train_data.groupby("user_id").apply(markov_transition_prob, n=1).reset_index().drop(columns="level_1")

In [10]:
n_locs = len(all_locs)
emp_matrix = np.zeros((n_locs, n_locs))

# Every row of transit_df adds one count to cell [loc_1, toLoc].
# NOTE(review): rows are per user, so a pair is counted once per user that
# has it - confirm this is the intended weighting.
for origin, destination in tqdm(transit_df[["loc_1", "toLoc"]].values):
    emp_matrix[origin, destination] += 1

100%|██████████| 179399/179399 [00:00<00:00, 1117262.49it/s]


In [11]:
emp_matrix.shape

(8964, 8964)

In [42]:
# Persist the empirical visit (transition count) matrix as a pickle.
save_pk_file("./data/temp/emp_matrix_test.pk", emp_matrix)

# POI distribution
## Read location file

In [4]:
# The "area" column (WKT text) replaces the original geometry column, so each
# location is represented by its area geometry from here on.
s2_loc = (
    pd.read_csv(os.path.join("..", "data", "s2_loc_all_level10_14.csv"), index_col="id")
    .drop(columns="geometry")
    .rename(columns={"area": "geometry"})
)
s2_loc["geometry"] = s2_loc["geometry"].apply(wkt.loads)

# Build the GeoDataFrame in EPSG:4326 and reproject it to EPSG:2056.
loc = gpd.GeoDataFrame(s2_loc, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:2056")

In [5]:
loc

Unnamed: 0_level_0,loc_id,level,freq,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5152981090339651584,10,1,"POLYGON ((2827640.256 1154866.983, 2829448.495..."
1,5152983289362907136,10,0,"POLYGON ((2820309.073 1155559.663, 2822108.462..."
2,5152984393169502208,14,0,"POLYGON ((2822108.462 1164010.057, 2822221.013..."
3,5152984401759436800,14,0,"POLYGON ((2822221.013 1164538.626, 2822333.574..."
4,5152984410349371392,14,0,"POLYGON ((2821762.543 1164582.028, 2821875.069..."
...,...,...,...,...
142570,5160199354111229952,14,0,"POLYGON ((2825351.144 1214617.359, 2825464.119..."
142571,5160199362701164544,14,0,"POLYGON ((2824890.046 1214661.465, 2825002.986..."
142572,5160199371291099136,14,0,"POLYGON ((2825002.986 1215194.812, 2825115.937..."
142573,5160199379881033728,14,0,"POLYGON ((2825464.119 1215150.697, 2825577.105..."


## Read POI file and preprocess
### Places of Worship

In [9]:
poi_src_dir = os.path.join("..", "data", "poi", "ori")

# point features of the places-of-worship layer
pofw_points = gpd.read_file(os.path.join(poi_src_dir, "gis_osm_pofw_free_1.shp"))

# area features are reduced to points: the centroid is taken in EPSG:2056 and
# converted back to EPSG:4326
pofw_a = gpd.read_file(os.path.join(poi_src_dir, "gis_osm_pofw_a_free_1.shp"))
pofw_a_centroids = pofw_a.to_crs("EPSG:2056").geometry.centroid
pofw_a["geometry"] = pofw_a_centroids.to_crs("EPSG:4326")

# NOTE(review): both layers are de-duplicated on osm_id together - confirm
# that ids cannot collide between the point and the area layer.
pofw = pd.concat([pofw_points, pofw_a]).drop_duplicates(subset="osm_id")

### Transport Infrastructure

In [10]:
# point features of the transport layer
transport_points = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_transport_free_1.shp"))

# area features become points: centroid computed in EPSG:2056, stored in EPSG:4326
transport_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_transport_a_free_1.shp"))
transport_a["geometry"] = (
    transport_a.to_crs("EPSG:2056")
    .geometry.centroid
    .to_crs("EPSG:4326")
)

# stack both layers and keep the first row of every osm_id
transport = pd.concat([transport_points, transport_a]).drop_duplicates(subset="osm_id")

### Natural Features 

In [11]:
# 
# point features of the natural layer
natural = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_natural_free_1.shp"))

# drop the trees: code = 4121
natural = natural.loc[natural["code"] != 4121]

# area natural transformed into points
natural_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_natural_a_free_1.shp"))
# .copy() makes the filtered frame independent of the one it was selected
# from, so the column assignment below does not raise SettingWithCopyWarning
natural_a = natural_a.loc[natural_a["code"] != 4121].copy()
# centroid is computed in EPSG:2056 and stored back in EPSG:4326
natural_a["geometry"] = natural_a.to_crs("EPSG:2056").geometry.centroid.to_crs("EPSG:4326")

# stack both layers and keep the first row of every osm_id
natural = pd.concat([natural, natural_a])
natural = natural.drop_duplicates(subset="osm_id")

In [12]:
natural["code"].value_counts()

code
4111    7098
4103    1041
4141     539
4132     477
4101     421
4112     297
Name: count, dtype: int64

### POIs

In [13]:
# 
# point features of the general POI layer
poi_points = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_pois_free_1.shp"))

# area features become points: centroid computed in EPSG:2056, stored in EPSG:4326
poi_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_pois_a_free_1.shp"))
poi_a_centroids = poi_a.to_crs("EPSG:2056").geometry.centroid
poi_a["geometry"] = poi_a_centroids.to_crs("EPSG:4326")

# stack both layers and keep the first row of every osm_id
poi = pd.concat([poi_points, poi_a]).drop_duplicates(subset="osm_id")

In [14]:
poi["code"].value_counts()

code
2902    68205
2701    43136
2252    23546
2253    22747
2301    17357
        ...  
2530       14
2955       10
2736        9
2737        8
2954        3
Name: count, Length: 139, dtype: int64

### Traffic

In [18]:
# point features of the traffic layer
traffic_points = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_traffic_free_1.shp"))

# crossing, turning_circle and street_lamp (codes 5204, 5207, 5209) are
# removed from the point layer; the area layer below is not filtered
excluded_traffic_codes = [5204, 5207, 5209]
keep_point = ~traffic_points["code"].isin(excluded_traffic_codes)
traffic_points = traffic_points.loc[keep_point]

# area features become points: centroid computed in EPSG:2056, stored in EPSG:4326
traffic_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_traffic_a_free_1.shp"))
traffic_a_centroids = traffic_a.to_crs("EPSG:2056").geometry.centroid
traffic_a["geometry"] = traffic_a_centroids.to_crs("EPSG:4326")

# stack both layers and keep the first row of every osm_id
traffic = pd.concat([traffic_points, traffic_a]).drop_duplicates(subset="osm_id")

In [19]:
traffic["code"].value_counts()

code
5260    60568
5270    10432
5201     6064
5250     3621
5203     3029
5263     2436
5303     1234
5321     1138
5206      994
5332      582
5208      547
5302      462
5262      396
5301      318
5202      300
5311      166
5251       64
5331        9
Name: count, dtype: int64

### Building

In [44]:
buildings_shp = os.path.join("..", "data", "poi", "ori", "gis_osm_buildings_a_free_1.shp")
buildings = gpd.read_file(buildings_shp)

# every building carries the same code (1500); each footprint is reduced to
# its centroid, computed in EPSG:2056 and stored in EPSG:4326
building_centroids = buildings.to_crs("EPSG:2056").geometry.centroid
buildings["geometry"] = building_centroids.to_crs("EPSG:4326")

In [45]:
buildings["code"].value_counts()

code
1500    2584180
Name: count, dtype: int64

### Combine all types of POIs

In [51]:
# stack every POI layer into one table and discard the "type" column
poi_layers = [pofw, transport, natural, poi, traffic, buildings]
poi_gdf = pd.concat(poi_layers).drop(columns=["type"])

In [52]:
poi_gdf["code"].value_counts()

code
1500    2584180
2902      68205
5260      60568
2701      43136
5621      34700
         ...   
3301          7
3500          7
3106          6
2954          3
5652          2
Name: count, Length: 188, dtype: int64

## Assign to poi category

In [57]:
def _assign_category(df):
    # 2018 Hong: Life services, Office building/space, Other facilities, Medical/Education, Entertainment, Government, Residence communities, Financial services
    # 2021 Yin : Residential, Hotel/resort, Mixed-use, K–12 schools, University/college, Office/workplace, Services, Civic/religious, Shopping/retail, Recreation/entertainment, Transportation, Others

    ### osm code -> 2018 Hong -> 2021 Yin
    # public 20xx  -> Residence communities ->  Residential
    #    university + school + kindergarten + college (208x) -> Medical/Education -> K–12 schools/University/college
    # health 21xx -> Medical/Education -> Services
    # leisure 22xx -> Entertainment -> Recreation/entertainment
    # catering 23xx -> Life services -> Residential
    # accommodation 24xx -> Entertainment -> Hotel/resort
    # shopping 25xx -> Life services -> Shopping/retail
    # money 26xx -> Financial services -> Services
    # tourism 27xx -> Entertainment -> Recreation/entertainment
    # pofw 3xxx -> Life services -> Civic/religious
    # natural 41xx -> Entertainment -> Recreation/entertainment
    # transport 56xx -> Other facilities -> Transportation
    # miscpoi 29xx -> Other facilities -> Others

    # note: miscpoi contains "bench" or "drinking_water" that might not reveal any landuse info

    # init
    df["category"] = "Unknown"

    # public 20xx  -> Residence communities ->  Residential
    #    university + school + kindergarten + college (208x) -> Medical/Education -> K–12 schools/University/college
    df.loc[(df["code"] > 2000) & (df["code"] < 2100), "category"] = "Residential"
    df.loc[(df["code"] > 2080) & (df["code"] < 2090), "category"] = "Schools"

    # health 21xx -> Medical/Education -> Services
    df.loc[(df["code"] > 2100) & (df["code"] < 2200), "category"] = "Services"

    # leisure 22xx -> Entertainment -> Recreation/entertainment
    df.loc[(df["code"] > 2200) & (df["code"] < 2300), "category"] = "Entertainment"

    # catering 23xx -> Life services -> Residential
    df.loc[(df["code"] > 2300) & (df["code"] < 2400), "category"] = "Residential"

    # accommodation 24xx -> Entertainment -> Hotel/resort
    df.loc[(df["code"] > 2400) & (df["code"] < 2500), "category"] = "Entertainment"

    # shopping 25xx -> Life services -> Shopping/retail
    df.loc[(df["code"] > 2500) & (df["code"] < 2600), "category"] = "Shopping"

    # money 26xx -> Financial services -> Services
    df.loc[(df["code"] > 2600) & (df["code"] < 2700), "category"] = "Services"

    # tourism 27xx -> Entertainment -> Recreation/entertainment
    df.loc[(df["code"] > 2700) & (df["code"] < 2800), "category"] = "Entertainment"

    # miscpoi 29xx -> Other facilities -> Others
    df.loc[(df["code"] > 2900) & (df["code"] < 3000), "category"] = "Others"
    df.loc[(df["code"] == 1500), "category"] = "Others"

    # pofw 3xxx -> Life services -> Civic/religious
    df.loc[(df["code"] > 3000) & (df["code"] < 4000), "category"] = "Civic"

    # natural 41xx -> Entertainment -> Recreation/entertainment
    df.loc[(df["code"] > 4000) & (df["code"] < 5000), "category"] = "Entertainment"

    # transport 56xx -> Other facilities -> Transportation
    df.loc[(df["code"] > 5600) & (df["code"] < 5700), "category"] = "Transportation"
    # traffic 54xx -> Other facilities -> Transportation
    df.loc[(df["code"] > 5200) & (df["code"] < 5400), "category"] = "Transportation"

    # Unknown           2737932
    # Others             127119
    # Entertainment       93521
    # Shopping            48116
    # Residential         42271
    # Transportation      39290
    # Services             9010
    # Schools              2850
    # Civic                 765

    print(df["category"].value_counts())
    return df

# _assign_category writes the "category" column into poi_gdf itself and
# returns that same object, so poi_category_gdf and poi_gdf are one object.
poi_category_gdf = _assign_category(poi_gdf)

category
Others            2699673
Entertainment      149791
Transportation     131733
Residential         50254
Shopping            47979
Services             9554
Schools              9142
Civic                6464
Name: count, dtype: int64


In [59]:
len(poi_category_gdf), poi_category_gdf["code"].value_counts().sum()

(3104590, 3104590)

## Final cleaning and save

In [61]:
# final cleaning: drop the two OSM columns that are not needed downstream.
# NOTE: the in-place steps below modify the object bound to poi_category_gdf
# (and any other name bound to it). Because the name is rebound at the end,
# re-running this cell fails on the drop: the columns are already gone.
poi_category_gdf.drop(columns=["osm_id", "fclass"], inplace=True)
# reindex: discard the old index, then expose a fresh running counter as an
# explicit "id" column
poi_category_gdf.reset_index(drop=True, inplace=True)
poi_category_gdf.index.name = "id"
poi_category_gdf.reset_index(inplace=True)

# change the projection to EPSG:2056 and save as a shapefile
poi_category_gdf =poi_category_gdf.to_crs("EPSG:2056")
poi_category_gdf.to_file(os.path.join("..", "data", "poi", "final_pois.shp"))

## Get location poi pair

In [62]:
# read the combined poi shapefile; this rebinds `poi`, which earlier held only
# the general POI layer
poi = gpd.read_file(os.path.join("..", "data", "poi", "final_pois.shp"))
# spatial index over the pois, obtained once and reused for every location lookup
spatial_index = poi.sindex

In [63]:
len(poi), poi["code"].value_counts().sum()

(3104590, 3104590)

In [65]:
def _get_inside_pois(df, poi, spatial_index):
    """
    Return the "id" values of the pois that lie within one extent.

    df: a single geometry (despite its name); its bounds pre-select candidates.
    poi: the poi table, must provide an "id" column.
    spatial_index: obtained from poi.sindex, used to speed up the lookup.
    """
    # coarse step: rows whose bounding box intersects the extent's bounding box
    candidate_positions = list(spatial_index.intersection(df.bounds))
    candidates = poi.iloc[candidate_positions]

    # exact step: keep only the candidates that really are within the extent
    is_inside = candidates.within(df)
    return candidates.loc[is_inside, "id"].values

# get the inside poi within each location
# tqdm.pandas() makes progress_apply available: apply with a progress bar
tqdm.pandas(desc="Generating poi within")
# each geometry is passed to _get_inside_pois as its first argument; the
# result is an array of poi ids per location, stored in a new column of `loc`
loc["poi_within"] = loc["geometry"].progress_apply(
    _get_inside_pois, poi=poi, spatial_index=spatial_index
)

Generating poi within: 100%|██████████████████████████████████████████████████| 142575/142575 [04:26<00:00, 535.51it/s]


In [66]:
loc

Unnamed: 0_level_0,loc_id,level,freq,geometry,poi_within
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5152981090339651584,10,1,"POLYGON ((2827640.256 1154866.983, 2829448.495...","[46461, 46466, 46467, 47482, 46465, 47487, 144..."
1,5152983289362907136,10,0,"POLYGON ((2820309.073 1155559.663, 2822108.462...","[46462, 46460, 45903, 46464, 47479, 47481, 464..."
2,5152984393169502208,14,0,"POLYGON ((2822108.462 1164010.057, 2822221.013...",[]
3,5152984401759436800,14,0,"POLYGON ((2822221.013 1164538.626, 2822333.574...",[]
4,5152984410349371392,14,0,"POLYGON ((2821762.543 1164582.028, 2821875.069...",[]
...,...,...,...,...,...
142570,5160199354111229952,14,0,"POLYGON ((2825351.144 1214617.359, 2825464.119...",[]
142571,5160199362701164544,14,0,"POLYGON ((2824890.046 1214661.465, 2825002.986...",[]
142572,5160199371291099136,14,0,"POLYGON ((2825002.986 1215194.812, 2825115.937...",[]
142573,5160199379881033728,14,0,"POLYGON ((2825464.119 1215150.697, 2825577.105...",[]


In [67]:
# cleaning and expanding to location_id-poi_id pair
loc_no_geo = loc.drop(columns="geometry")

# explode yields one row per poi id; a location whose poi_within is empty
# keeps a single row with NaN, so locations without pois are preserved
locs_poi = loc_no_geo.explode(column="poi_within")

In [68]:
# look up category and code of every poi id; the left join keeps the rows
# without a poi, and the join key taken from the poi table is dropped again
poi_attributes = poi[["id", "category", "code"]]
locs_poi = (
    locs_poi
    .merge(poi_attributes, left_on="poi_within", right_on="id", how="left")
    .drop(columns=["id"])
)

In [70]:
locs_poi["code"].value_counts()

code
1500.0    2584175
2902.0      68205
5260.0      60568
2701.0      43132
5621.0      34700
           ...   
3500.0          7
3301.0          7
3106.0          6
2954.0          3
5652.0          2
Name: count, Length: 188, dtype: int64

In [71]:
# keep only the rows that actually reference a poi, as an independent copy
has_poi = locs_poi["poi_within"].notna()
valid_pairs = locs_poi.loc[has_poi].copy()
# the merge turned code into floats (NaN rows); convert to integer-looking
# strings such as "1500"
valid_pairs["code"] = valid_pairs["code"].astype(int).astype(str)

In [73]:
# some location does not have pois inside
len(valid_pairs["loc_id"].unique())

83947

## Get lda vector for locations

In [74]:
def _lda(df, categories=16, random_state=None):
    """Fit an LDA topic model on the POI codes found in each location.

    Every ``loc_id`` is treated as one document whose tokens are the values
    of the osm assigned "code" field (expected to be strings).

    Parameters
    ----------
    df : DataFrame
        Needs the columns "loc_id" and "code", one row per location-poi pair.
    categories : int
        Number of LDA topics, which is also the length of each returned vector.
    random_state : optional
        Forwarded to gensim's ``LdaModel``. The default ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible topics.

    Returns
    -------
    DataFrame
        Columns "loc_id" and "poiValues", one row per location in ``df``;
        each "poiValues" entry is a 1-D array of length ``categories``.
    """
    # One groupby supplies both the documents and their loc_id labels, so the
    # two are aligned by construction.
    codes_per_loc = df.groupby("loc_id")["code"].apply(list)
    texts = codes_per_loc.to_list()

    dct = Dictionary(texts)
    corpus = [dct.doc2bow(line) for line in texts]

    lda = LdaModel(corpus, num_topics=categories, random_state=random_state)
    # NOTE(review): lda[corpus] omits topics below LdaModel's
    # minimum_probability, which then appear as 0 in the dense matrix -
    # confirm this is intended.
    vector = lda[corpus]

    # the lda array: one row per document, one column per topic
    dense_ldavector = gensim.matutils.corpus2dense(vector, num_terms=categories).T

    poiValues = pd.Series(list(dense_ldavector))
    poiValues.index = codes_per_loc.index.values

    poiValues.name = "poiValues"
    poiValues.index.name = "loc_id"

    return poiValues.reset_index()

# LDA is stochastic: fix the seed so a re-run reproduces the same vectors
LDA_RANDOM_STATE = 42
poiValues = _lda(valid_pairs, categories=16, random_state=LDA_RANDOM_STATE)

In [77]:
# attach the LDA vector to every location; the left join keeps locations
# without pois (their poiValues is NaN). This rebinds locs_poi, which earlier
# held the location-poi pairs.
locs_poi = loc.merge(poiValues, on="loc_id", how="left")

In [79]:
# assign all 0's to non-poi locations: every missing entry gets its own
# zero vector of length 16 (the number of LDA topics used above)
without_vector = locs_poi["poiValues"].isna()
locs_poi.loc[without_vector, "poiValues"] = locs_poi.loc[without_vector, "poiValues"].apply(
    lambda _: np.zeros(16)
)

In [101]:
# re-wrap every vector as a float32 numpy array so that all rows share one
# dtype (np.zeros creates float64 by default)
locs_poi["poiValues"] = locs_poi["poiValues"].apply(lambda x: np.array(x, dtype=np.float32))

In [81]:
# the raw poi id lists and the geometries are not part of the saved table;
# in-place, so re-running this cell raises once the columns are gone
locs_poi.drop(columns=["poi_within", "geometry"], inplace=True)

## Save loc poi file

In [104]:
# NOTE(review): poiValues holds numpy arrays, which a CSV can only store as
# their text representation; the pickle written next keeps the real arrays.
locs_poi.to_csv(os.path.join("..", "data", "s2_loc_poi_level10_14.csv"), index=False)

In [126]:
# Lossless copy of the vectors as {"loc_id": [...], "poiValues": [...]},
# written through the notebook's own pickle helper instead of a second
# hand-rolled open/dump.
save_pk_file(
    os.path.join("..", "data", "s2_loc_poi_level10_14.pk"),
    locs_poi[["loc_id", "poiValues"]].to_dict("list"),
)

In [142]:


# Sanity check: reload the pickle and confirm the vectors stack into a 2-D array.
# The file is opened in a with-block so the handle is closed again; pickle is
# only safe for trusted files, and this one was written by this notebook.
with open(os.path.join("..", "data", "s2_loc_poi_level10_14.pk"), "rb") as handle:
    temp = pickle.load(handle)

temp = pd.DataFrame(temp)

np.stack(temp.iloc[10:14]["poiValues"].values)

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.03125001, 0.03125001, 0.53124994, 0.03125001, 0.03125   ,
        0.03125   , 0.03125   , 0.03125004, 0.03125   , 0.03125001,
        0.03125001, 0.03125   , 0.03125   , 0.03125   , 0.03125001,
        0.03125   ]], dtype=float32)

In [143]:
temp.iloc[10:14]["poiValues"]

10    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
11    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
12    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
13    [0.031250007, 0.03125001, 0.53124994, 0.031250...
Name: poiValues, dtype: object

## Calculate distance

In [186]:
# collect the per-location vectors, ordered by index, into one 2-D array
# (this rebinds poiValues, which earlier held the LDA result table)
vectors_by_index = locs_poi.sort_index()["poiValues"]
poiValues = np.array([np.array(vec) for vec in vectors_by_index.values])

In [195]:
# pairwise correlation distance, expanded from condensed to square form
condensed_distance = pdist(poiValues, 'correlation')
corr_distance = squareform(condensed_distance)

# the distance is undefined (nan) for some pairs, e.g. those involving an
# all-zero vector; such pairs receive the largest distance that was observed
undefined_pairs = np.isnan(corr_distance)
corr_distance[undefined_pairs] = np.nanmax(corr_distance)

In [197]:
corr_distance.shape

(8964, 8964)

## Save

In [198]:
# Persist the POI-based (functional) distance matrix as a pickle.
save_pk_file("./data/temp/function_matrix_test.pk", corr_distance)