In [1]:
import torch 
import numpy as np

import math

import os
import pandas as pd
import geopandas as gpd
import pickle as pickle

import shapely
from shapely import wkt
from tqdm import tqdm

from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LdaModel
import gensim

from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances

import trackintel as ti
from trackintel.geogr.distances import calculate_distance_matrix

In [2]:
ti.__version__

'1.2.4'

In [5]:
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from loc_predict.models.markov import markov_transition_prob
from utils.utils import load_data
# from utils.dataloader import get_train_test

def save_pk_file(save_path, data):
    """Serialize ``data`` to ``save_path`` with pickle.

    Args:
        save_path: Destination file path; the file is opened in binary write
            mode, so an existing file is overwritten.
        data: Any picklable object.
    """
    with open(save_path, mode="wb") as sink:
        # Stream straight into the file using the newest protocol available.
        pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL).dump(data)

# Pairwise distance matrix

In [14]:
# Visited locations, ordered by loc_id so that row i of the distance matrix
# computed from this frame corresponds to the i-th smallest loc_id.
visited_locs = pd.read_csv("../data/s2_loc_visited_level10_13.csv", index_col="id")
visited_locs = visited_locs.sort_values(by="loc_id")

# The geometry column is stored as WKT text; parse it, then reproject from
# EPSG:4326 to EPSG:2056 in one go.
visited_locs = gpd.GeoDataFrame(
    visited_locs.assign(geometry=visited_locs["geometry"].map(wkt.loads)),
    geometry="geometry",
    crs="EPSG:4326",
).to_crs("EPSG:2056")

In [15]:
visited_locs.head()

Unnamed: 0_level_0,loc_id,level,geometry,freq,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5152981090339651584,10,POINT (2824875.045 1159436.528),1,POLYGON ((10.405097146225662 46.50637570678295...
1,5152984577853095936,13,POINT (2822437.857 1167754.527),38,POLYGON ((10.347820052132302 46.61871685680636...
2,5152985265047863296,13,POINT (2815796.126 1167304.569),1,POLYGON ((10.260953592346382 46.61685460820135...
3,5152985299407601664,13,POINT (2816020.303 1168362.492),1,POLYGON ((10.264376293749178 46.62629143145975...
4,5152985505566031872,13,POINT (2818994.419 1169160.037),1,POLYGON ((10.303565033130825 46.63249007077394...


In [16]:
def calculate_distance_matrix(X):
    """Return the square matrix of pairwise Euclidean distances between geometries.

    NOTE: this definition shadows ``calculate_distance_matrix`` imported from
    ``trackintel.geogr.distances`` at the top of the notebook.

    Args:
        X: Object exposing a ``geometry`` attribute accepted by
            ``shapely.get_coordinates``. Assumes one coordinate pair per
            geometry (i.e. points) -- TODO confirm for other inputs, since
            multi-vertex geometries would contribute several rows.

    Returns:
        Symmetric 2-D array of distances in the units of the geometry's CRS.
    """
    coords = shapely.get_coordinates(X.geometry)

    # pdist yields the condensed upper triangle; squareform expands it to n x n.
    return squareform(pdist(coords, metric="euclidean"))


dist_matrix = calculate_distance_matrix(visited_locs)

In [17]:
dist_matrix.shape

(14881, 14881)

In [18]:
save_pk_file("../data/matrix/distance_13.pk", dist_matrix)

# Empirical visit matrix

In [19]:
# Location table, then the staypoints passed together with it through the project loader.
loc = pd.read_csv(os.path.join("..", "data", "loc_s2_level10_13.csv"), index_col="id")
sp = load_data(
    pd.read_csv(os.path.join("..", "data", "sp_all.csv"), index_col="id"),
    loc,
)

# Visited locations with their WKT geometry parsed into shapely objects.
all_locs = pd.read_csv(os.path.join("..", "data", "s2_loc_visited_level10_13.csv"))
all_locs = gpd.GeoDataFrame(
    all_locs.assign(geometry=all_locs["geometry"].map(wkt.loads)),
    geometry="geometry",
    crs="EPSG:4326",
)

In [20]:
# get_train_test is not brought into scope by the import cell (its import line is
# commented out there), so on a fresh kernel this cell raised NameError.
# Import it here so the notebook survives Restart & Run All.
from utils.dataloader import get_train_test

# Split the staypoints into train / validation / test sets. all_locs is rebound
# to the location table returned by the helper.
train_data, vali_data, test_data, all_locs = get_train_test(sp, all_locs=all_locs)

In [21]:
# Per-user first-order (n=1) transitions from the project's Markov helper,
# flattened into one frame; the positional "level_1" index level is discarded.
transit_df = (
    train_data.groupby("user_id")
    .apply(markov_transition_prob, n=1)
    .reset_index()
    .drop("level_1", axis=1)
)

In [22]:
# Square count matrix: rows are the origin ("loc_1"), columns the destination ("toLoc").
# Assumes both columns hold integer positions in [0, len(all_locs)) -- TODO confirm.
emp_matrix = np.zeros((len(all_locs),) * 2)

for origin, destination in tqdm(transit_df[["loc_1", "toLoc"]].values):
    emp_matrix[origin, destination] += 1

100%|██████████████████████████████████████████████████████████████████████| 179486/179486 [00:00<00:00, 873595.12it/s]


In [23]:
emp_matrix.shape

(14881, 14881)

In [24]:
save_pk_file("../data/matrix/visits_13.pk", emp_matrix)


# POI distribution
## Read location file

In [7]:
s2_loc = pd.read_csv(os.path.join("..", "data", "s2_loc_all_level10_13.csv"), index_col="id")

# Use the "area" column as the geometry from here on: the original "geometry"
# column is dropped and "area" is renamed to take its place.
s2_loc = s2_loc.drop(columns="geometry").rename(columns={"area": "geometry"})
s2_loc["geometry"] = s2_loc["geometry"].map(wkt.loads)

# Build the GeoDataFrame in EPSG:4326 and reproject it to EPSG:2056.
loc = gpd.GeoDataFrame(s2_loc, geometry="geometry", crs="EPSG:4326").to_crs("EPSG:2056")

In [8]:
loc

Unnamed: 0_level_0,loc_id,level,freq,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5152981090339651584,10,1,"POLYGON ((2827640.256 1154866.983, 2829448.495..."
1,5152983289362907136,10,0,"POLYGON ((2820309.073 1155559.663, 2822108.462..."
2,5152984406054404096,13,0,"POLYGON ((2822108.462 1164010.057, 2822333.574..."
3,5152984440414142464,13,0,"POLYGON ((2821191.630 1164096.842, 2821416.603..."
4,5152984474773880832,13,0,"POLYGON ((2821416.603 1165154.061, 2821641.617..."
...,...,...,...,...
39172,5160199263916916736,13,0,"POLYGON ((2824428.986 1214705.568, 2824654.807..."
39173,5160199298276655104,13,0,"POLYGON ((2824203.206 1213639.007, 2824428.986..."
39174,5160199332636393472,13,0,"POLYGON ((2825125.225 1213550.831, 2825351.144..."
39175,5160199366996131840,13,0,"POLYGON ((2825351.144 1214617.359, 2825577.105..."


## Read POI file and preprocess
### Places of Worship

In [27]:
# Point layer of places of worship.
pofw = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_pofw_free_1.shp"))

# Area layer: each polygon is reduced to its centroid. The centroid is computed
# in EPSG:2056 and converted back to EPSG:4326
# (presumably the CRS of the point layer -- verify).
pofw_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_pofw_a_free_1.shp"))
pofw_a["geometry"] = (
    pofw_a.to_crs("EPSG:2056")
    .centroid
    .to_crs("EPSG:4326")
)

# Stack both layers; for a repeated osm_id the first occurrence (point layer) wins.
pofw = pd.concat([pofw, pofw_a]).drop_duplicates(subset="osm_id")

### Transport Infrastructure

In [28]:
# Point layer of transport infrastructure.
transport = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_transport_free_1.shp"))

# Area layer: polygons become centroids (computed in EPSG:2056, stored in EPSG:4326).
transport_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_transport_a_free_1.shp"))
transport_a["geometry"] = (
    transport_a.to_crs("EPSG:2056")
    .centroid
    .to_crs("EPSG:4326")
)

# Stack both layers; for a repeated osm_id the first occurrence (point layer) wins.
transport = pd.concat([transport, transport_a]).drop_duplicates(subset="osm_id")

### Natural Features 

In [29]:
# 
# Point layer of natural features.
natural = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_natural_free_1.shp"))

# drop the trees: code = 4121
natural = natural.loc[natural["code"] != 4121]

# Area layer, with the same tree filter, reduced to centroids.
# The explicit .copy() makes the filtered frame independent of the frame it was
# sliced from, so the column assignment below is not a chained assignment on a
# slice (SettingWithCopyWarning under classic pandas semantics).
natural_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_natural_a_free_1.shp"))
natural_a = natural_a.loc[natural_a["code"] != 4121].copy()
natural_a["geometry"] = natural_a.to_crs("EPSG:2056").geometry.centroid.to_crs("EPSG:4326")

# Stack both layers; for a repeated osm_id the first occurrence (point layer) wins.
natural = pd.concat([natural, natural_a])
natural = natural.drop_duplicates(subset="osm_id")

In [30]:
natural["code"].value_counts()

code
4111    7098
4103    1041
4141     539
4132     477
4101     421
4112     297
Name: count, dtype: int64

### POIs

In [31]:
# 
# Point layer of general POIs.
poi = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_pois_free_1.shp"))

# Area layer: polygons become centroids (computed in EPSG:2056, stored in EPSG:4326).
poi_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_pois_a_free_1.shp"))
poi_a["geometry"] = (
    poi_a.to_crs("EPSG:2056")
    .centroid
    .to_crs("EPSG:4326")
)

# Stack both layers; for a repeated osm_id the first occurrence (point layer) wins.
poi = pd.concat([poi, poi_a]).drop_duplicates(subset="osm_id")

In [32]:
poi["code"].value_counts()

code
2902    68205
2701    43136
2252    23546
2253    22747
2301    17357
        ...  
2530       14
2955       10
2736        9
2737        8
2954        3
Name: count, Length: 139, dtype: int64

### Traffic

In [33]:
# Point layer of traffic features.
traffic = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_traffic_free_1.shp"))

# Exclude crossing, turning_circle and street_lamp (codes 5204, 5207, 5209).
excluded_traffic = traffic["code"].isin([5204, 5207, 5209])
traffic = traffic.loc[~excluded_traffic]

# Area layer: polygons become centroids (computed in EPSG:2056, stored in EPSG:4326).
# No code filter is applied to the area layer.
traffic_a = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_traffic_a_free_1.shp"))
traffic_a["geometry"] = (
    traffic_a.to_crs("EPSG:2056")
    .centroid
    .to_crs("EPSG:4326")
)

# Stack both layers; for a repeated osm_id the first occurrence (point layer) wins.
traffic = pd.concat([traffic, traffic_a]).drop_duplicates(subset="osm_id")

In [34]:
traffic["code"].value_counts()

code
5260    60568
5270    10432
5201     6064
5250     3621
5203     3029
5263     2436
5303     1234
5321     1138
5206      994
5332      582
5208      547
5302      462
5262      396
5301      318
5202      300
5311      166
5251       64
5331        9
Name: count, dtype: int64

### Building

In [35]:
# Building footprints (area layer only).
buildings = gpd.read_file(os.path.join("..", "data", "poi", "ori", "gis_osm_buildings_a_free_1.shp"))

# Every building carries the same code (1500). Footprints are reduced to
# centroids, computed in EPSG:2056 and stored in EPSG:4326.
buildings["geometry"] = (
    buildings.to_crs("EPSG:2056")
    .centroid
    .to_crs("EPSG:4326")
)

In [36]:
buildings["code"].value_counts()

code
1500    2584180
Name: count, dtype: int64

### Combine all types of POIs

In [37]:
# Stack every POI layer into one frame and discard the "type" column.
# The row index is not reset here, so index labels may repeat across layers.
poi_layers = [pofw, transport, natural, poi, traffic, buildings]
poi_gdf = pd.concat(poi_layers).drop(columns=["type"])

In [38]:
poi_gdf["code"].value_counts()

code
1500    2584180
2902      68205
5260      60568
2701      43136
5621      34700
         ...   
3301          7
3500          7
3106          6
2954          3
5652          2
Name: count, Length: 188, dtype: int64

## Assign to poi category

In [39]:
def _assign_category(df):
    # 2018 Hong: Life services, Office building/space, Other facilities, Medical/Education, Entertainment, Government, Residence communities, Financial services
    # 2021 Yin : Residential, Hotel/resort, Mixed-use, K–12 schools, University/college, Office/workplace, Services, Civic/religious, Shopping/retail, Recreation/entertainment, Transportation, Others

    ### osm code -> 2018 Hong -> 2021 Yin
    # public 20xx  -> Residence communities ->  Residential
    #    university + school + kindergarten + college (208x) -> Medical/Education -> K–12 schools/University/college
    # health 21xx -> Medical/Education -> Services
    # leisure 22xx -> Entertainment -> Recreation/entertainment
    # catering 23xx -> Life services -> Residential
    # accommodation 24xx -> Entertainment -> Hotel/resort
    # shopping 25xx -> Life services -> Shopping/retail
    # money 26xx -> Financial services -> Services
    # tourism 27xx -> Entertainment -> Recreation/entertainment
    # pofw 3xxx -> Life services -> Civic/religious
    # natural 41xx -> Entertainment -> Recreation/entertainment
    # transport 56xx -> Other facilities -> Transportation
    # miscpoi 29xx -> Other facilities -> Others

    # note: miscpoi contains "bench" or "drinking_water" that might not reveal any landuse info

    # init
    df["category"] = "Unknown"

    # public 20xx  -> Residence communities ->  Residential
    #    university + school + kindergarten + college (208x) -> Medical/Education -> K–12 schools/University/college
    df.loc[(df["code"] > 2000) & (df["code"] < 2100), "category"] = "Residential"
    df.loc[(df["code"] > 2080) & (df["code"] < 2090), "category"] = "Schools"

    # health 21xx -> Medical/Education -> Services
    df.loc[(df["code"] > 2100) & (df["code"] < 2200), "category"] = "Services"

    # leisure 22xx -> Entertainment -> Recreation/entertainment
    df.loc[(df["code"] > 2200) & (df["code"] < 2300), "category"] = "Entertainment"

    # catering 23xx -> Life services -> Residential
    df.loc[(df["code"] > 2300) & (df["code"] < 2400), "category"] = "Residential"

    # accommodation 24xx -> Entertainment -> Hotel/resort
    df.loc[(df["code"] > 2400) & (df["code"] < 2500), "category"] = "Entertainment"

    # shopping 25xx -> Life services -> Shopping/retail
    df.loc[(df["code"] > 2500) & (df["code"] < 2600), "category"] = "Shopping"

    # money 26xx -> Financial services -> Services
    df.loc[(df["code"] > 2600) & (df["code"] < 2700), "category"] = "Services"

    # tourism 27xx -> Entertainment -> Recreation/entertainment
    df.loc[(df["code"] > 2700) & (df["code"] < 2800), "category"] = "Entertainment"

    # miscpoi 29xx -> Other facilities -> Others
    df.loc[(df["code"] > 2900) & (df["code"] < 3000), "category"] = "Others"
    df.loc[(df["code"] == 1500), "category"] = "Others"

    # pofw 3xxx -> Life services -> Civic/religious
    df.loc[(df["code"] > 3000) & (df["code"] < 4000), "category"] = "Civic"

    # natural 41xx -> Entertainment -> Recreation/entertainment
    df.loc[(df["code"] > 4000) & (df["code"] < 5000), "category"] = "Entertainment"

    # transport 56xx -> Other facilities -> Transportation
    df.loc[(df["code"] > 5600) & (df["code"] < 5700), "category"] = "Transportation"
    # traffic 54xx -> Other facilities -> Transportation
    df.loc[(df["code"] > 5200) & (df["code"] < 5400), "category"] = "Transportation"

    # Unknown           2737932
    # Others             127119
    # Entertainment       93521
    # Shopping            48116
    # Residential         42271
    # Transportation      39290
    # Services             9010
    # Schools              2850
    # Civic                 765

    print(df["category"].value_counts())
    return df

poi_category_gdf = _assign_category(poi_gdf)

category
Others            2699673
Entertainment      149791
Transportation     131733
Residential         50254
Shopping            47979
Services             9554
Schools              9142
Civic                6464
Name: count, dtype: int64


In [40]:
len(poi_category_gdf), poi_category_gdf["code"].value_counts().sum()

(3104590, 3104590)

## Final cleaning and save

In [41]:
# Drop the OSM bookkeeping columns that are not needed downstream.
poi_category_gdf.drop(["osm_id", "fclass"], axis=1, inplace=True)

# Replace the index with a fresh running number 0..n-1 named "id",
# then expose it as a regular column.
poi_category_gdf.index = pd.RangeIndex(len(poi_category_gdf), name="id")
poi_category_gdf.reset_index(inplace=True)

# Reproject to EPSG:2056 and write the shapefile.
poi_category_gdf = poi_category_gdf.to_crs("EPSG:2056")
poi_category_gdf.to_file(os.path.join("..", "data", "poi", "final_pois.shp"))

## Get location poi pair

In [4]:
# read poi file
poi = gpd.read_file(os.path.join("..", "data", "poi", "final_pois.shp"))
spatial_index = poi.sindex

In [5]:
len(poi), poi["code"].value_counts().sum()

(3104590, 3104590)

In [9]:
def _get_inside_pois(df, poi, spatial_index):
    """
    Given one extent (df), return the poi within this extent.
    spatial_index is obtained from poi.sindex to speed up the process.
    """
    possible_matches_index = list(spatial_index.intersection(df.bounds))
    possible_matches = poi.iloc[possible_matches_index]
    precise_matches = possible_matches[possible_matches.within(df)]["id"].values

    return precise_matches

# For every location polygon, collect the ids of the POIs that fall inside it.
# tqdm.pandas registers progress_apply, which shows a progress bar while applying.
tqdm.pandas(desc="Generating poi within")
loc["poi_within"] = loc["geometry"].progress_apply(
    lambda cell: _get_inside_pois(cell, poi=poi, spatial_index=spatial_index)
)

Generating poi within: 100%|████████████████████████████████████████████████████| 39177/39177 [01:32<00:00, 421.42it/s]


In [10]:
loc

Unnamed: 0_level_0,loc_id,level,freq,geometry,poi_within
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5152981090339651584,10,1,"POLYGON ((2827640.256 1154866.983, 2829448.495...","[46461, 46466, 46467, 47482, 46465, 47487, 144..."
1,5152983289362907136,10,0,"POLYGON ((2820309.073 1155559.663, 2822108.462...","[46462, 46460, 45903, 46464, 47479, 47481, 464..."
2,5152984406054404096,13,0,"POLYGON ((2822108.462 1164010.057, 2822333.574...",[]
3,5152984440414142464,13,0,"POLYGON ((2821191.630 1164096.842, 2821416.603...","[381046, 1883866]"
4,5152984474773880832,13,0,"POLYGON ((2821416.603 1165154.061, 2821641.617...",[1883867]
...,...,...,...,...,...
39172,5160199263916916736,13,0,"POLYGON ((2824428.986 1214705.568, 2824654.807...",[]
39173,5160199298276655104,13,0,"POLYGON ((2824203.206 1213639.007, 2824428.986...",[]
39174,5160199332636393472,13,0,"POLYGON ((2825125.225 1213550.831, 2825351.144...",[]
39175,5160199366996131840,13,0,"POLYGON ((2825351.144 1214617.359, 2825577.105...",[]


In [11]:
# cleaning and expanding to location_id-poi_id pair
loc_no_geo = loc.drop(columns="geometry")

# explode preserves nan - preserves locs with no poi
locs_poi = loc_no_geo.explode(column="poi_within")

In [12]:
# Attach category and code of each POI to its (location, poi) pair. The left join
# keeps rows without a POI; the duplicated key column "id" is dropped afterwards.
poi_attributes = poi[["id", "category", "code"]]
locs_poi = locs_poi.merge(
    poi_attributes, left_on="poi_within", right_on="id", how="left"
).drop(columns=["id"])

In [13]:
locs_poi["code"].value_counts()

code
1500.0    2584175
2902.0      68205
5260.0      60568
2701.0      43132
5621.0      34700
           ...   
3301.0          7
3500.0          7
3106.0          6
2954.0          3
5652.0          2
Name: count, Length: 188, dtype: int64

In [14]:
# final cleaning
valid_pairs = locs_poi.dropna(subset=["poi_within"]).copy()
valid_pairs["code"] = valid_pairs["code"].astype(int).astype(str)

In [15]:
# some location does not have pois inside
len(valid_pairs["loc_id"].unique())

31494

## Get lda vector for locations

In [16]:
def _lda(df, categories=16, random_state=None):
    """Fit an LDA topic model on the OSM codes of each location.

    Each location is treated as a document whose words are the (string) OSM
    ``code`` values of the POIs inside it.

    Args:
        df: Frame of location-POI pairs with ``loc_id`` and ``code`` columns;
            ``code`` is expected to hold strings (tokens).
        categories: Number of LDA topics, i.e. the length of each output vector.
        random_state: Seed (or ``numpy.random.RandomState``) forwarded to
            ``LdaModel``. ``None`` keeps gensim's unseeded behaviour; pass an
            integer to obtain reproducible topic vectors.

    Returns:
        DataFrame with columns ``loc_id`` and ``poiValues``; ``poiValues`` holds
        one array of length ``categories`` per location. Topics that gensim
        leaves out of a document's sparse result appear as 0 -- presumably those
        below its minimum-probability threshold; confirm against the gensim
        version in use.
    """
    # A single groupby yields both the documents and their loc_id labels, so the
    # rows of the LDA output cannot get out of step with the index.
    codes_per_loc = df.groupby("loc_id")["code"].apply(list)
    texts = codes_per_loc.to_list()

    dct = Dictionary(texts)
    corpus = [dct.doc2bow(line) for line in texts]

    lda = LdaModel(corpus, num_topics=categories, random_state=random_state)
    vector = lda[corpus]

    # corpus2dense returns (topics x documents); transpose to one row per location.
    dense_ldavector = gensim.matutils.corpus2dense(vector, num_terms=categories).T

    poiValues = pd.Series(list(dense_ldavector), index=codes_per_loc.index.values, name="poiValues")
    poiValues.index.name = "loc_id"

    return poiValues.reset_index()

# Fixed seed so the topic vectors (and the matrices derived from them) are
# reproducible across kernel restarts.
poiValues = _lda(valid_pairs, categories=16, random_state=42)

In [17]:
locs_poi = loc.merge(poiValues, on="loc_id", how="left")

In [18]:
# Locations without any POI have no LDA vector after the left merge: give them
# an all-zero vector of length 16 (the number of topics used above).
locs_poi.loc[locs_poi["poiValues"].isna(), "poiValues"] = (
    locs_poi.loc[locs_poi["poiValues"].isna(), "poiValues"].apply(lambda _missing: np.zeros(16))
)

In [19]:
locs_poi["poiValues"] = locs_poi["poiValues"].apply(lambda x: np.array(x, dtype=np.float32))

In [20]:
locs_poi.drop(columns=["poi_within", "geometry"], inplace=True)

## Save loc poi file

In [21]:
locs_poi.to_csv(os.path.join("..", "data", "s2_loc_poi_level10_13.csv"), index=False)

In [22]:
with open(os.path.join("..", "data", "s2_loc_poi_level10_13.pk"), "wb") as handle:
    pickle.dump(locs_poi[["loc_id", "poiValues"]].to_dict("list"), handle, protocol=pickle.HIGHEST_PROTOCOL)

## Calculate distance

In [23]:
visited_loc = pd.read_csv(os.path.join("..", "data", "s2_loc_visited_level10_13.csv"), index_col="id")

In [24]:
# Restrict to the visited locations, order them by loc_id, and take their vectors.
visited_poiValues = (
    locs_poi.loc[locs_poi["loc_id"].isin(visited_loc["loc_id"].values)]
    .sort_values(by="loc_id")["poiValues"]
    .values
)

# Turn the object array of per-location vectors into one 2-D numeric array.
visited_poiValues = np.array([np.array(vec) for vec in visited_poiValues])

In [25]:
visited_poiValues.shape

(14881, 16)

In [26]:
# Pairwise correlation distance between the vectors, expanded to a square matrix.
corr_distance = squareform(pdist(visited_poiValues, metric="correlation"))

# Undefined distances (NaN) receive the largest defined distance in the matrix.
# NaN presumably stems from constant vectors such as all-zero ones -- verify.
corr_distance[np.isnan(corr_distance)] = np.nanmax(corr_distance)

In [29]:
corr_distance.shape

(14881, 14881)

In [30]:
corr_distance.dtype

dtype('float64')

## Save

In [28]:
save_pk_file("../data/matrix/function_13.pk", corr_distance)