In [1]:
import geopandas as gpd
import numpy as np
import pandas as pd
import os
import json
from pathlib import Path
import datetime
from shapely import wkt
from shapely.geometry import LineString
from tqdm import tqdm

from joblib import Parallel, delayed
import multiprocessing

import trackintel as ti
from trackintel.analysis.tracking_quality import temporal_tracking_quality

In [2]:
ti.__version__

'1.2.4'

In [3]:
# read file storage
# paths.json (one directory above the notebook) is parsed into CONFIG; later
# cells look up dataset directories in it by key (e.g. "raw_mobis").
Dataset_file = os.path.join("..", "paths.json")
with open(Dataset_file) as json_file:
    CONFIG = json.load(json_file)

# Read staypoints

In [4]:
# Load the raw staypoint table (sps.csv) from the directory configured under "raw_mobis".
sp = pd.read_csv(os.path.join(CONFIG[f"raw_mobis"], "sps.csv"))

In [5]:
# geometry
# Parse the WKT strings into geometries and wrap the table in a GeoDataFrame
# that is declared to be in EPSG:4326.
sp["geometry"] = gpd.GeoSeries.from_wkt(sp["geometry"])
sp = gpd.GeoDataFrame(sp, crs="EPSG:4326", geometry="geometry")

In [6]:
# Parse both timestamp columns into timezone-aware UTC datetimes;
# format='mixed' lets pandas infer the format of each value individually.
sp["started_at"] = pd.to_datetime(sp["started_at"], format='mixed', yearfirst=True, utc=True)
sp["finished_at"] = pd.to_datetime(sp["finished_at"], format='mixed', yearfirst=True, utc=True)

In [7]:
# to trackintel
# Convert the GeoDataFrame into a trackintel Staypoints object.
sp = ti.io.read_staypoints_gpd(sp)

In [8]:
type(sp)

trackintel.model.staypoints.Staypoints

# Read triplegs

In [9]:
# only a subset for testing
# tpls = pd.read_csv(os.path.join(CONFIG["raw_mobis"], "legs.csv"), nrows=100000)

# full, with mode 4
# NOTE(review): the columns are selected by position; presumably they are
# user_id, started_at, finished_at, mode and geometry - confirm against legs.csv.
tpls = pd.read_csv(os.path.join(CONFIG["raw_mobis"], "legs.csv"), usecols=[0, 1, 3, 4, 6])
# Drop the 6-character prefix of every mode label. The vectorised string slice
# propagates missing values instead of raising on them like a per-row lambda.
tpls["mode"] = tpls["mode"].str[6:]

In [10]:
# geometry
# Parse the WKT strings into geometries and wrap the table in a GeoDataFrame
# that is declared to be in EPSG:4326.
tpls["geometry"] = gpd.GeoSeries.from_wkt(tpls["geometry"])
tpls = gpd.GeoDataFrame(tpls, crs="EPSG:4326", geometry="geometry")

In [11]:
# construct linestring from multilinestring
def get_simple_line(multi):
    """Flatten a multi-part line geometry into a single LineString.

    The coordinates of every part in ``multi.geoms`` are concatenated in
    iteration order, so consecutive parts end up connected by a straight
    segment in the returned geometry.
    """
    points = []
    for part in multi.geoms:
        points.extend(part.coords)
    return LineString(points)

# Only rows whose geometry type is MultiLineString are rewritten; all other
# geometries are left as they are.
MultiLSFlag = tpls.geometry.type == "MultiLineString"
tpls.loc[MultiLSFlag, "geometry"] = tpls.loc[MultiLSFlag, "geometry"].apply(get_simple_line)

In [12]:
# Parse both timestamp columns into timezone-aware UTC datetimes;
# format='mixed' lets pandas infer the format of each value individually.
tpls["started_at"] = pd.to_datetime(tpls["started_at"], format='mixed', yearfirst=True, utc=True)
tpls["finished_at"] = pd.to_datetime(tpls["finished_at"], format='mixed', yearfirst=True, utc=True)

In [13]:
# to trackintel, filter invalid geometry
# Rows whose geometry fails the is_valid check are dropped before the
# conversion into a trackintel Triplegs object.
tpls = ti.io.read_triplegs_gpd(tpls[tpls.geometry.is_valid])

In [14]:
type(tpls)

trackintel.model.triplegs.Triplegs

# Final cleaning

In [15]:
tpls["user_id"].unique().shape, sp["user_id"].unique().shape

((5152,), (5130,))

In [16]:
# negative duration records have already been dropped
# Duration of every staypoint / tripleg in seconds.
sp["duration"] = (sp["finished_at"] - sp["started_at"]).dt.total_seconds()
tpls["duration"] = (tpls["finished_at"] - tpls["started_at"]).dt.total_seconds()

In [17]:
# Order both tables chronologically and replace the index with a fresh
# 0..n-1 range, which is then named "id".
sp = sp.sort_values(by="started_at").reset_index(drop=True)
tpls = tpls.sort_values(by="started_at").reset_index(drop=True)

sp.index.name = "id"
tpls.index.name = "id"

In [18]:
len(sp), len(tpls)

(4811539, 6391628)

In [19]:
tpls.head()

Unnamed: 0_level_0,started_at,finished_at,geometry,mode,user_id,duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2019-09-01 00:43:39.546000+00:00,2019-09-01 00:45:48.144000+00:00,"LINESTRING (-123.08238 44.00756, -123.08245 44...",Walk,AQXET,128.598
1,2019-09-01 01:02:29.142999+00:00,2019-09-01 01:04:06.545000+00:00,"LINESTRING (-123.08249 44.00756, -123.08265 44...",Walk,AQXET,97.402001
2,2019-09-01 01:04:06.545000+00:00,2019-09-01 01:20:53.158999+00:00,"LINESTRING (-123.08280 44.00765, -123.08264 44...",Car,AQXET,1006.613999
3,2019-09-01 02:52:34.696000+00:00,2019-09-01 02:52:57.516000+00:00,"LINESTRING (-73.55908 45.58575, -73.45893 45.4...",Subway,LCGZG,22.82
4,2019-09-01 03:38:14.168999+00:00,2019-09-01 03:44:58.186000+00:00,"LINESTRING (-123.10317 44.05745, -123.10339 44...",Car,AQXET,404.017001


In [20]:
sp.head()

Unnamed: 0_level_0,started_at,finished_at,geometry,purpose,detected_purpose,confidence,user_id,overseas,misdetected_completely,duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2019-09-01 00:45:48.144000+00:00,2019-09-01 01:02:29.142999+00:00,POINT (-123.08249 44.00756),unknown,unknown,1.0,AQXET,True,False,1000.998999
1,2019-09-01 01:20:53.158999+00:00,2019-09-01 03:38:14.168999+00:00,POINT (-123.10317 44.05745),unknown,unknown,1.0,AQXET,True,False,8241.01
2,2019-09-01 02:52:57.516000+00:00,2019-09-01 18:18:04.576999+00:00,POINT (-73.45860 45.45644),unknown,unknown,1.0,LCGZG,True,False,55507.060999
3,2019-09-01 03:44:58.186000+00:00,2019-09-01 15:49:36.105000+00:00,POINT (-123.10147 44.03391),unknown,unknown,1.0,AQXET,True,False,43477.919
4,2019-09-01 08:46:09.986000+00:00,2019-09-01 09:05:17.973999+00:00,POINT (8.86188 46.03543),unknown,unknown,1.0,MMOQN,False,False,1147.987999


# Filter duplicates

In [21]:
def _alter_diff(df):
    df.sort_values(by="started_at", inplace=True)
    df["diff"] = pd.NA
    # for correct dtype
    df["st_next"] = df["started_at"]

    diff = df["started_at"].iloc[1:].reset_index(drop=True) - df["finished_at"].iloc[:-1].reset_index(drop=True)
    df["diff"].iloc[:-1] = diff.dt.total_seconds()
    df["st_next"].iloc[:-1] = df["started_at"].iloc[1:].reset_index(drop=True)

    df.loc[df["diff"] < 0, "finished_at"] = df.loc[df["diff"] < 0, "st_next"]

    df["started_at"], df["finished_at"] = pd.to_datetime(df["started_at"]), pd.to_datetime(df["finished_at"])
    df["duration"] = (df["finished_at"] - df["started_at"]).dt.total_seconds()

    # print(df.loc[df["diff"] < 0])
    df.drop(columns=["diff", "st_next"], inplace=True)
    df.drop(index=df[df["duration"] <= 0].index, inplace=True)

    return df

def filter_duplicates(sp, tpls):
    """Remove overlapping/duplicated records across staypoints and triplegs.

    Both tables are tagged with a ``type`` column, combined into one frame and
    cleaned per user with ``_alter_diff``. The cleaned frame is then split back
    into the two record types.

    Both inputs need an ``id`` column; a ``type`` column is added to them in
    place. Returns ``(staypoints, triplegs)``, each indexed by ``id`` and
    reduced to a fixed set of columns.
    """
    # tag the origin of every record so the tables can be separated again
    sp["type"] = "sp"
    tpls["type"] = "tpl"
    # an outer merge without explicit keys joins on all shared columns;
    # because "type" differs between the tables, no two rows are fused
    combined = pd.merge(sp, tpls, how="outer")

    cleaned = combined.groupby("user_id", as_index=False).apply(_alter_diff)

    sp_columns = [
        "id",
        "user_id",
        "started_at",
        "finished_at",
        "geometry",
        "duration",
        "purpose",
        "detected_purpose",
        "overseas",
    ]
    tpls_columns = ["id", "user_id", "started_at", "finished_at", "duration", "mode", "geometry"]

    is_staypoint = cleaned["type"] == "sp"
    is_tripleg = cleaned["type"] == "tpl"
    sp_clean = cleaned.loc[is_staypoint].drop(columns=["type"])[sp_columns]
    tpls_clean = cleaned.loc[is_tripleg].drop(columns=["type"])[tpls_columns]

    return sp_clean.set_index("id"), tpls_clean.set_index("id")

# reset_index() turns the "id" index into a regular column (filter_duplicates
# selects "id" and restores it as the index) and hands over new frames.
sp, tpls = filter_duplicates(sp.reset_index(), tpls.reset_index())

In [22]:
len(sp), len(tpls)

(4804194, 6381957)

# Read quality filter file

In [23]:
# Location of the cached list of users that passed the quality filter.
quality_path = os.path.join("..","data", "quality")
quality_file = os.path.join(quality_path, "mobis_filtered.csv")
if Path(quality_file).is_file():
    valid_users = pd.read_csv(quality_file)["user_id"].values
else:
    # No cache yet: create the output directory (no error if it already
    # exists) and bind valid_users anyway, so the following cells do not fail
    # with a NameError on a fresh run. The real list is computed in the
    # "Generate user filter" section.
    os.makedirs(quality_path, exist_ok=True)
    valid_users = np.array([], dtype=object)

In [24]:
len(valid_users)

2113

# Define activity

In [25]:
# Every staypoint counts as an activity unless one of the rules below applies.
sp["is_activity"] = True

# wait is not an activity
sp.loc[sp["purpose"] == "wait", "is_activity"] = False

# shorter than 25min
# (only applied to staypoints with purpose "unknown"; duration is in seconds)
sp.loc[(sp["purpose"] == "unknown") & (sp["duration"] < 25 * 60), "is_activity"] = False

In [26]:
sp["is_activity"].value_counts()

is_activity
True     3589215
False    1214979
Name: count, dtype: int64

# Generate trips

In [27]:
# the trackintel trip generation
# Returns the updated staypoints and triplegs together with the new trips table.
# NOTE(review): gap_threshold is presumably in minutes, matching the 25 min
# activity rule above - confirm against the trackintel documentation.
sp, tpls, trips = ti.preprocessing.triplegs.generate_trips(sp, tpls, gap_threshold=25, add_geometry=False)

In [28]:
len(sp), len(tpls), len(trips)

(4804194, 6381957, 3466359)

# Generate user filter

In [30]:
def _split_overlaps(source, granularity="day", max_iter=60):
    """Split records that span several days (or hours) at the day/hour borders.

    Parameters
    ----------
    source : pd.DataFrame
        Records with datetime ``started_at`` and ``finished_at`` columns.
        The frame itself is not modified.
    granularity : {"day", "hour"}, default "day"
        Border at which records are cut. Any value other than "day" is
        treated as "hour".
    max_iter : int, default 60
        Maximum number of splitting rounds per granularity. Every round cuts
        one day/hour off each record that still crosses a border.

    Returns
    -------
    pd.DataFrame
        Frame with a fresh range index in which (up to ``max_iter`` rounds) no
        record crosses a border. An existing ``duration`` column is
        recomputed as ``finished_at - started_at`` (a timedelta).
    """
    import warnings

    if granularity == "hour":
        # every split over hour splits also over day
        # this way to split of an entry over a month takes 30+24 iterations instead of 30*24.
        df = _split_overlaps(source, granularity="day", max_iter=max_iter)
    else:
        df = source.copy()

    change_flag = _get_split_index(df, granularity=granularity)
    iter_count = 0

    # lowercase "h": the uppercase hour alias is deprecated in recent pandas
    freq = "D" if granularity == "day" else "h"
    # Iteratively split one day/hour from multi day/hour entries until no entry spans over multiple days/hours
    while change_flag.sum() > 0:
        # the remainder of every affected record becomes a new entry
        new_df = df.loc[change_flag].copy()
        # calculate new finished_at timestamp (00:00 midnight); adding the
        # smallest time step first makes ceil move on to the next border even
        # when started_at lies exactly on one
        df.loc[change_flag, "finished_at"] = (df.loc[change_flag, "started_at"] + pd.Timestamp.resolution).dt.ceil(freq)

        # create new entries with remaining timestamp
        new_df["started_at"] = df.loc[change_flag, "finished_at"]

        df = pd.concat((df, new_df), ignore_index=True, sort=True)

        change_flag = _get_split_index(df, granularity=granularity)
        iter_count += 1
        if iter_count >= max_iter:
            break

    if change_flag.any():
        # the iteration limit was hit before everything could be split
        warnings.warn(
            f"_split_overlaps stopped after {max_iter} iterations; "
            f"{int(change_flag.sum())} records still span multiple {granularity}s."
        )

    if "duration" in df.columns:
        df["duration"] = df["finished_at"] - df["started_at"]
    return df

def _get_split_index(df, granularity="day"):
    freq = "D" if granularity == "day" else "H"
    cond1 = df["started_at"].dt.floor(freq) != (df["finished_at"] - pd.Timedelta.resolution).dt.floor(freq)
    # catch corner case where both on same border and subtracting would lead to error
    cond2 = df["started_at"] != df["finished_at"]
    return cond1 & cond2
    
def _filter_user(df, min_thres, mean_thres):
    consider = df.loc[df["quality"] != 0]
    if (consider["quality"].min() > min_thres) and (consider["quality"].mean() > mean_thres):
        return df


def _get_tracking_quality(df, window_size):

    weeks = (df["finished_at"].max() - df["started_at"].min()).days // 7
    start_date = df["started_at"].min().date()

    quality_list = []
    # construct the sliding week gdf
    for i in range(0, weeks - window_size + 1):
        curr_start = datetime.datetime.combine(start_date + datetime.timedelta(weeks=i), datetime.time())
        curr_end = datetime.datetime.combine(curr_start + datetime.timedelta(weeks=window_size), datetime.time())

        # the total df for this time window
        cAll_gdf = df.loc[(df["started_at"] >= curr_start) & (df["finished_at"] < curr_end)]
        if cAll_gdf.shape[0] == 0:
            continue
        total_sec = (curr_end - curr_start).total_seconds()

        quality_list.append([i, cAll_gdf["duration"].sum() / total_sec])
    ret = pd.DataFrame(quality_list, columns=["timestep", "quality"])
    ret["user_id"] = df["user_id"].unique()[0]
    return ret

def calculate_user_quality(sp, trips, file_path, quality_filter):
    """Select the users whose tracking quality is good enough and save them.

    Parameters
    ----------
    sp, trips : DataFrame
        Staypoints and trips with ``user_id``, ``started_at`` and
        ``finished_at`` columns. Both frames are modified in place (time zone
        removed, ``type`` column added), so callers should pass copies.
    file_path : str
        CSV file that receives one row per selected user (``user_id`` and
        mean ``quality``).
    quality_filter : dict
        ``day_filter``: users must be tracked for more than this many days.
        ``window_size``: length of the sliding quality window in weeks.
        ``min_thres`` / ``mean_thres`` (optional, used together): thresholds
        passed to ``_filter_user``.

    Returns
    -------
    numpy.ndarray
        The ids of the selected users.
    """

    # drop the time zone information so all timestamps are tz-naive
    trips["started_at"] = pd.to_datetime(trips["started_at"]).dt.tz_localize(None)
    trips["finished_at"] = pd.to_datetime(trips["finished_at"]).dt.tz_localize(None)
    sp["started_at"] = pd.to_datetime(sp["started_at"]).dt.tz_localize(None)
    sp["finished_at"] = pd.to_datetime(sp["finished_at"]).dt.tz_localize(None)

    # merge trips and staypoints
    print("starting merge", sp.shape, trips.shape)
    sp["type"] = "sp"
    trips["type"] = "tpl"
    all_df = pd.concat([sp, trips])
    print("finished merge", all_df.shape)
    print("*" * 50)
    # cut records at day borders, then recompute their duration in seconds
    all_df = _split_overlaps(all_df, granularity="day")
    all_df["duration"] = (all_df["finished_at"] - all_df["started_at"]).dt.total_seconds()

    print(len(all_df["user_id"].unique()))

    # get quality
    total_quality = temporal_tracking_quality(all_df, granularity="all")
    # get tracking days
    # NOTE(review): .values assigns by position, which assumes that
    # temporal_tracking_quality returns one row per user in the same order as
    # groupby("user_id") - confirm.
    total_quality["days"] = (
        all_df.groupby("user_id").apply(lambda x: (x["finished_at"].max() - x["started_at"].min()).days).values
    )
    # filter based on days
    user_filter_day = (
        total_quality.loc[(total_quality["days"] > quality_filter["day_filter"])]
        .reset_index(drop=True)["user_id"]
        .unique()
    )
    # filter based on sliding quality
    sliding_quality = (
        all_df.groupby("user_id")
        .apply(_get_tracking_quality, window_size=quality_filter["window_size"])
        .reset_index(drop=True)
    )

    filter_after_day = sliding_quality.loc[sliding_quality["user_id"].isin(user_filter_day)]

    if "min_thres" in quality_filter:
        # filter based on quality
        # (_filter_user returns None for rejected users, so their rows vanish)
        filter_after_day = (
            filter_after_day.groupby("user_id")
            .apply(_filter_user, min_thres=quality_filter["min_thres"], mean_thres=quality_filter["mean_thres"])
            .reset_index(drop=True)
            .dropna()
        )

    # one row per remaining user with the mean of its window qualities
    filter_after_user_quality = filter_after_day.groupby("user_id", as_index=False)["quality"].mean()

    print("final selected user", filter_after_user_quality.shape[0])
    filter_after_user_quality.to_csv(file_path, index=False)
    return filter_after_user_quality["user_id"].values

# Thresholds read by calculate_user_quality: more than 50 tracked days, a
# sliding window of 5 weeks, and per-window quality with minimum > 0.5 and
# mean > 0.6.
quality_filter = {"day_filter": 50, "window_size": 5, "min_thres": 0.5, "mean_thres": 0.6}
# Copies are passed because calculate_user_quality modifies its inputs; the
# result is also written to quality_file.
valid_users = calculate_user_quality(sp.copy().reset_index(), trips.copy().reset_index(), quality_file, quality_filter)

starting merge (4804194, 13) (3466356, 6)
finished merge (8270550, 16)
**************************************************
5168


  all_df.groupby("user_id")


final selected user 2113


# Filter
## valid users

In [29]:
# Keep only the records of users that passed the quality filter.
sp = sp.loc[sp["user_id"].isin(valid_users)]
tpls = tpls.loc[tpls["user_id"].isin(valid_users)]
trips = trips.loc[trips["user_id"].isin(valid_users)]

In [30]:
len(sp["user_id"].unique()), len(tpls["user_id"].unique()), len(trips["user_id"].unique())

(2113, 2113, 2113)

In [31]:
len(sp), len(tpls), len(trips)

(1578245, 2133677, 1124205)

## Switzerland records

In [111]:
def _filter_within_swiss(stps, swissBound):
    """Keep only the staypoints that lie inside the boundary geometry.

    The staypoints are reprojected to the CRS of ``swissBound`` for the
    containment test and returned in their original CRS.
    """
    # remember the CRS of the input so the result can be converted back
    original_crs = stps.crs
    projected = stps.to_crs(swissBound.crs)

    # the containment test runs in parallel over chunks of the geometry column
    projected["within"] = _apply_parallel(projected["geometry"], _apply_extract, swissBound)
    inside = projected.loc[projected["within"] == True].drop(columns=["within"])

    return inside.to_crs(original_crs)
    
def _apply_extract(df, swissBound):
    """Worker for _apply_parallel: test every geometry in ``df`` for containment.

    Only the first geometry of ``swissBound`` is used as the boundary. A tqdm
    progress bar is shown while the elements are processed.
    """
    boundary = swissBound["geometry"].to_numpy()[0]
    # registers progress_apply on pandas objects
    tqdm.pandas(desc="pandas bar")
    return df.progress_apply(lambda geom: boundary.contains(geom))


def _apply_parallel(df, func, other, n=-1):
    """Apply ``func`` to consecutive chunks of ``df`` in parallel.

    Parameters
    ----------
    df : pandas Series or DataFrame
        Data that is split row-wise (with ``iloc``) into chunks.
    func : callable
        Called as ``func(chunk, other)``; has to return a pandas object.
    other : object
        Extra argument handed unchanged to every call.
    n : int or None, default -1
        Passed to joblib as ``n_jobs`` and used to derive the number of
        chunks: a negative value counts back from the number of CPUs
        (-1 = all of them). ``None`` is treated as -1.

    Returns
    -------
    pandas object
        The per-chunk results concatenated in the original row order.
    """
    if n is None:
        n = -1

    n_rows = len(df)
    if n_rows == 0:
        # nothing to distribute; also avoids a zero slice step below
        return func(df, other)

    cpu_count = multiprocessing.cpu_count()
    n_chunks = cpu_count + n + 1 if n < 0 else (n or 1)
    # Never use more chunks than rows. Previously this cap was overwritten by
    # the branch that followed it, so small inputs produced a slice step of 0
    # and raised a ValueError.
    n_chunks = max(1, min(n_chunks, n_rows))

    step = int(n_rows / n_chunks + 0.5)
    bounds = list(range(0, n_rows, step))
    bounds.append(n_rows)
    slices = (slice(start, stop) for start, stop in zip(bounds[:-1], bounds[1:]))

    results = Parallel(n_jobs=n, verbose=0)(delayed(func)(df.iloc[slc], other) for slc in slices)
    return pd.concat(results)

# Load the boundary shapefile and keep only the staypoints inside it; the
# counts before and after are printed for reference.
swissBoundary = gpd.read_file(os.path.join("..", "data", "swiss", "swiss.shp"))

print("Before spatial filtering: ", sp.shape[0])
sp_swiss = _filter_within_swiss(sp, swissBoundary)
print("After spatial filtering: ", sp_swiss.shape[0])

Before spatial filtering:  1578245
After spatial filtering:  1460189


In [112]:
# sp_swiss = sp

## Activity staypoints

In [113]:
# Keep only the staypoints flagged as activities in the "Define activity" section.
sp_swiss_act = sp_swiss.loc[sp_swiss["is_activity"] == True]

In [114]:
len(sp_swiss_act)

1094017

# Assign travel modes

In [115]:
# Length of every tripleg, measured after reprojecting to EPSG:2056
# (presumably chosen so that lengths come out in metres - confirm).
tpls["length"] = tpls.to_crs("EPSG:2056").length

In [116]:
#  get the number of triplegs for each trip
# and attach it to every tripleg as the column "triplegNum"
groupsize = tpls.groupby("trip_id").size().to_frame(name="triplegNum").reset_index()
tpls_num = tpls.merge(groupsize, on="trip_id")

In [117]:
# trips only with 1 triplegs
# (mode and length are taken directly from that tripleg)
res1 = tpls_num.loc[tpls_num["triplegNum"] == 1][["trip_id", "length", "mode"]].copy()

# get the mode and length of remaining trips
remain = tpls_num.loc[tpls_num["triplegNum"] != 1].copy()

# after sorting by descending length, the first tripleg of every trip is its
# longest one; its mode becomes the mode of the trip
remain.sort_values(by="length", inplace=True, ascending=False)
mode = remain.groupby("trip_id").head(1).reset_index(drop=True)[["mode", "trip_id"]]

# the trip length is the sum over all of its triplegs
length = remain.groupby("trip_id")["length"].sum().reset_index()
res2 = mode.merge(length, on="trip_id")

# merge
res = pd.concat([res1, res2])

# cleaning
# (index by trip id, named "id", so the table can be joined onto trips)
res.rename(columns={"trip_id": "id"}, inplace=True)
res.set_index("id", inplace=True)

In [118]:
# merge to trip df
# (index-based left join: trips without an entry in res get missing length/mode)
trips_mode = trips.join(res, how="left")

In [119]:
# Mapping from the detailed mode labels to the coarse categories used downstream.
encode_dict = {
    "Car": "Car",
    "Walk": "Walk",
    "Bicycle": "Bicycle",
    "Bus": "Bus",
    "LightRail": "Train",
    "Train": "Train",
    "Tram": "Tram",
    "RegionalTrain": "Train",
    "Ebicycle": "Bicycle",
    "MotorbikeScooter": "Car",
    "Motorbike": "Car",
    "Subway": "Tram",
    "Airplane": "Other",
    "Boat": "Other",
    "Ski": "Other",
    "TaxiUber": "Car",
    "CarsharingMobility": "Car",
    "Scooter": "Bicycle",
    "Cablecar": "Bus",
    "RidepoolingPikmi": "Car",
    "Etrottinett": "Bicycle",
    "Bikesharing": "Bicycle",
    "Escooter": "Bicycle",
    "Ferry": "Other",
}
# Series.map keeps missing modes (trips without any tripleg after the left
# join) as missing instead of failing with KeyError on NaN.
encoded_mode = trips_mode["mode"].map(encode_dict)
# A label that is present but not covered by encode_dict is still an error.
unmapped = encoded_mode.isna() & trips_mode["mode"].notna()
if unmapped.any():
    raise KeyError(f"modes missing from encode_dict: {list(trips_mode.loc[unmapped, 'mode'].unique())}")
trips_mode["mode"] = encoded_mode

In [120]:
trips_mode["mode"].value_counts()

mode
Car        550068
Walk       367637
Train       77182
Bicycle     60633
Bus         44164
Tram        19667
Other        4854
Name: count, dtype: int64

In [121]:
trips_mode.head()

Unnamed: 0_level_0,user_id,started_at,finished_at,origin_staypoint_id,destination_staypoint_id,length,mode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
621,AAGAF,2019-10-10 05:43:17.674999+00:00,2019-10-10 06:14:49.141999+00:00,295826.0,303827.0,11615.408548,Car
622,AAGAF,2019-10-10 06:53:54.841000+00:00,2019-10-10 07:03:24.426000+00:00,303827.0,304385.0,2104.855858,Car
623,AAGAF,2019-10-10 08:18:20.864000+00:00,2019-10-10 11:10:24.605999+00:00,304385.0,307381.0,4847.706521,Walk
624,AAGAF,2019-10-10 13:54:34.799339+00:00,2019-10-10 14:30:45.187999+00:00,307381.0,310069.0,12621.909935,Car
625,AAGAF,2019-10-10 15:07:07.127239+00:00,2019-10-10 15:30:34.460999+00:00,310069.0,311096.0,10288.796297,Car


In [122]:
# combine with sp df
# Staypoints that have a previous trip receive the length and mode of that trip.
with_pre_trip = sp_swiss_act.loc[~sp_swiss_act["prev_trip_id"].isna()].copy()

with_pre_res = with_pre_trip.merge(trips_mode.reset_index()[["length", "mode", "id"]], how="left", left_on="prev_trip_id", right_on="id")

# Staypoints without a previous trip get length 0 and the placeholder mode
# "None" (a string, replaced later in "Assign unknown travel modes").
no_pre_trip = sp_swiss_act.loc[sp_swiss_act["prev_trip_id"].isna()].copy()
no_pre_trip["length"] = 0
no_pre_trip["mode"] = "None"

# concat result
# (the trip bookkeeping columns are no longer needed afterwards)
sp_trip = pd.concat([with_pre_res, no_pre_trip]).drop(columns=["prev_trip_id", "next_trip_id", "trip_id", "id"])

In [123]:
len(sp_swiss_act), len(sp_trip), sp_trip["mode"].value_counts()

(1094017,
 1094017,
 mode
 Car        505835
 Walk       327049
 Train       72166
 None        69844
 Bicycle     57758
 Bus         41264
 Tram        17419
 Other        2682
 Name: count, dtype: int64)

In [169]:
# Order the staypoints per user and chronologically, then replace the index
# with a fresh 0..n-1 range named "id".
sp_trip.sort_values(by=["user_id", "started_at"], inplace=True)
sp_trip.reset_index(drop=True, inplace=True)
sp_trip.index.name = "id"

In [170]:
sp_trip.head()

Unnamed: 0_level_0,detected_purpose,duration,finished_at,geometry,is_activity,overseas,purpose,started_at,user_id,length,mode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,unknown,65563.533999,2019-10-10 05:43:17.674999+00:00,POINT (7.56522 47.54562),True,False,unknown,2019-10-09 11:30:34.141000+00:00,AAGAF,0.0,
1,unknown,2345.699001,2019-10-10 06:53:54.841000+00:00,POINT (7.56376 47.54795),True,False,unknown,2019-10-10 06:14:49.141999+00:00,AAGAF,11615.408548,Car
2,unknown,4496.438,2019-10-10 08:18:20.864000+00:00,POINT (7.58173 47.55644),True,False,unknown,2019-10-10 07:03:24.426000+00:00,AAGAF,2104.855858,Car
3,unknown,9850.19334,2019-10-10 13:54:34.799339+00:00,POINT (7.58173 47.55641),True,False,unknown,2019-10-10 11:10:24.605999+00:00,AAGAF,4847.706521,Walk
4,unknown,19240.168,2019-10-11 11:36:29.907000+00:00,POINT (7.56513 47.54571),True,False,unknown,2019-10-11 06:15:49.739000+00:00,AAGAF,8667.529201,Bus


## Assign unknown travel modes

In [187]:
def assign_unknown_modes(df):
    """Replace the placeholder mode "None" by a neighbouring known mode.

    Placeholders take the most recent known mode before them (forward fill);
    leading placeholders take the first known mode after them (backward
    fill). ``df`` is modified in place and also returned.
    """
    is_unknown = df["mode"] == "None"
    df.loc[is_unknown, "mode"] = pd.NA
    forward_filled = df["mode"].ffill(axis=0)
    df["mode"] = forward_filled.bfill(axis=0)
    return df

# Fill the modes separately for every user so that no mode leaks from one
# user to the next; afterwards the index is reset and named "id" again.
sp_trip_fill_mode = sp_trip.groupby("user_id", as_index=False).apply(assign_unknown_modes).reset_index(drop=True)
sp_trip_fill_mode.index.name = "id"

In [188]:
len(sp_swiss_act), len(sp_trip_fill_mode), sp_trip_fill_mode["mode"].value_counts()

(1094017,
 1094017,
 mode
 Car        536895
 Walk       352159
 Train       78154
 Bicycle     61527
 Bus         43879
 Tram        18596
 Other        2807
 Name: count, dtype: int64)

# Generate locations

In [189]:
# Cluster the staypoints into locations with trackintel. agg_level="dataset"
# builds the locations across all users.
# NOTE(review): epsilon=20 is presumably a distance in metres and
# num_samples=1 presumably lets a single staypoint form a location -
# confirm against the trackintel documentation.
sp_locs, locs = sp_trip_fill_mode.as_staypoints.generate_locations(
    epsilon=20, num_samples=1, distance_metric="haversine", agg_level="dataset", n_jobs=-1
)

## Filter noise staypoints

In [190]:
# Drop the staypoints that were not assigned to any location.
sp_filter = sp_locs.loc[~sp_locs["location_id"].isna()].copy()
print("After filter non-location staypoints: ", sp_filter.shape[0])

After filter non-location staypoints:  1094017


## Save locations

In [192]:
# Keep one row per location id and only the locations that are referenced by
# the remaining staypoints.
locs = locs[~locs.index.duplicated(keep="first")]
filtered_locs = locs.loc[locs.index.isin(sp_filter["location_id"].unique())]

# locations without duplication, user_id have no meaning
filtered_locs.as_locations.to_csv(os.path.join("..", "data", f"loc.csv"))
# both printed numbers should agree
print("Location size: ", sp_filter["location_id"].unique().shape[0], filtered_locs.shape[0])

Location size:  162303 162303


## Merge staypoints

In [193]:
# Reduce the staypoints to the columns needed from here on and renumber them.
sp_filter = sp_filter[["user_id", "started_at", "finished_at", "geometry", "length", "mode", "location_id"]].reset_index(drop=True)
sp_filter.index.name = "id"

# Merge staypoints with trackintel; an empty frame is passed as triplegs.
# For merged staypoints the location, mode and geometry of the first one are
# kept and the lengths are summed.
# NOTE(review): presumably only consecutive staypoints at the same location
# that are at most max_time_gap apart are merged - confirm against the
# trackintel documentation.
sp_merged = sp_filter.as_staypoints.merge_staypoints(
    triplegs=pd.DataFrame([]), max_time_gap="1min", agg={"location_id": "first", "mode":"first", "length":"sum", "geometry": "first"}
)
print("After staypoints merging: ", sp_merged.shape[0])

  sp_tpls = pd.concat([sp_merge, tpls_merge]).sort_values(by=["user_id", "started_at"])


After staypoints merging:  1079922


In [194]:
sp_merged.head()

Unnamed: 0_level_0,user_id,started_at,finished_at,location_id,mode,length,geometry
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,AAGAF,2019-10-09 11:30:34.141000+00:00,2019-10-10 05:43:17.674999+00:00,0,Car,0.0,POINT (7.56522 47.54562)
1,AAGAF,2019-10-10 06:14:49.141999+00:00,2019-10-10 06:53:54.841000+00:00,1,Car,11615.408548,POINT (7.56376 47.54795)
2,AAGAF,2019-10-10 07:03:24.426000+00:00,2019-10-10 08:18:20.864000+00:00,2,Car,2104.855858,POINT (7.58173 47.55644)
3,AAGAF,2019-10-10 11:10:24.605999+00:00,2019-10-10 13:54:34.799339+00:00,2,Walk,4847.706521,POINT (7.58173 47.55641)
4,AAGAF,2019-10-11 06:15:49.739000+00:00,2019-10-11 11:36:29.907000+00:00,0,Bus,8667.529201,POINT (7.56513 47.54571)


# Calculate staypoint duration and activity duration

In [195]:
# The per-user chronological order is required by the activity duration
# computed in the next cells.
sp_merged.sort_values(by=["user_id", "started_at"], inplace=True)

# staypoint duration in whole minutes (rounded down)
sp_merged["duration"] = (sp_merged["finished_at"] - sp_merged["started_at"]).dt.total_seconds() // 60

In [196]:
def get_act_duration(df):
    """Activity duration in whole minutes for the staypoints of one user.

    The activity duration of a staypoint is the time between the end of the
    previous staypoint and its own end, i.e. it includes the travel time to
    reach it. The first staypoint has no predecessor and falls back to its
    own ``duration``.

    Parameters
    ----------
    df : pd.DataFrame
        Chronologically ordered staypoints with a datetime ``finished_at``
        column and a ``duration`` column in minutes.

    Returns
    -------
    pd.Series
        Series named ``act_duration`` with the index of ``df``; ``df`` itself
        is not modified.
    """
    act_duration = df["finished_at"].diff().dt.total_seconds() // 60
    if len(act_duration) > 0:
        # Assign on the standalone Series: the chained form
        # ``df[col].iloc[0] = ...`` is a silent no-op under pandas Copy-on-Write.
        act_duration.iloc[0] = df["duration"].iloc[0]
    return act_duration.rename("act_duration")

# .values assigns the results by position: this relies on sp_merged being
# sorted by user_id (done in the previous cell) so that the per-user results
# come back in row order.
sp_merged["act_duration"] = sp_merged.groupby("user_id").apply(get_act_duration).values

In [197]:
sp_merged.head()

Unnamed: 0_level_0,user_id,started_at,finished_at,location_id,mode,length,geometry,duration,act_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,AAGAF,2019-10-09 11:30:34.141000+00:00,2019-10-10 05:43:17.674999+00:00,0,Car,0.0,POINT (7.56522 47.54562),1092.0,1092.0
1,AAGAF,2019-10-10 06:14:49.141999+00:00,2019-10-10 06:53:54.841000+00:00,1,Car,11615.408548,POINT (7.56376 47.54795),39.0,70.0
2,AAGAF,2019-10-10 07:03:24.426000+00:00,2019-10-10 08:18:20.864000+00:00,2,Car,2104.855858,POINT (7.58173 47.55644),74.0,84.0
3,AAGAF,2019-10-10 11:10:24.605999+00:00,2019-10-10 13:54:34.799339+00:00,2,Walk,4847.706521,POINT (7.58173 47.55641),164.0,336.0
4,AAGAF,2019-10-11 06:15:49.739000+00:00,2019-10-11 11:36:29.907000+00:00,0,Bus,8667.529201,POINT (7.56513 47.54571),320.0,1301.0


# Validate and save

In [198]:
print("User size: ", len(sp_merged["user_id"].unique()))

User size:  2112


In [199]:
# Save the full staypoint dataset; the "id" index is written as the first column.
sp_merged.to_csv(os.path.join("..", "data", f"sp_all.csv"))

# Generate small dataset

In [200]:
# Reload the saved dataset. Note that this rebinds `sp`, which held the
# trackintel staypoints in the earlier sections.
sp = pd.read_csv(os.path.join("..", "data", f"sp_all.csv"), index_col="id")

sp

Unnamed: 0_level_0,user_id,started_at,finished_at,location_id,mode,length,geometry,duration,act_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,AAGAF,2019-10-09 11:30:34.141000+00:00,2019-10-10 05:43:17.674999+00:00,0,Car,0.000000,POINT (7.565219252705515 47.545616383391014),1092.0,1092.0
1,AAGAF,2019-10-10 06:14:49.141999+00:00,2019-10-10 06:53:54.841000+00:00,1,Car,11615.408548,POINT (7.563759803278834 47.54794768304769),39.0,70.0
2,AAGAF,2019-10-10 07:03:24.426000+00:00,2019-10-10 08:18:20.864000+00:00,2,Car,2104.855858,POINT (7.581727914351441 47.5564374119498),74.0,84.0
3,AAGAF,2019-10-10 11:10:24.605999+00:00,2019-10-10 13:54:34.799339+00:00,2,Walk,4847.706521,POINT (7.581729846308135 47.55641482529889),164.0,336.0
4,AAGAF,2019-10-11 06:15:49.739000+00:00,2019-10-11 11:36:29.907000+00:00,0,Bus,8667.529201,POINT (7.565127647736598 47.54571219157556),320.0,1301.0
...,...,...,...,...,...,...,...,...,...
1094012,ZZYIC,2019-10-31 15:26:16.467000+00:00,2019-10-31 17:07:26.619999+00:00,11963,Train,32827.769123,POINT (8.779955508622193 47.37271491009289),101.0,182.0
1094013,ZZYIC,2019-10-31 17:10:49.924000+00:00,2019-10-31 19:02:57.200000+00:00,162302,Car,4333.471814,POINT (8.778435090438501 47.37405248736652),112.0,115.0
1094014,ZZYIC,2019-10-31 19:22:37.476999+00:00,2019-11-01 07:45:05.378999+00:00,162235,Car,0.000000,POINT (8.777547641952243 47.375817643427915),742.0,762.0
1094015,ZZYIC,2019-11-01 07:51:34.239000+00:00,2019-11-01 09:09:36.236000+00:00,162235,Car,4329.292186,POINT (8.77725754861859 47.375938470094724),78.0,84.0


In [203]:
# Fixed seed so that the sample of users is reproducible.
np.random.seed(0)

# 500 distinct users drawn without replacement
selected_user = np.random.choice(sp["user_id"].unique(), 500, replace=False)
selected_user

array(['VZZAM', 'NSEWU', 'SDAJB', 'LCGQUCB', 'LZIMGWA', 'IHWEP', 'MPZFJ',
       'LVFRPHG', 'HGMAP', 'CLMUT', 'BDVLG', 'HSUHS', 'LITFYCJ',
       'LNGQHDJ', 'LNNJEVU', 'PHPOM', 'QKXSO', 'DGSPE', 'GJZXR',
       'LMLANBV', 'WOFJG', 'HQZVG', 'DTVCR', 'PVSHL', 'ZWWMG', 'DSPHZ',
       'BFCVL', 'RWBCM', 'RYXZB', 'LFJMGHT', 'RKHPL', 'SRYVJ', 'LGFRVXC',
       'LQSZMTA', 'NCCNT', 'XIBZX', 'NOSKP', 'LSXGDJC', 'LDHQVGB',
       'VMOGN', 'XRZPJ', 'DTTTZ', 'SLCTX', 'MPLLT', 'DBHIN', 'LLUBDNN',
       'MUMYH', 'OVAPK', 'UZRPW', 'LPJJH', 'XSQXM', 'GKZHD', 'LABRDMD',
       'LPVSTLB', 'GXJHH', 'GITBE', 'JEKZW', 'RWWSJ', 'EIIUK', 'ATCBP',
       'CDOZT', 'TCYGL', 'YYYDG', 'HATLY', 'KDGMQ', 'YNHAI', 'UZXDH',
       'DJRPT', 'ECMXT', 'VSFMS', 'MEMZJ', 'AHTYK', 'ZGBOE', 'ZTXQQ',
       'NWWCA', 'PKDMA', 'SGXSL', 'QDSXL', 'EXBKA', 'ODNKT', 'VLZBZ',
       'YPPBV', 'LPTXVGG', 'NQDWC', 'LQUAVKZ', 'LJNXIWU', 'FIYVM',
       'TOOWP', 'QOIUS', 'LIDMGKW', 'VRKML', 'FHQZN', 'LEQBLPC', 'RBHKK',
       'TGGVY', 

In [204]:
# Save the staypoints of the sampled users as the small dataset.
sp.loc[sp["user_id"].isin(selected_user)].to_csv(os.path.join("..", "data", f"sp_small.csv"))

# Generate all visited location dataset

In [154]:
# Extend the module search path with the parent of sys.path[0] so that the
# project package `utils` can be imported.
# NOTE(review): presumably sys.path[0] resolves to the notebook directory - confirm.
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from utils.utils import load_data

In [155]:
# Reload the staypoints together with a location table.
# NOTE(review): loc_s2_level10_13.csv is not written anywhere in this
# notebook; presumably it is produced by a separate step - confirm.
sp = pd.read_csv(os.path.join("..", "data", "sp_all.csv"), index_col="id")
loc = pd.read_csv(os.path.join("..", "data", "loc_s2_level10_13.csv"), index_col="id")

In [158]:
# load_data is a project helper (utils.utils). Judging from the output below
# it attaches location_id, level, start_day, start_min and weekday to the
# staypoints - see its implementation for details.
sp = load_data(sp, loc)

In [159]:
sp.head()

Unnamed: 0,id,user_id,started_at,finished_at,mode,length,geometry,duration,act_duration,location_id,level,start_day,start_min,weekday
0,0,AAGAF,2019-10-09 11:30:34.141000,2019-10-10 05:43:17.674999,,0.0,POINT (7.565219252705515 47.545616383391014),1092.0,1092.0,5157106440787197952,13,0,690,2
1,1,AAGAF,2019-10-10 06:14:49.141999,2019-10-10 06:53:54.841000,Car,11615.408548,POINT (7.563759803278834 47.54794768304769),39.0,70.0,5157106337707982848,13,1,374,3
2,2,AAGAF,2019-10-10 07:03:24.426000,2019-10-10 08:18:20.864000,Car,2104.855858,POINT (7.581727914351441 47.5564374119498),74.0,84.0,5157107196701442048,13,1,423,3
3,3,AAGAF,2019-10-10 11:10:24.605999,2019-10-10 13:54:34.799339,Walk,4847.706521,POINT (7.581729846308135 47.55641482529889),164.0,336.0,5157107196701442048,13,1,670,3
4,4,AAGAF,2019-10-11 06:15:49.739000,2019-10-11 11:36:29.907000,Bus,8667.529201,POINT (7.565127647736598 47.54571219157556),320.0,1301.0,5157106440787197952,13,2,375,4


In [160]:
sp["level"].value_counts()

level
13    1078360
12       1283
11        256
10         23
Name: count, dtype: int64

In [161]:
len(sp["location_id"].unique())

14881

In [162]:
# Table of all candidate locations.
s2_loc = pd.read_csv(os.path.join("..", "data", "s2_loc_all_level10_13.csv"), index_col="id")

# keep only the locations that occur in the staypoints
visited_loc = s2_loc.loc[s2_loc["loc_id"].isin(sp["location_id"].unique())].copy()

# final cleaning
# (renumber the rows 0..n-1 and name the index "id")
visited_loc.index = np.arange(len(visited_loc))
visited_loc.index.name = "id"

In [163]:
len(visited_loc)

14881

In [164]:
visited_loc.head()

Unnamed: 0_level_0,loc_id,level,geometry,freq,area
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5152981090339651584,10,POINT (10.371321613430059 46.54838775051633),1,POLYGON ((10.405097146225662 46.50637570678295...
1,5152984577853095936,13,POINT (10.343581795559412 46.62397697413108),38,POLYGON ((10.347820052132302 46.61871685680636...
2,5152985265047863296,13,POINT (10.256708153859828 46.62211103049422),1,POLYGON ((10.260953592346382 46.61685460820135...
3,5152985299407601664,13,POINT (10.260129868678595 46.63154902483473),1,POLYGON ((10.264376293749178 46.62629143145975...
4,5152985505566031872,13,POINT (10.299321120965867 46.63774991883432),1,POLYGON ((10.303565033130825 46.63249007077394...


In [165]:
# Save the visited locations; the "id" index is written as the first column.
visited_loc.to_csv(os.path.join("..", "data", "s2_loc_visited_level10_13.csv"))