In [1]:
import numpy as np
import geopandas as gpd
import pandas as pd

import glob, sys, os
import datetime

from tqdm import tqdm

In [2]:
from utils.config import config

sys.path.append(os.path.join(os.getcwd(), "trackintel"))
from trackintel.analysis.tracking_quality import temporal_tracking_quality

In [3]:
def get_stps():
    stps = pd.read_csv(os.path.join(config['proc'], 'stps_act_user_50.csv'))
    
    stps.rename(columns={"user_id": "userid", "started_at": "startt", "finished_at": "endt"},inplace=True)

    stps['startt'] = pd.to_datetime(stps['startt']).dt.tz_localize(None)
    stps['endt'] = pd.to_datetime(stps['endt']).dt.tz_localize(None)
    return stps

def get_trips():
    trips = pd.read_csv(os.path.join(config['proc'], 'trips.csv'))

    trips.rename(columns={"user_id": "userid", "started_at": "startt", "finished_at": "endt"}, inplace=True)

    trips['startt'] = pd.to_datetime(trips['startt']).dt.tz_localize(None)
    trips['endt'] = pd.to_datetime(trips['endt']).dt.tz_localize(None)
    return trips

def _preprocess(df):
    df.rename(
        columns={"userid": "user_id", "startt": "started_at", "endt": "finished_at", "dur_s": "duration"}, inplace=True
    )
    return df

def _get_all_trace(stps, trips):

    stps = _preprocess(stps)
    trips = _preprocess(trips)
    print("User number:", len(stps["user_id"].unique()), len(trips["user_id"].unique()))

    # merge trips and staypoints
    print("starting merge", stps.shape, trips.shape)
    stps["type"] = "stp"
    trips["type"] = "trip"
    df_all = pd.merge(stps, trips, how="outer")
    print("finished merge", df_all.shape)
    print("*"*50)

    return df_all

stps = get_stps()
trips = get_trips()
all_trace = _get_all_trace(stps, trips)

print(len(stps['user_id'].unique()), len(trips['user_id'].unique()), len(all_trace['user_id'].unique()))

User number: 139 139
starting merge (259887, 12) (245689, 9)
finished merge (505576, 17)
**************************************************
139 139 139


# Determine the user filter
## Overall

In [4]:
# get the total quality and tracked days
total_quality = temporal_tracking_quality(all_trace, granularity="all")
total_quality['days'] = all_trace.groupby("user_id").apply(lambda x: (x['finished_at'].max() - x['started_at'].min()).days).values

total_quality.sort_values(by='quality', ascending=False, inplace=True)

# select total quality and tracked days
selected = total_quality.loc[(total_quality['days']>300) & (total_quality['quality']>0.7)].reset_index(drop=True)
# save
selected.to_csv(os.path.join(config["quality"], "SBB_user_filtered.csv"), index=False)

print(selected)

     user_id   quality  days
0       1617  0.992109   430
1       1602  0.987011   434
2       1721  0.986556   421
3       1673  0.986205   415
4       1716  0.985793   421
..       ...       ...   ...
111     1802  0.720105   406
112     1628  0.717939   415
113     1609  0.716239   421
114     1601  0.716053   342
115     1760  0.701097   415

[116 rows x 3 columns]


## Sliding window based
Ensure high tracking quality throughout the study

In [5]:
def filter_user(df):
    consider = df.loc[df['quality']!=0]
    if consider['quality'].min() > 0.6:
        return df

def getTrackingQuality(df, window_size):

    weeks = (df['finished_at'].max() - df['started_at'].min()).days // 7
    start_date = df['started_at'].min().date()

    quality_list = []
    # construct the sliding week gdf
    for i in range(0, weeks-window_size):
        curr_start = datetime.datetime.combine(start_date + datetime.timedelta(weeks=i), datetime.time())
        curr_end  = datetime.datetime.combine(curr_start + datetime.timedelta(weeks=window_size), datetime.time())

        # the total df for this time window
        cAll_gdf = df.loc[(df['started_at'] >= curr_start) & (df['finished_at'] < curr_end)]
        if cAll_gdf.shape[0] == 0:
            continue
        total_sec = (curr_end-curr_start).total_seconds()

        quality_list.append([i, cAll_gdf['duration'].sum()/total_sec])
    ret = pd.DataFrame(quality_list, columns=['timestep','quality'])
    ret["user_id"] = df["user_id"].unique()[0]
    return ret

sliding = all_trace.groupby("user_id").apply(getTrackingQuality, window_size=10).reset_index(drop=True)

# use selected as a filter 
sliding = sliding.loc[sliding['user_id'].isin(selected['user_id'].unique())]


slide_user = sliding.groupby("user_id").apply(filter_user).reset_index(drop=True).dropna()
slide_user = slide_user.groupby("user_id", as_index=False)["quality"].mean()

# save
slide_user.to_csv(os.path.join(config["quality"], "SBB_user_window_filtered.csv"), index=False)

print(slide_user)

    user_id   quality
0    1596.0  0.973634
1    1597.0  0.966084
2    1602.0  0.978700
3    1605.0  0.922213
4    1606.0  0.959640
..      ...       ...
88   1805.0  0.777991
89   1810.0  0.872272
90   1812.0  0.958226
91   1848.0  0.954729
92   1934.0  0.870983

[93 rows x 2 columns]
