In [11]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import sys
from shapely import wkt
from tqdm import tqdm
import itertools
import datetime


from config import config

sys.path.append(os.path.join(os.getcwd(), "trackintel"))
import trackintel as ti

# data size

In [3]:
selected_user = pd.read_csv(os.path.join(config["results"], "SBB_user_window_filtered.csv"))["user_id"].unique()
print(selected_user.shape)

(93,)


In [15]:
tpls = pd.read_csv(os.path.join(config['S_raw2'], 'tpls.csv'))
tpls = tpls.loc[tpls['user_id'].isin(selected_user)]
print("Number of triplegs: {}".format(len(tpls["id"].unique())))

Number of triplegs: 344740


In [18]:
locs = pd.read_csv(os.path.join(config['S_proc'], 'locs_user_50.csv'))
locs = locs.loc[locs['userid'].isin(selected_user)]
print("Number of locations: {}".format(len(locs["id"].unique())))

stps_locs = pd.read_csv(os.path.join(config['S_proc'], 'stps_act_user_50.csv'))
stps_locs = stps_locs.loc[stps_locs['userid'].isin(selected_user)]
print("Number of locations: {}".format(len(stps_locs["locid"].unique())))

Number of locations: 46489
Number of locations: 46489


In [20]:
stps = pd.read_csv(os.path.join(config['S_raw2'], 'stps.csv'))
stps = stps.loc[stps['user_id'].isin(selected_user)]
print("Number of staypoints: {}".format(len(stps["id"].unique())))

Number of staypoints: 193637


In [22]:
trips = pd.read_csv(os.path.join(config['S_proc'], 'trips.csv'))
trips = trips.loc[trips['userid'].isin(selected_user)]
print("Number of trips: {}".format(len(trips["id"].unique())))

Number of trips: 181479


In [12]:
def get_stps():
    
    
    stps.rename(columns={"user_id": "userid", "started_at": "startt", "finished_at": "endt"},inplace=True)

    stps['startt'] = pd.to_datetime(stps['startt']).dt.tz_localize(None)
    stps['endt'] = pd.to_datetime(stps['endt']).dt.tz_localize(None)
    return stps

def get_trips():
    

    trips.rename(columns={"user_id": "userid", "started_at": "startt", "finished_at": "endt"}, inplace=True)

    trips['startt'] = pd.to_datetime(trips['startt']).dt.tz_localize(None)
    trips['endt'] = pd.to_datetime(trips['endt']).dt.tz_localize(None)
    return trips

locs = pd.read_csv(os.path.join(config['S_proc'], 'stps_act_user_50.csv'))
trips = pd.read_csv(os.path.join(config['S_proc'], 'trips.csv'))



In [11]:
print("Number of locations: {}".format(len(stps["locid"].unique())))
print("Number of trips: {}".format(len(trips["id"].unique())))


Number of locations: 63133
Number of trips: 245689


# intermodal trips


In [22]:
trips_df = pd.read_csv(os.path.join(config["S_proc"], "trips_forMainMode.csv"))

In [23]:
from joblib import Parallel, delayed
import multiprocessing

def applyParallel(dfGrouped, func):
    # multiprocessing.cpu_count()
    retLst = Parallel(n_jobs=multiprocessing.cpu_count())(delayed(func)(group) for _, group in tqdm(dfGrouped))
    return pd.concat(retLst)

def get_unique_mode_distance(df):
    df = df.iloc[0]
    modes = " ".join(df["mode_len"][1:-1].split()).split(" ")
    dist = " ".join(df["mode_lenProp"][1:-1].split()).split(" ")
    mode_df = pd.DataFrame([modes, dist]).T
    mode_df.columns=["Mode","Distance"]
    mode_df["Distance"] = mode_df["Distance"].astype("float")
    
    top2 = mode_df.groupby("Mode", as_index=False).sum().sort_values(by="Distance", ascending=False).head(2)
    res_dict = {}
    res_dict["id"] = df["id"]
    res_dict["mode_1"] = top2["Mode"].values[0]
    res_dict["modeProp_1"] = top2["Distance"].values[0]
    if len(top2) == 2:
        res_dict["mode_2"] = top2["Mode"].values[1]
        res_dict["modeProp_2"] = top2["Distance"].values[1]

    return pd.Series(res_dict).to_frame().T

# tqdm.pandas(desc="Extract mode")
# extracted_df = trips_df.progress_apply(get_unique_mode_distance, axis=1)
extracted_df = applyParallel(trips_df.groupby("id"), get_unique_mode_distance)
extracted_df.index = np.arange(len(extracted_df))
extracted_df = extracted_df.loc[(extracted_df["mode_1"] != "''") & (extracted_df["mode_2"] != "''")]

100%|████████████████████████████████████████████████████████████████████████| 245689/245689 [03:52<00:00, 1054.68it/s]


In [24]:
extracted_df

Unnamed: 0,id,mode_1,modeProp_1,mode_2,modeProp_2
0,0,'Mode::Train',0.936252,'Mode::Walk',0.0637482
1,1,'Mode::Train',0.944693,'Mode::Walk',0.0553069
2,2,'Mode::Walk',1,,
3,3,'Mode::Walk',1,,
4,4,'Mode::Car',1,,
...,...,...,...,...,...
245684,245684,'Mode::Car',0.794674,'Mode::Walk',0.205326
245685,245685,'Mode::Walk',0.573189,'Mode::Train',0.339477
245686,245686,'Mode::Car',1,,
245687,245687,'Mode::Car',1,,


In [34]:
inter_trip = extracted_df.loc[~extracted_df['mode_2'].isna()]

inter_trip = trips_df.loc[trips_df["id"].isin(inter_trip["id"])]
print(inter_trip.shape[0]/extracted_df.shape[0] *100)
inter_trip = inter_trip[["id", "mode_ls", "userid"]]

29.4221556520642


In [35]:
def _ifOnlyWalk(row):
    modes = set(row.split(","))
    walkExist = "Mode::Walk" in modes
    length = len(modes) == 1
    return walkExist & length
def _encode(ori_str, mode_dict):
    for mode, value in mode_dict.items():
        ori_str = ori_str.replace(mode, value)
    # join adjacent same mode
    return "".join(i for i, _ in itertools.groupby(ori_str.replace(",", "")))

mode_dict = {
    "Mode::Airplane": "",
    "Mode::Bicycle": "b",
    "Mode::Boat": "d",
    "Mode::Bus": "d",
    "Mode::Car": "e",
    "Mode::Coach": "f",
    "Mode::Ebicycle": "g",
    "Mode::Ecar": "h",
    "Mode::Ski": "",
    "Mode::Train": "i",
    "Mode::Tram": "j",
    "Mode::Walk": "",
}

ifOnlyWalk = inter_trip["mode_ls"].apply(_ifOnlyWalk)
inter_trip.loc[ifOnlyWalk, "mode"] = "k"
inter_trip.loc[~ifOnlyWalk, "mode"] = [_encode(i, mode_dict) for i in inter_trip.loc[~ifOnlyWalk, "mode_ls"].to_list()]

In [41]:
modeNum = inter_trip["mode"].apply(lambda x: len(x))
sum(modeNum==1)/len(inter_trip)

0.6449845753731653

## Change of top1 loc
As indicator of home change

In [3]:
def get_stps():
    stps = pd.read_csv(os.path.join(config['S_proc'], 'stps_act_user_50.csv'))
    
    stps.rename(columns={"user_id": "userid", "started_at": "startt", "finished_at": "endt"},inplace=True)

    stps['startt'] = pd.to_datetime(stps['startt']).dt.tz_localize(None)
    stps['endt'] = pd.to_datetime(stps['endt']).dt.tz_localize(None)
    return stps

stps = get_stps()
valid_user = pd.read_csv(config["results"] + "\\SBB_user_window_filtered.csv")["user_id"].unique()
valid_user = valid_user.astype(int)
stps = stps.loc[stps["userid"].isin(valid_user)]

print(len(stps['userid'].unique()))

93


In [4]:

def get_top1_changed_flag(df):
    window_size = 10

    weeks = (df["endt"].max() - df["startt"].min()).days // 7
    start_date = df["startt"].min().date()
    
    top1_ls = []
    for i in range(0, weeks - window_size + 1):
        curr_start = datetime.datetime.combine(start_date + datetime.timedelta(weeks=i), datetime.time())
        curr_end = datetime.datetime.combine(curr_start + datetime.timedelta(weeks=window_size), datetime.time())

        # current trip
        curr = df.loc[(df["startt"] >= curr_start) & (df["endt"] < curr_end)]
        
        top1_loc = curr.groupby("locid")["duration"].sum().sort_values(ascending=False).head(1).index.values[0]
        top1_ls.append(top1_loc)
    
    if np.unique(top1_ls).shape[0] == 1:
        return False
    else:
        return True

change_df = stps.groupby("userid").apply(get_top1_changed_flag)

In [5]:
changed_user = change_df.loc[change_df==True]

In [19]:
import pickle
def get_top_loc(df):
    window_size = 5

    weeks = (df["endt"].max() - df["startt"].min()).days // 7
    start_date = df["startt"].min().date()
    
    top_ls = []
    for i in range(0, weeks - window_size + 1):
        curr_start = datetime.datetime.combine(start_date + datetime.timedelta(weeks=i), datetime.time())
        curr_end = datetime.datetime.combine(curr_start + datetime.timedelta(weeks=window_size), datetime.time())

        # current trip
        curr = df.loc[(df["startt"] >= curr_start) & (df["endt"] < curr_end)]
        
        top_ls.extend(curr.groupby("locid")["duration"].sum().sort_values(ascending=False).head(1).index.values)
    return top_ls

changed_user = [1651, 1650, 1631, 1630, 1624, 1620, 1608]
user_dict = {}
for user in changed_user:
    print(user)
    user_stps = stps.loc[stps["userid"] == user]
    top_loc_ls = get_top_loc(user_stps)
    
    change_time = np.diff(top_loc_ls,n=1)
    print(top_loc_ls)
    change_time = change_time !=0
    user_dict[user] = change_time
    
    top_loc = np.unique(top_loc_ls)
    
    selected = stps.loc[stps["locid"].isin(top_loc)]
#     print(selected)
#     selected.to_csv(os.path.join(config['S_vis'], f'{user}_stps.csv'))
#     selected['geom'] = selected['geom'].apply(wkt.loads)
#     gdf = gpd.GeoDataFrame(selected, geometry='geom')
#     gdf.set_crs("EPSG:4326", inplace=True)
#     gdf["startt"] = gdf["startt"].astype(str)
#     gdf["endt"] = gdf["endt"].astype(str)
#     gdf.to_file(os.path.join(config['S_vis'], f'{user}_stps.shp'))
#     break
#     print("*"*50)
#     print(user_stps)

print(user_dict)
# with open("home_change.pkl", "wb") as f:
#     pickle.dump(user_dict, f, pickle.HIGHEST_PROTOCOL)

1651
[32894, 32894, 32894, 32894, 32894, 32894, 32894, 32894, 32894, 32894, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905, 32905]
1650
[17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17294, 17560, 17560, 17560, 17560, 17560, 17560, 17560, 17560, 17749, 17749, 17749, 17749, 17749, 17749, 17749]
1631
[13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13042, 13042, 13042, 13042, 13042, 13038, 13038, 13038, 13038, 13038, 13038, 13038, 13

## Extract typical user

In [16]:
def get_stps():
    stps = pd.read_csv(os.path.join(config['S_proc'], 'stps_act_user_50.csv'))
    
    stps.rename(columns={"user_id": "userid", "started_at": "startt", "finished_at": "endt"},inplace=True)

    stps['startt'] = pd.to_datetime(stps['startt']).dt.tz_localize(None)
    stps['endt'] = pd.to_datetime(stps['endt']).dt.tz_localize(None)
    return stps

def get_trips():
    trips = pd.read_csv(os.path.join(config['S_proc'], 'trips.csv'))

    trips.rename(columns={"user_id": "userid", "started_at": "startt", "finished_at": "endt"}, inplace=True)

    trips['startt'] = pd.to_datetime(trips['startt']).dt.tz_localize(None)
    trips['endt'] = pd.to_datetime(trips['endt']).dt.tz_localize(None)
    return trips


stps = get_stps()
trips = get_trips()

# valid_user = pd.read_csv(config["results"] + "\\SBB_user_window_filtered.csv")["user_id"].unique()
# valid_user = valid_user.astype(int)
# stps = stps.loc[stps["userid"].isin(valid_user)]
# trips = trips.loc[trips["userid"].isin(valid_user)]
user = 1659
stps = stps.loc[stps["userid"] == user]
trips = trips.loc[trips["userid"] == user]

print(len(stps['userid'].unique()), len(trips['userid'].unique()))

1 1


In [17]:
def get_time_step(df):
    window_size = 5

    weeks = (df["endt"].max() - df["startt"].min()).days // 7
    start_date = df["startt"].min().date()
    
    df["timestep"] = 0
    top_ls = []
    for i in range(0, weeks - window_size + 1):
        curr_start = datetime.datetime.combine(start_date + datetime.timedelta(weeks=i), datetime.time())
        curr_end = datetime.datetime.combine(curr_start + datetime.timedelta(weeks=window_size), datetime.time())

        df.loc[(df["startt"] >= curr_start) & (df["endt"] < curr_end), "timestep"] = i
        
    return df

stps = get_time_step(stps)
trips = get_time_step(trips)

stps = stps.loc[stps["timestep"]>45]
trips = trips.loc[trips["timestep"]>45]

In [19]:
def to_geopandas(df):
    df['geom'] = df['geom'].apply(wkt.loads)
    gdf = gpd.GeoDataFrame(df, geometry='geom')
    gdf.set_crs("EPSG:4326", inplace=True)
    gdf["startt"] = gdf["startt"].astype(str)
    gdf["endt"] = gdf["endt"].astype(str)

    return gdf

stps.to_csv(os.path.join(config['S_vis'], f'{user}_stps.csv'))
trips.to_csv(os.path.join(config['S_vis'], f'{user}_trips.csv'))

to_geopandas(stps).to_file(os.path.join(config['S_vis'], f'{user}_stps.shp'))
# to_geopandas(trips).to_file(os.path.join(config['S_vis'], f'{user}_trips.shp'))


TypeError: Only str is accepted.