In [29]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import trackintel as ti

# Step 1: Load the dataset
df = pd.read_csv('/home/nalin/master/Y4S1/SC4020/hiroshima_challengedata.csv')

# Step 2: Filter for the first 30 days and convert to 30-minute intervals
df = df[(df['d'] >= 0) & (df['d'] <= 3)]


In [30]:
# Step 3: Upscale x and y coordinates to reflect 500m spatial resolution
df['x'] = df['x'] * 500  # Convert x to meters
df['y'] = df['y'] * 500  # Convert y to meters

In [31]:
max_y_row = df[df['y'] == df['y'].max()]
max_y_row

Unnamed: 0,uid,d,t,x,y
99155,90,1,27,6500,100000
99164,90,2,29,6000,100000
99169,90,3,13,6500,100000
132902,120,1,24,7500,100000
162156,146,1,13,40000,100000
...,...,...,...,...,...
23889369,24949,2,16,39500,100000
23889370,24949,2,26,39500,100000
23889371,24949,2,42,34000,100000
23889373,24949,3,11,34000,100000


In [32]:
# Step 3: Prepare data with essential columns and compatible geometry
# Combine x, y coordinates into shapely Points
df['geometry'] = df.apply(lambda row: Point(row['x'], row['y']), axis=1)

# Rename columns to match trackintel's expected structure
df = df.rename(columns={'uid': 'user_id', 't': 'tracked_at'})

# Convert 'tracked_at' to datetime based on days and 30-minute intervals
# Assuming each day starts at a base time
df['tracked_at'] = pd.to_datetime(df['d'] * 24 * 3600 + df['tracked_at'] * 30 * 60, unit='s')


In [33]:
# Convert to a GeoDataFrame, set CRS to projected CRS, emulating meter-based data
gdf = gpd.GeoDataFrame(df, geometry='geometry')
gdf.set_crs("EPSG:3857", inplace=True)

Unnamed: 0,user_id,d,tracked_at,x,y,geometry
0,0,0,1970-01-01 10:00:00,40000,49500,POINT (40000.000 49500.000)
1,0,0,1970-01-01 10:30:00,40500,48500,POINT (40500.000 48500.000)
2,0,0,1970-01-01 12:30:00,41500,51000,POINT (41500.000 51000.000)
3,0,0,1970-01-01 13:00:00,40000,50500,POINT (40000.000 50500.000)
4,0,0,1970-01-01 13:30:00,40000,50500,POINT (40000.000 50500.000)
...,...,...,...,...,...,...
23912738,24999,3,1970-01-04 18:30:00,29500,54000,POINT (29500.000 54000.000)
23912739,24999,3,1970-01-04 19:00:00,37500,62500,POINT (37500.000 62500.000)
23912740,24999,3,1970-01-04 19:30:00,43000,74500,POINT (43000.000 74500.000)
23912741,24999,3,1970-01-04 22:00:00,43500,74500,POINT (43500.000 74500.000)


In [None]:
# Assuming gdf has columns 'user_id', 'tracked_at', and 'geometry' as prepared in previous steps
gdf = gdf.rename(columns={'tracked_at': 'tracked_at', 'user_id': 'user_id'})  # Ensure columns align

# Directly localize tracked_at to Asia/Tokyo (JST)- trackintel requires a timezone to be specified 
gdf['tracked_at'] = pd.to_datetime(gdf['tracked_at'].dt.tz_localize('Asia/Tokyo')) 

# set this GeoDataFrame in trackintel's format directly
positionfixes, staypoints = ti.preprocessing.positionfixes.generate_staypoints(gdf, 
                                                                               method='sliding', 
                                                                               dist_threshold=500, # Adjust to 500 meters, consistent with the spatial reso
                                                                               time_threshold=5)    

print(staypoints)    

Empty GeoDataFrame
Columns: [user_id, started_at, finished_at, geometry]
Index: []




In [None]:
positionfixes

Unnamed: 0,user_id,d,tracked_at,x,y,geometry,staypoint_id
0,0,0,1970-01-01 10:00:00+09:00,40000,49500,POINT (40000.000 49500.000),
1,0,0,1970-01-01 10:30:00+09:00,40500,48500,POINT (40500.000 48500.000),
2,0,0,1970-01-01 12:30:00+09:00,41500,51000,POINT (41500.000 51000.000),
3,0,0,1970-01-01 13:00:00+09:00,40000,50500,POINT (40000.000 50500.000),
4,0,0,1970-01-01 13:30:00+09:00,40000,50500,POINT (40000.000 50500.000),
...,...,...,...,...,...,...,...
23912738,24999,3,1970-01-04 18:30:00+09:00,29500,54000,POINT (29500.000 54000.000),
23912739,24999,3,1970-01-04 19:00:00+09:00,37500,62500,POINT (37500.000 62500.000),
23912740,24999,3,1970-01-04 19:30:00+09:00,43000,74500,POINT (43000.000 74500.000),
23912741,24999,3,1970-01-04 22:00:00+09:00,43500,74500,POINT (43500.000 74500.000),


In [36]:
# Step 3: Generate Triplegs
gdf = gdf.rename(columns={'tracked_at': 'started_at'})

triplegs, positionfixes = ti.preprocessing.generate_triplegs(
    positionfixes,
    staypoints = staypoints,
    method='between_staypoints',  # Defines triplegs as movement between staypoints
    gap_threshold=30              # Maximum allowed gap in minutes
)   


AssertionError: dtype of started_at is float64 but has to be tz aware datetime64

In [None]:
# # Ensure 'tracked_at' is timezone-aware in JST
# gdf['tracked_at'] = pd.to_datetime(gdf['tracked_at']).dt.tz_convert('Asia/Tokyo')

# # Rename 'tracked_at' to 'started_at' to match trackintel's requirement for triplegs
# gdf = gdf.rename(columns={'tracked_at': 'started_at'})

# # Generate triplegs with trackintel
# triplegs, positionfixes = ti.preprocessing.triplegs.generate_triplegs(
#     positionfixes=gdf,
#     staypoints=staypoints,
#     method='between_staypoints',
#     gap_threshold=15  # Maximum allowed gap in minutes
# )

# # Check the generated triplegs
# print(triplegs.head())


KeyError: 'tracked_at'

In [None]:
triplegs

Unnamed: 0,user_id,d,tracked_at,x,y,geometry,staypoint_id,tripleg_id
0,0,0,1970-01-01 10:00:00+09:00,40000,49500,POINT (40000.00000 49500.00000),,0
1,0,0,1970-01-01 10:30:00+09:00,40500,48500,POINT (40500.00000 48500.00000),,0
2,0,0,1970-01-01 12:30:00+09:00,41500,51000,POINT (41500.00000 51000.00000),,1
3,0,0,1970-01-01 13:00:00+09:00,40000,50500,POINT (40000.00000 50500.00000),,1
4,0,0,1970-01-01 13:30:00+09:00,40000,50500,POINT (40000.00000 50500.00000),,1
...,...,...,...,...,...,...,...,...
23912668,24999,1,1970-01-02 15:30:00+09:00,40500,47000,POINT (40500.00000 47000.00000),,116308
23912669,24999,1,1970-01-02 16:00:00+09:00,30500,56000,POINT (30500.00000 56000.00000),,116308
23912670,24999,1,1970-01-02 16:30:00+09:00,41500,72000,POINT (41500.00000 72000.00000),,116308
23912671,24999,1,1970-01-02 17:00:00+09:00,42500,73000,POINT (42500.00000 73000.00000),,116308
