In [46]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import trackintel as ti

# Step 1: Load the dataset
df = pd.read_csv('/home/nalin/master/Y4S1/SC4020/hiroshima_challengedata.csv')

# Step 2: Filter for the first 30 days and convert to 30-minute intervals
df = df[(df['d'] >= 0) & (df['d'] <= 5)] 


  df = pd.read_csv('/home/nalin/master/Y4S1/SC4020/hiroshima_challengedata.csv')


In [47]:
# Step 3: Upscale x and y coordinates to reflect 500m spatial resolution
df['x'] = df['x'] * 500  # Convert x to meters
df['y'] = df['y'] * 500  # Convert y to meters

In [48]:
max_y_row = df[df['y'] == df['y'].max()]
max_y_row

Unnamed: 0,uid,d,t,x,y,Unnamed: 5,Unnamed: 6,x.1,y.1,X,Unnamed: 10
26837,23,5,22,12000,100000,,,24,200,,
26838,23,5,27,12000,100000,,,24,200,,
26839,23,5,29,12000,100000,,,24,200,,
99155,90,1,27,6500,100000,,,13,200,,
99164,90,2,29,6000,100000,,,12,200,,
...,...,...,...,...,...,...,...,...,...,...,...
1022030,921,5,18,69000,100000,,,138,200,,
1022034,921,5,23,69000,100000,,,138,200,,
1033397,931,0,22,84500,100000,,,169,200,,
1033398,931,0,24,84000,100000,,,168,200,,


In [49]:
# Step 3: Prepare data with essential columns and compatible geometry
# Combine x, y coordinates into shapely Points
df['geometry'] = df.apply(lambda row: Point(row['x'], row['y']), axis=1)

# Rename columns to match trackintel's expected structure
df = df.rename(columns={'uid': 'user_id', 't': 'tracked_at'})

# Convert 'tracked_at' to datetime based on days and 30-minute intervals
# Assuming each day starts at a base time
df['tracked_at'] = pd.to_datetime(df['d'] * 24 * 3600 + df['tracked_at'] * 30 * 60, unit='s')


In [50]:
# Convert to a GeoDataFrame, set CRS to projected CRS, emulating meter-based data
gdf = gpd.GeoDataFrame(df, geometry='geometry')
gdf.set_crs("EPSG:3857", inplace=True)

Unnamed: 0,user_id,d,tracked_at,x,y,Unnamed: 5,Unnamed: 6,x.1,y.1,X,Unnamed: 10,geometry
0,0,0,1970-01-01 10:00:00,40000,49500,10:00:00 AM,,80,99,40000.0,49500.0,POINT (40000.000 49500.000)
1,0,0,1970-01-01 10:30:00,40500,48500,10:30:00 AM,,81,97,40500.0,48500.0,POINT (40500.000 48500.000)
2,0,0,1970-01-01 12:30:00,41500,51000,12:30:00 PM,,83,102,41500.0,51000.0,POINT (41500.000 51000.000)
3,0,0,1970-01-01 13:00:00,40000,50500,,,80,101,40000.0,50500.0,POINT (40000.000 50500.000)
4,0,0,1970-01-01 13:30:00,40000,50500,01:30:00 PM,,80,101,40000.0,50500.0,POINT (40000.000 50500.000)
...,...,...,...,...,...,...,...,...,...,...,...,...
1047711,946,5,1970-01-06 15:00:00,44500,46500,,,89,93,,,POINT (44500.000 46500.000)
1047712,946,5,1970-01-06 15:30:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000)
1047713,946,5,1970-01-06 16:00:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000)
1047714,946,5,1970-01-06 16:30:00,46500,47500,,,93,95,,,POINT (46500.000 47500.000)


In [51]:
# Assuming gdf has columns 'user_id', 'tracked_at', and 'geometry' as prepared in previous steps
gdf = gdf.rename(columns={'tracked_at': 'tracked_at', 'user_id': 'user_id'})  # Ensure columns align

# Directly localize tracked_at to Asia/Tokyo (JST)- trackintel requires a timezone to be specified 
gdf['tracked_at'] = pd.to_datetime(gdf['tracked_at'].dt.tz_localize('Asia/Tokyo')) 

In [52]:
gdf

Unnamed: 0,user_id,d,tracked_at,x,y,Unnamed: 5,Unnamed: 6,x.1,y.1,X,Unnamed: 10,geometry
0,0,0,1970-01-01 10:00:00+09:00,40000,49500,10:00:00 AM,,80,99,40000.0,49500.0,POINT (40000.000 49500.000)
1,0,0,1970-01-01 10:30:00+09:00,40500,48500,10:30:00 AM,,81,97,40500.0,48500.0,POINT (40500.000 48500.000)
2,0,0,1970-01-01 12:30:00+09:00,41500,51000,12:30:00 PM,,83,102,41500.0,51000.0,POINT (41500.000 51000.000)
3,0,0,1970-01-01 13:00:00+09:00,40000,50500,,,80,101,40000.0,50500.0,POINT (40000.000 50500.000)
4,0,0,1970-01-01 13:30:00+09:00,40000,50500,01:30:00 PM,,80,101,40000.0,50500.0,POINT (40000.000 50500.000)
...,...,...,...,...,...,...,...,...,...,...,...,...
1047711,946,5,1970-01-06 15:00:00+09:00,44500,46500,,,89,93,,,POINT (44500.000 46500.000)
1047712,946,5,1970-01-06 15:30:00+09:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000)
1047713,946,5,1970-01-06 16:00:00+09:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000)
1047714,946,5,1970-01-06 16:30:00+09:00,46500,47500,,,93,95,,,POINT (46500.000 47500.000)


In [53]:
# set this GeoDataFrame in trackintel's format directly
positionfixes, staypoints = ti.preprocessing.positionfixes.generate_staypoints(gdf)
                                                                            #    method='sliding', 
                                                                            #    dist_threshold=100, # Adjust to 500 meters, consistent with the spatial reso
                                                                            #    time_threshold=5)    

print(staypoints)    

Empty GeoDataFrame
Columns: [user_id, started_at, finished_at, geometry]
Index: []




In [54]:
positionfixes

Unnamed: 0,user_id,d,tracked_at,x,y,Unnamed: 5,Unnamed: 6,x.1,y.1,X,Unnamed: 10,geometry,staypoint_id
0,0,0,1970-01-01 10:00:00+09:00,40000,49500,10:00:00 AM,,80,99,40000.0,49500.0,POINT (40000.000 49500.000),
1,0,0,1970-01-01 10:30:00+09:00,40500,48500,10:30:00 AM,,81,97,40500.0,48500.0,POINT (40500.000 48500.000),
2,0,0,1970-01-01 12:30:00+09:00,41500,51000,12:30:00 PM,,83,102,41500.0,51000.0,POINT (41500.000 51000.000),
3,0,0,1970-01-01 13:00:00+09:00,40000,50500,,,80,101,40000.0,50500.0,POINT (40000.000 50500.000),
4,0,0,1970-01-01 13:30:00+09:00,40000,50500,01:30:00 PM,,80,101,40000.0,50500.0,POINT (40000.000 50500.000),
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1047711,946,5,1970-01-06 15:00:00+09:00,44500,46500,,,89,93,,,POINT (44500.000 46500.000),
1047712,946,5,1970-01-06 15:30:00+09:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000),
1047713,946,5,1970-01-06 16:00:00+09:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000),
1047714,946,5,1970-01-06 16:30:00+09:00,46500,47500,,,93,95,,,POINT (46500.000 47500.000),


In [55]:
# Step 3: Generate Triplegs
gdf = gdf.rename(columns={'tracked_at': 'started_at'})

positionfixes, triplegs  = ti.preprocessing.generate_triplegs(
    positionfixes,
    # staypoints = staypoints,
    method='between_staypoints',  # Defines triplegs as movement between staypoints
    gap_threshold=30              # Maximum allowed gap in minutes
)   


  pfs["tripleg_id"] = pfs["tripleg_id"].ffill()


In [56]:
# # Ensure 'tracked_at' is timezone-aware in JST
# gdf['tracked_at'] = pd.to_datetime(gdf['tracked_at']).dt.tz_convert('Asia/Tokyo')

# # Rename 'tracked_at' to 'started_at' to match trackintel's requirement for triplegs
# gdf = gdf.rename(columns={'tracked_at': 'started_at'})

# # Generate triplegs with trackintel
# triplegs, positionfixes = ti.preprocessing.triplegs.generate_triplegs(
#     positionfixes=gdf,
#     staypoints=staypoints,
#     method='between_staypoints',
#     gap_threshold=15  # Maximum allowed gap in minutes
# )

# # Check the generated triplegs
# print(triplegs.head())


In [57]:
positionfixes

Unnamed: 0,user_id,d,tracked_at,x,y,Unnamed: 5,Unnamed: 6,x.1,y.1,X,Unnamed: 10,geometry,staypoint_id,tripleg_id
0,0,0,1970-01-01 10:00:00+09:00,40000,49500,10:00:00 AM,,80,99,40000.0,49500.0,POINT (40000.000 49500.000),,0
1,0,0,1970-01-01 10:30:00+09:00,40500,48500,10:30:00 AM,,81,97,40500.0,48500.0,POINT (40500.000 48500.000),,0
2,0,0,1970-01-01 12:30:00+09:00,41500,51000,12:30:00 PM,,83,102,41500.0,51000.0,POINT (41500.000 51000.000),,1
3,0,0,1970-01-01 13:00:00+09:00,40000,50500,,,80,101,40000.0,50500.0,POINT (40000.000 50500.000),,1
4,0,0,1970-01-01 13:30:00+09:00,40000,50500,01:30:00 PM,,80,101,40000.0,50500.0,POINT (40000.000 50500.000),,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1047711,946,5,1970-01-06 15:00:00+09:00,44500,46500,,,89,93,,,POINT (44500.000 46500.000),,15437
1047712,946,5,1970-01-06 15:30:00+09:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000),,15437
1047713,946,5,1970-01-06 16:00:00+09:00,42000,47000,,,84,94,,,POINT (42000.000 47000.000),,15437
1047714,946,5,1970-01-06 16:30:00+09:00,46500,47500,,,93,95,,,POINT (46500.000 47500.000),,15437


In [58]:
triplegs

Unnamed: 0_level_0,user_id,started_at,finished_at,geom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,1970-01-01 10:00:00+09:00,1970-01-01 10:30:00+09:00,"LINESTRING (40000.000 49500.000, 40500.000 485..."
1,0,1970-01-01 12:30:00+09:00,1970-01-01 13:30:00+09:00,"LINESTRING (41500.000 51000.000, 40000.000 505..."
3,0,1970-01-01 18:30:00+09:00,1970-01-01 19:30:00+09:00,"LINESTRING (40000.000 50500.000, 40000.000 510..."
4,0,1970-01-01 21:30:00+09:00,1970-01-01 22:00:00+09:00,"LINESTRING (40000.000 48500.000, 40000.000 505..."
5,0,1970-01-02 14:30:00+09:00,1970-01-02 15:00:00+09:00,"LINESTRING (40000.000 50500.000, 40000.000 500..."
...,...,...,...,...
15433,946,1970-01-04 16:30:00+09:00,1970-01-04 18:30:00+09:00,"LINESTRING (40000.000 46500.000, 40000.000 465..."
15434,946,1970-01-05 06:30:00+09:00,1970-01-05 18:30:00+09:00,"LINESTRING (48000.000 44000.000, 43000.000 470..."
15435,946,1970-01-06 07:30:00+09:00,1970-01-06 08:30:00+09:00,"LINESTRING (47000.000 43000.000, 42500.000 395..."
15436,946,1970-01-06 10:30:00+09:00,1970-01-06 13:00:00+09:00,"LINESTRING (38000.000 34000.000, 38500.000 360..."


In [76]:
# Extract sequences of (x, y) coordinates from triplegs
tripleg_sequences = []

for idx, row in triplegs.iterrows():
    # Extract the LINESTRING geometry
    linestring = row['geom']
    
    # Get the list of (x, y) coordinates
    coords = list(linestring.coords)
    
    # Optional: Split long triplegs into shorter sub-triplegs
    max_length = 10  # Define maximum length of a tripleg sequence
    for i in range(0, len(coords), max_length):
        sub_coords = coords[i:i+max_length]
        if len(sub_coords) > 1:
            tripleg_sequences.append({
                'user_id': row['user_id'],
                'sequence': sub_coords
            })

# Convert to DataFrame
tripleg_sequences_df = pd.DataFrame(tripleg_sequences)


In [77]:
tripleg_sequences_df

Unnamed: 0,user_id,sequence
0,0,"[(40000.0, 49500.0), (40500.0, 48500.0)]"
1,0,"[(41500.0, 51000.0), (40000.0, 50500.0), (4000..."
2,0,"[(40000.0, 50500.0), (40000.0, 51000.0), (3950..."
3,0,"[(40000.0, 48500.0), (40000.0, 50500.0)]"
4,0,"[(40000.0, 50500.0), (40000.0, 50000.0)]"
...,...,...
15050,946,"[(40000.0, 46500.0), (40000.0, 46000.0), (4000..."
15051,946,"[(40000.0, 46500.0), (40000.0, 46500.0), (4000..."
15052,946,"[(47000.0, 43000.0), (42500.0, 39500.0), (3850..."
15053,946,"[(38000.0, 34000.0), (38500.0, 36000.0), (3850..."


In [78]:
from collections import defaultdict

# Group sequences by user
user_sequences = defaultdict(list)

for idx, row in tripleg_sequences_df.iterrows():
    user_id = row['user_id']
    sequence = row['sequence']
    # Convert coordinates to strings to use as items
    sequence_str = [str(coord) for coord in sequence]
    user_sequences[user_id].append(sequence_str)

# Prepare sequences for GSP
gsp_sequences = []

for user_id, sequences in user_sequences.items():
    # Flatten the sequences for each user into a single sequence
    user_sequence = []
    for seq in sequences:
        # Each seq is a list of coordinate strings
        user_sequence.extend(seq)
    gsp_sequences.append(user_sequence)

# Now gsp_sequences is a list of sequences, each sequence is a list of hashable items (strings)


In [79]:
gsp_sequences

[['(40000.0, 49500.0)',
  '(40500.0, 48500.0)',
  '(41500.0, 51000.0)',
  '(40000.0, 50500.0)',
  '(40000.0, 50500.0)',
  '(40000.0, 50500.0)',
  '(40000.0, 51000.0)',
  '(39500.0, 48000.0)',
  '(40000.0, 48500.0)',
  '(40000.0, 50500.0)',
  '(40000.0, 50500.0)',
  '(40000.0, 50000.0)',
  '(41000.0, 51000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(46000.0, 54500.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54500.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(39000.0, 49000.0)',
  '(38500.0, 48000.0)',
  '(39000.0, 49500.0)',
  '(40000.0, 50500.0)',
  '(45500.0, 54500.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 54000.0)',
  '(45500.0, 54500.0)',
  '(45500.0, 54500.0)',
  '(45500.0, 54500.0)',
  '(45000.0, 54000.0)',
  '(45500.0, 54500.0)',
  '(45500.0, 54500.0)',
  '(45000.0, 54000.0)',
  '(45000.0, 540

In [80]:
from pymining import seqmining


# Set minimum support (number of users a pattern must appear in)
min_support = 2  # Adjust based on your dataset size

# Run the GSP algorithm
freq_seqs = seqmining.freq_seq_enum(gsp_sequences, min_support)

# Convert results to a list
freq_seqs = list(freq_seqs)

: 

In [None]:
freq_seqs

In [None]:
for seq, support in freq_seqs:
    # Convert items back to coordinates
    coord_seq = [eval(item[0]) for item in seq]  # item[0] because each itemset contains one coordinate as a string
    print(f"Sequence: {coord_seq}, Support: {support}")

SyntaxError: '(' was never closed (<string>, line 1)