In [12]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import trackintel as ti

# Step 1: Load the dataset
df = pd.read_csv('data/cityB_challengedata.csv')

# Step 2: Keep only the rows whose day index `d` lies in 0..5 (both ends included)
# NOTE(review): an earlier comment here spoke of "the first 30 days" and a
# 30-minute conversion; the filter below keeps six days and converts nothing —
# confirm which window is intended.
day_in_window = df['d'].between(0, 5)
df = df[day_in_window]


DEBUG:__init__:matplotlib data path: /opt/anaconda3/lib/python3.12/site-packages/matplotlib/mpl-data
DEBUG:__init__:CONFIGDIR=/Users/meredydd/.matplotlib
DEBUG:__init__:interactive is False
DEBUG:__init__:platform is darwin
DEBUG:__init__:CACHEDIR=/Users/meredydd/.matplotlib
DEBUG:font_manager:Using fontManager instance from /Users/meredydd/.matplotlib/fontlist-v390.json


In [14]:
# Step 3: Upscale x and y coordinates to reflect 500m spatial resolution
# Both columns are multiplied by the same cell size, so they share one loop.
# Re-running this cell multiplies the already-scaled values again.
cell_size_m = 500
for axis in ('x', 'y'):
    df[axis] = df[axis] * cell_size_m  # Convert to meters

In [16]:
# Select every row whose y equals the largest y value in the frame
y_max = df['y'].max()
max_y_row = df.loc[df['y'] == y_max]
max_y_row

Unnamed: 0,uid,d,t,x,y
26837,23,5,22,12000,100000
26838,23,5,27,12000,100000
26839,23,5,29,12000,100000
99155,90,1,27,6500,100000
99164,90,2,29,6000,100000
...,...,...,...,...,...
23889384,24949,4,23,34000,100000
23889388,24949,5,18,34000,100000
23889389,24949,5,31,34000,100000
23889390,24949,5,33,34000,100000


In [18]:
# Prepare data with essential columns and compatible geometry
# Build the shapely Points from the x/y columns in a single vectorized call.
# A row-wise df.apply(..., axis=1) produces the same points but invokes a
# Python function once per row, which is very slow on a frame of this size.
df['geometry'] = gpd.points_from_xy(df['x'], df['y'])

# Rename columns to match trackintel's expected structure
df = df.rename(columns={'uid': 'user_id', 't': 'tracked_at'})

# Convert 'tracked_at' to datetime based on days and 30-minute intervals.
# The value is built as seconds: d whole days plus tracked_at half-hour slots.
# With unit='s' the result counts from the Unix epoch, so day 0 / slot 0 maps to
# 1970-01-01 00:00:00 (a synthetic base time, not a real calendar date).
# This cell is not re-runnable as-is: after the first run 'tracked_at' already
# holds datetimes and the arithmetic below no longer applies.
df['tracked_at'] = pd.to_datetime(df['d'] * 24 * 3600 + df['tracked_at'] * 30 * 60, unit='s')


In [19]:
# Wrap the frame in a GeoDataFrame and tag its geometry column with a projected
# CRS (EPSG:3857) so the coordinates are treated as meter-based values.
gdf = gpd.GeoDataFrame(df, geometry='geometry', crs="EPSG:3857")
gdf

Unnamed: 0,user_id,d,tracked_at,x,y,geometry
0,0,0,1970-01-01 10:00:00,40000,49500,POINT (40000.000 49500.000)
1,0,0,1970-01-01 10:30:00,40500,48500,POINT (40500.000 48500.000)
2,0,0,1970-01-01 12:30:00,41500,51000,POINT (41500.000 51000.000)
3,0,0,1970-01-01 13:00:00,40000,50500,POINT (40000.000 50500.000)
4,0,0,1970-01-01 13:30:00,40000,50500,POINT (40000.000 50500.000)
...,...,...,...,...,...,...
23912790,24999,5,1970-01-06 16:00:00,31500,50000,POINT (31500.000 50000.000)
23912791,24999,5,1970-01-06 16:30:00,30500,50500,POINT (30500.000 50500.000)
23912792,24999,5,1970-01-06 17:00:00,31500,56000,POINT (31500.000 56000.000)
23912793,24999,5,1970-01-06 17:30:00,41000,69500,POINT (41000.000 69500.000)


In [20]:
# gdf is expected to carry 'user_id', 'tracked_at' and 'geometry' from the previous steps.

# trackintel requires a timezone to be specified, so attach Asia/Tokyo (JST) to
# the naive timestamps. tz_localize keeps the wall-clock values and only adds the
# offset. The guard keeps this cell re-runnable: calling tz_localize on a column
# that is already timezone-aware raises a TypeError.
if gdf['tracked_at'].dt.tz is None:
    gdf['tracked_at'] = gdf['tracked_at'].dt.tz_localize('Asia/Tokyo')

In [21]:
# Display gdf (pandas truncates the rendering of large frames)
gdf

Unnamed: 0,user_id,d,tracked_at,x,y,geometry
0,0,0,1970-01-01 10:00:00+09:00,40000,49500,POINT (40000.000 49500.000)
1,0,0,1970-01-01 10:30:00+09:00,40500,48500,POINT (40500.000 48500.000)
2,0,0,1970-01-01 12:30:00+09:00,41500,51000,POINT (41500.000 51000.000)
3,0,0,1970-01-01 13:00:00+09:00,40000,50500,POINT (40000.000 50500.000)
4,0,0,1970-01-01 13:30:00+09:00,40000,50500,POINT (40000.000 50500.000)
...,...,...,...,...,...,...
23912790,24999,5,1970-01-06 16:00:00+09:00,31500,50000,POINT (31500.000 50000.000)
23912791,24999,5,1970-01-06 16:30:00+09:00,30500,50500,POINT (30500.000 50500.000)
23912792,24999,5,1970-01-06 17:00:00+09:00,31500,56000,POINT (31500.000 56000.000)
23912793,24999,5,1970-01-06 17:30:00+09:00,41000,69500,POINT (41000.000 69500.000)


In [109]:
# One (x, y) tuple per distinct geometry, in the order unique() yields them
coord_list = [(point.x, point.y) for point in gdf["geometry"].unique()]

# Sanity check: print only the first entry together with its index
if coord_list:
    print(0, coord_list[0])

0 (40000.0, 49500.0)


In [23]:
# Feed the GeoDataFrame to trackintel directly and detect staypoints
staypoint_settings = {
    'method': 'sliding',
    'dist_threshold': 500,  # Adjust to 500 meters, consistent with the spatial resolution
    'time_threshold': 60,
}
# NOTE(review): gap_threshold and distance_metric are left at trackintel's
# defaults. The fixes here sit on a 30-minute grid and the coordinates are in
# meters — confirm that the default gap threshold does not exclude these fixes
# from staypoint detection and that the default distance metric suits projected
# coordinates.
positionfixes, staypoints = ti.preprocessing.positionfixes.generate_staypoints(
    gdf, **staypoint_settings
)



In [24]:
# Display the positionfixes frame
positionfixes

Unnamed: 0,user_id,d,tracked_at,x,y,geometry,longitude,latitude,staypoint_id
0,0,0,1970-01-01 10:00:00+09:00,40000,49500,POINT (40000.000 49500.000),40000.0,49500.0,
1,0,0,1970-01-01 10:30:00+09:00,40500,48500,POINT (40500.000 48500.000),40500.0,48500.0,
2,0,0,1970-01-01 12:30:00+09:00,41500,51000,POINT (41500.000 51000.000),41500.0,51000.0,
3,0,0,1970-01-01 13:00:00+09:00,40000,50500,POINT (40000.000 50500.000),40000.0,50500.0,
4,0,0,1970-01-01 13:30:00+09:00,40000,50500,POINT (40000.000 50500.000),40000.0,50500.0,
...,...,...,...,...,...,...,...,...,...
23912790,24999,5,1970-01-06 16:00:00+09:00,31500,50000,POINT (31500.000 50000.000),31500.0,50000.0,
23912791,24999,5,1970-01-06 16:30:00+09:00,30500,50500,POINT (30500.000 50500.000),30500.0,50500.0,
23912792,24999,5,1970-01-06 17:00:00+09:00,31500,56000,POINT (31500.000 56000.000),31500.0,56000.0,
23912793,24999,5,1970-01-06 17:30:00+09:00,41000,69500,POINT (41000.000 69500.000),41000.0,69500.0,


In [25]:
# Generate triplegs
# Only `positionfixes` is handed to trackintel; no staypoints frame is passed.
# NOTE(review): this presumably relies on the staypoint_id column already present
# in positionfixes — confirm against the trackintel documentation.
# gdf is deliberately left untouched here: renaming its 'tracked_at' column had no
# effect on this call and made the earlier cells that read gdf['tracked_at']
# fail when re-run.
positionfixes, triplegs = ti.preprocessing.generate_triplegs(
    positionfixes,
    method='between_staypoints',  # Defines triplegs as movement between staypoints
    gap_threshold=30,             # Maximum allowed gap in minutes
)

  pfs["tripleg_id"] = pfs["tripleg_id"].ffill()


In [27]:
# Display the triplegs frame
triplegs

Unnamed: 0_level_0,user_id,started_at,finished_at,geom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,1970-01-01 10:00:00+09:00,1970-01-01 10:30:00+09:00,"LINESTRING (40000.000 49500.000, 40500.000 485..."
1,0,1970-01-01 12:30:00+09:00,1970-01-01 13:30:00+09:00,"LINESTRING (41500.000 51000.000, 40000.000 505..."
3,0,1970-01-01 18:30:00+09:00,1970-01-01 19:30:00+09:00,"LINESTRING (40000.000 50500.000, 40000.000 510..."
4,0,1970-01-01 21:30:00+09:00,1970-01-01 22:00:00+09:00,"LINESTRING (40000.000 48500.000, 40000.000 505..."
5,0,1970-01-02 14:30:00+09:00,1970-01-02 15:00:00+09:00,"LINESTRING (40000.000 50500.000, 40000.000 500..."
...,...,...,...,...
392586,24999,1970-01-05 11:30:00+09:00,1970-01-05 15:30:00+09:00,"LINESTRING (23500.000 69000.000, 23500.000 695..."
392588,24999,1970-01-05 18:30:00+09:00,1970-01-05 20:00:00+09:00,"LINESTRING (43500.000 75500.000, 43500.000 745..."
392589,24999,1970-01-06 02:30:00+09:00,1970-01-06 10:00:00+09:00,"LINESTRING (43500.000 75000.000, 45000.000 695..."
392590,24999,1970-01-06 11:00:00+09:00,1970-01-06 12:00:00+09:00,"LINESTRING (34500.000 49000.000, 37500.000 500..."


In [117]:
# Extract sequences of (x, y) coordinates from triplegs
max_length = 10  # Maximum number of points kept in one sub-sequence

tripleg_sequences = []

# Only the user id and the LINESTRING geometry are needed, so iterate over those
# two columns directly instead of materializing a Series per row with iterrows().
for user_id, linestring in zip(triplegs['user_id'], triplegs['geom']):
    # Get the list of (x, y) coordinates
    coords = list(linestring.coords)

    # Split long triplegs into consecutive, non-overlapping chunks of at most
    # max_length points. The step between the last point of one chunk and the
    # first point of the next is therefore not part of any sub-sequence.
    for start in range(0, len(coords), max_length):
        sub_coords = coords[start:start + max_length]
        # A chunk holding a single leftover point is skipped.
        if len(sub_coords) > 1:
            tripleg_sequences.append({
                'user_id': user_id,
                'sequence': sub_coords
            })

# Convert to DataFrame
tripleg_sequences_df = pd.DataFrame(tripleg_sequences)

In [141]:
# Look-up table: (x, y) coordinate -> position of that coordinate in coord_list
coord_index_map = dict(zip(coord_list, range(len(coord_list))))


def get_gsp_sequence(sequence):
    """Translate a list of coordinates into their indices in coord_index_map.

    Coordinates that are not keys of coord_index_map are left out of the result.
    """
    indices = []
    for item in sequence:
        if item in coord_index_map:
            indices.append(coord_index_map[item])
    return indices


# Store the index-encoded form of every sequence in a new column
tripleg_sequences_df["gsp_sequence"] = tripleg_sequences_df["sequence"].apply(get_gsp_sequence)

# Preview the first rows of the updated DataFrame
tripleg_sequences_df.head()

Unnamed: 0,user_id,sequence,gsp_sequence
0,0,"[(40000.0, 49500.0), (40500.0, 48500.0)]","[0, 1]"
1,0,"[(41500.0, 51000.0), (40000.0, 50500.0), (4000...","[2, 3, 3]"
2,0,"[(40000.0, 50500.0), (40000.0, 51000.0), (3950...","[3, 4, 5]"
3,0,"[(40000.0, 48500.0), (40000.0, 50500.0)]","[6, 3]"
4,0,"[(40000.0, 50500.0), (40000.0, 50000.0)]","[3, 7]"


In [60]:
from collections import defaultdict

# Collect each user's sub-sequences, rendering every coordinate as a string so
# that it can serve as a hashable item.
sequences_by_user = defaultdict(list)
for user_id, sequence in zip(tripleg_sequences_df['user_id'], tripleg_sequences_df['sequence']):
    sequences_by_user[user_id].append([str(coord) for coord in sequence])

# Expose the grouping as a plain dict, keyed by user in order of first appearance
user_sequences = dict(sequences_by_user)

# Prepare sequences for GSP: concatenate each user's sub-sequences into one flat list
gsp_sequences = [
    [item for seq in sequences for item in seq]
    for sequences in user_sequences.values()
]

# gsp_sequences is now a list with one entry per user; each entry is a list of strings


In [153]:
# Group the index-encoded sub-sequences by user, keyed in order of first appearance
user_sequences = {}
for uid, encoded in zip(tripleg_sequences_df['user_id'], tripleg_sequences_df['gsp_sequence']):
    user_sequences.setdefault(uid, []).append(encoded)

# One entry per user: the list of that user's sub-sequences
gsp_data = list(user_sequences.values())

gsp_data

[[[0, 1],
  [2, 3, 3],
  [3, 4, 5],
  [6, 3],
  [3, 7],
  [8, 9, 9, 9, 9, 9, 9, 10],
  [9, 9, 9, 11, 9, 9],
  [12, 13, 14, 3],
  [16, 9, 9, 9, 16],
  [16, 16, 9, 16],
  [16, 9, 9],
  [17, 9, 2],
  [8, 18, 3],
  [8, 9, 16, 10, 9, 9, 15, 9],
  [9, 16, 16, 9, 16, 17, 17, 16],
  [19, 20],
  [21, 19, 9, 16, 9],
  [9, 9, 17, 9, 16, 15, 9],
  [19, 16, 15, 19, 9, 19, 3, 3, 3, 3]],
 [[22, 23], [24, 25], [28, 29, 30, 31, 31], [32, 25, 33, 25, 25], [35, 36]],
 [[37, 37, 38, 37, 38],
  [38, 37, 39, 40],
  [37, 38, 37, 38, 37],
  [37, 37, 37, 42, 43, 37],
  [38, 37, 37, 37, 38],
  [38, 37, 38, 37],
  [44, 45, 46, 47],
  [49, 40, 50],
  [38, 50],
  [50, 38, 50, 38, 50],
  [38, 50, 38, 50],
  [51, 50],
  [38, 50],
  [38, 50, 50],
  [38, 38, 38, 50, 38, 50, 38, 50, 38, 38],
  [50, 50],
  [52, 52, 38, 50],
  [38, 50, 38, 38, 50, 50, 50, 38],
  [50, 38, 53, 54, 55, 56, 52, 50, 38, 50],
  [38, 57]],
 [[58, 59, 60, 61],
  [62, 63],
  [64, 65, 66, 67],
  [62, 68, 69, 70],
  [71, 72, 62, 73, 62],
  [74, 75,

In [None]:
from pymining import seqmining

# Minimum support (number of users a pattern must appear in)
min_support = 2  # Adjust based on your dataset size

# Enumerate the frequent sequences and materialize the result as a list
freq_seqs = list(seqmining.freq_seq_enum(gsp_sequences, min_support))

In [155]:
from gsp_python.gsp import GSP

In [161]:
# Run GSP over the per-user sequences with a minimum support of 2
# NOTE(review): confirm how gsp_python interprets `minsup` and what element
# format it expects (the inner lists here can contain repeated items).
gsp_min_support = 2
gsp = GSP(gsp_data, minsup=gsp_min_support)
output = gsp.run_gsp()

In [163]:
# Display the result returned by gsp.run_gsp()
output

[]

In [143]:
from gsp_python.dataset_gen import DatasetGenerator

# Generate a sequence dataset with the package's own generator, for comparison
# with the structure of gsp_data.
generator_settings = dict(size=100, nevents=8, maxevents=4, avgelems=16)
algo_dsgen = DatasetGenerator(**generator_settings)
algo_dsgen.generate_sequence_dataset()

[[[3, 7, 8],
  [6],
  [1, 4, 5],
  [6],
  [8],
  [2, 4, 6],
  [4],
  [1, 4, 5, 7],
  [5, 8],
  [5, 7, 8],
  [3, 7, 8],
  [5],
  [2, 5, 6, 8],
  [5, 7, 8],
  [3, 4, 7, 8],
  [6],
  [4, 5, 7, 8]],
 [[2],
  [2, 7],
  [3],
  [6, 8],
  [2, 3, 5, 7],
  [5, 6],
  [2, 4, 8],
  [1, 4, 5, 8],
  [3, 7],
  [2],
  [4, 5, 7],
  [1, 4],
  [5],
  [5],
  [1, 3, 8]],
 [[5, 7, 8],
  [3, 5, 6],
  [2, 5, 7],
  [2],
  [3, 5, 6, 8],
  [2, 6, 7],
  [3, 8],
  [1, 4],
  [3, 6, 8],
  [1, 2, 5],
  [4, 8],
  [2, 4, 7],
  [1, 5, 8],
  [6, 7],
  [2, 3, 5, 7]],
 [[7],
  [1, 2, 6],
  [4, 5, 7],
  [2, 6],
  [1, 3, 5],
  [1, 7],
  [3, 4, 6, 7],
  [5, 7, 8],
  [3, 6, 7, 8],
  [1, 5],
  [2, 4, 5],
  [5],
  [4],
  [2, 5, 6, 7],
  [1, 8]],
 [[4, 5],
  [1, 2, 5],
  [5],
  [1, 5, 8],
  [1, 4, 8],
  [5],
  [4, 6],
  [2, 5, 6],
  [2, 3, 4, 6],
  [2, 5, 6, 7],
  [2, 7, 8],
  [3, 8],
  [6, 7],
  [2, 8],
  [2, 6, 7]],
 [[1, 2, 5],
  [1, 6, 7],
  [3, 4, 5, 7],
  [8],
  [1, 2, 3, 8],
  [3],
  [2, 5, 6, 7],
  [6, 7],
  [1, 6],
  [2, 