In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

Preprocessing for city D

In [3]:
# Raw mobility records for city D
mobility_data = pd.read_csv("kumamoto_challengedata.csv")

# POI distribution table for city D
poi_data = pd.read_csv("POIdata_cityD.csv")

# POI category names (headerless file); ids are assigned 1..N in file order
poi_categories = pd.read_csv(
    "POI_datacategories.csv", header=None, names=['category_name']
).assign(category_id=lambda frame: range(1, len(frame) + 1))

# Task 1 output: frequent itemsets
task1_output = pd.read_csv("frequent_itemsets_cityD.csv")

# Task 2 output: every 'Patterns' cell is evaluated back into a Python object.
# SECURITY NOTE(review): eval() runs whatever expression the CSV contains, so only
# load trusted files; ast.literal_eval would be safer if the cells are plain literals.
task2_patterns = (
    pd.read_csv('kumamoto_freq_subseq.csv')['Patterns']
    .apply(eval)
    .tolist()
)

In [4]:
# Attach the category name to every POI row; the left join keeps POI rows
# whose category id has no entry in poi_categories.
poi_data = poi_data.merge(
    poi_categories,
    how='left',
    left_on='category',
    right_on='category_id',
)


In [5]:
# Keep rows whose day index d is at most 30. Values that cannot be parsed as
# numbers become NaN and are dropped by the comparison.
mobility_data['d'] = pd.to_numeric(mobility_data['d'], errors='coerce')
mobility_data = mobility_data.loc[mobility_data['d'].le(30)]

In [6]:
# Drop rows whose x and y are both the -999 placeholder.
# NOTE(review): a row with only one coordinate equal to -999 is kept - confirm
# that this cannot occur in the data.
mobility_data = mobility_data[
    (mobility_data['x'] != -999) | (mobility_data['y'] != -999)
]


In [7]:
# Calculate frequent support for each POI using Task 1
def calculate_frequent_support(poi_category, task1_data):
    """Sum the support of all Task 1 itemsets that mention a POI category.

    Args:
        poi_category: Category name to look for. A missing value (None/NaN)
            is accepted and yields 0.0.
        task1_data: DataFrame with an 'itemsets' column (each entry must
            support the ``in`` operator) and a numeric 'support' column.

    Returns:
        The summed 'support' of the rows whose 'itemsets' entry contains
        ``poi_category``; 0.0 for a missing category.
    """
    # A left merge leaves the category name as NaN for unknown ids, and
    # `nan in "text"` raises TypeError, so treat a missing name as "no support".
    if pd.isna(poi_category):
        return 0.0

    # NOTE(review): when 'itemsets' holds strings (as read from CSV) this is a
    # substring test, so "Bar" would also match "Barber" - confirm acceptable.
    mentions_category = task1_data['itemsets'].apply(
        lambda itemset: poi_category in itemset
    )
    return task1_data.loc[mentions_category, 'support'].sum()

# One Task 1 lookup per POI row, keyed by the row's category name
poi_data['frequent_support'] = poi_data['category_name'].apply(
    calculate_frequent_support, args=(task1_output,)
)


In [8]:
# Collapse the POI table to one row per grid cell:
# summed POI count and mean frequent support over the cell's categories.
poi_aggregated = (
    poi_data
    .groupby(['x', 'y'])
    .agg(
        total_poi_count=pd.NamedAgg(column='POI_count', aggfunc='sum'),
        avg_frequent_support=pd.NamedAgg(column='frequent_support', aggfunc='mean'),
    )
    .reset_index()
)

In [9]:
# Attach the per-cell POI features to every mobility row.
# fillna(0) is applied to the whole frame, so cells without POI rows get 0 features.
mobility_data = (
    mobility_data
    .merge(poi_aggregated, how='left', on=['x', 'y'])
    .fillna(0)
)

In [10]:
# Sort the entire dataset once by user and time
mobility_data = mobility_data.sort_values(['uid', 'd', 't']).reset_index(drop=True)

# Precompute POI features as a NumPy array.
# The snapshot is taken after the sort/reset_index so that row k of the array
# describes row k of mobility_data; taking it before the sort leaves the two
# in different row orders.
poi_features = ['total_poi_count', 'avg_frequent_support']
poi_features_array = mobility_data[poi_features].to_numpy()

# Convert task2_patterns to a set of tuples for fast lookup
task2_patterns_set = set(map(tuple, task2_patterns))

# Exact-match test of a window against the frequent patterns
def match_pattern(sequence, patterns_set):
    """Return 1 if ``sequence`` equals one of the stored patterns, else 0.

    The sequence is converted to a tuple of per-step tuples and looked up in
    ``patterns_set``, so only a pattern identical to the whole sequence counts.
    NOTE(review): sub-sequence containment is not checked - confirm intended.
    """
    key = tuple(tuple(step) for step in sequence)
    if key in patterns_set:
        return 1
    return 0

# Enrich one sequence; takes a single tuple so it can be handed to a worker pool
def process_sequence(data):
    """Append POI features and the match flag to every step of a sequence.

    Args:
        data: Tuple ``(seq, match, poi_features_array)``. ``poi_features_array``
            is indexed by the step's position in ``seq``, so it must provide
            one feature row per step of this sequence.

    Returns:
        A list with one array per step: the step values, followed by that
        position's feature row, followed by ``match``.
    """
    seq, match, poi_features_array = data
    enriched_seq = []
    for position, step in enumerate(seq):
        enriched_seq.append(np.hstack((step, poi_features_array[position], match)))
    return enriched_seq

# Number of consecutive steps in each LSTM input window
sequence_length = 10

# Collects one (window, match_flag) pair per generated window
enhanced_sequences = []

# One group per user id
grouped = mobility_data.groupby(by='uid')

In [11]:
# Slide a window of sequence_length steps over each user's (x, y) trajectory
for _, group in tqdm(grouped, desc="Processing User Groups"):
    group_array = group[['x', 'y']].to_numpy()

    # NOTE(review): a user with exactly sequence_length rows would yield one
    # window but is skipped by this guard - confirm that is intended.
    num_sequences = len(group_array) - sequence_length
    if num_sequences <= 0:
        continue

    # The window spans all columns, so the second window axis has length 1; [:, 0] drops it
    sequences = np.lib.stride_tricks.sliding_window_view(
        group_array, (sequence_length, group_array.shape[1])
    )[:, 0]
    matches = [match_pattern(window, task2_patterns_set) for window in sequences]
    enhanced_sequences.extend(zip(sequences, matches))

Processing User Groups: 100%|██████████| 5983/5983 [00:14<00:00, 405.87it/s]


In [12]:
# Scale coordinates and POI features by their column maxima.
# NOTE(review): the windows in enhanced_sequences and poi_features_array were
# extracted from mobility_data in earlier cells, so this scaling does not appear
# to reach them - TODO confirm the intended cell order.
max_x = mobility_data['x'].max()
max_y = mobility_data['y'].max()
mobility_data['x'] = mobility_data['x'].div(max_x)
mobility_data['y'] = mobility_data['y'].div(max_y)

poi_features = ['total_poi_count', 'avg_frequent_support']
mobility_data[poi_features] = mobility_data[poi_features].div(
    mobility_data[poi_features].max()
)


In [13]:
# Build the LSTM inputs: every step becomes
# [x, y, total_poi_count, avg_frequent_support, match].
#
# POI features are looked up by the step's own (x, y). The step's position inside
# the window (0..sequence_length-1) is not a row index into the mobility table,
# so indexing a per-row feature array with it would give every window the
# features of the first few table rows.
# Assumes the windows hold raw grid coordinates (they are extracted before the
# normalisation cell) - TODO confirm if the cell order changes.
poi_lookup = {
    (x, y): features
    for x, y, features in zip(
        poi_aggregated['x'].tolist(),
        poi_aggregated['y'].tolist(),
        poi_aggregated[poi_features].to_numpy(),
    )
}
# Cells without POI rows get zeros, matching the fillna(0) used when merging
no_poi = np.zeros(len(poi_features))

final_sequences = []
for seq, match in tqdm(enhanced_sequences, desc="Creating LSTM Sequences"):
    enriched_seq = [
        np.hstack([step, poi_lookup.get((step[0], step[1]), no_poi), match])
        for step in seq
    ]
    final_sequences.append(enriched_seq)

final_sequences = np.array(final_sequences)

Creating LSTM Sequences: 100%|██████████| 3452950/3452950 [01:24<00:00, 40635.23it/s]


In [14]:
# Persist the city D sequences as an .npz archive under the key 'sequences'
np.savez('city_D_processed_data_optimized.npz', sequences=final_sequences)

Task 1 output (frequent itemsets) is used to calculate `frequent_support` for each grid cell (x, y).

Task 2 output (frequent sequences) is used to add a feature indicating whether a user sequence matches any frequent pattern.

The main dataset is filtered to the first 30 days.

Combine the enriched POI features and pattern matches with the main data.
Prepare sequences of fixed length for LSTM training.


In [15]:
# for city a

# Raw mobility records for city A
mobility_data = pd.read_csv("task1_dataset_kotae.csv")

# POI distribution table for city A
poi_data = pd.read_csv("POIdata_cityA.csv")

# POI category names (headerless file); ids are assigned 1..N in file order
poi_categories = pd.read_csv(
    "POI_datacategories.csv", header=None, names=['category_name']
).assign(category_id=lambda frame: range(1, len(frame) + 1))

# Task 1 output: frequent itemsets
task1_output = pd.read_csv("frequent_itemsets_cityA.csv")

# Task 2 output: every 'Patterns' cell is evaluated back into a Python object.
# SECURITY NOTE(review): eval() runs whatever expression the CSV contains, so only
# load trusted files; ast.literal_eval would be safer if the cells are plain literals.
task2_patterns = (
    pd.read_csv('kotae_freq_subseq.csv')['Patterns']
    .apply(eval)
    .tolist()
)

In [16]:
# Attach the category name to every POI row; the left join keeps POI rows
# whose category id has no entry in poi_categories.
poi_data = poi_data.merge(
    poi_categories,
    how='left',
    left_on='category',
    right_on='category_id',
)

# Keep rows whose day index d is at most 30. Values that cannot be parsed as
# numbers become NaN and are dropped by the comparison.
mobility_data['d'] = pd.to_numeric(mobility_data['d'], errors='coerce')
mobility_data = mobility_data.loc[mobility_data['d'].le(30)]

# Drop rows whose x and y are both the -999 placeholder.
# NOTE(review): a row with only one coordinate equal to -999 is kept - confirm
# that this cannot occur in the data.
mobility_data = mobility_data[
    (mobility_data['x'] != -999) | (mobility_data['y'] != -999)
]

# Calculate frequent support for each POI using Task 1
def calculate_frequent_support(poi_category, task1_data):
    """Sum the support of all Task 1 itemsets that mention a POI category.

    Args:
        poi_category: Category name to look for. A missing value (None/NaN)
            is accepted and yields 0.0.
        task1_data: DataFrame with an 'itemsets' column (each entry must
            support the ``in`` operator) and a numeric 'support' column.

    Returns:
        The summed 'support' of the rows whose 'itemsets' entry contains
        ``poi_category``; 0.0 for a missing category.
    """
    # A left merge leaves the category name as NaN for unknown ids, and
    # `nan in "text"` raises TypeError, so treat a missing name as "no support".
    if pd.isna(poi_category):
        return 0.0

    # NOTE(review): when 'itemsets' holds strings (as read from CSV) this is a
    # substring test, so "Bar" would also match "Barber" - confirm acceptable.
    mentions_category = task1_data['itemsets'].apply(
        lambda itemset: poi_category in itemset
    )
    return task1_data.loc[mentions_category, 'support'].sum()

# One Task 1 lookup per POI row, keyed by the row's category name
poi_data['frequent_support'] = poi_data['category_name'].apply(
    calculate_frequent_support, args=(task1_output,)
)

# Collapse the POI table to one row per grid cell:
# summed POI count and mean frequent support over the cell's categories.
poi_aggregated = poi_data.groupby(['x', 'y']).agg(
    total_poi_count=('POI_count', 'sum'),
    avg_frequent_support=('frequent_support', 'mean')
).reset_index()

# Attach the per-cell POI features to every mobility row.
# fillna(0) is applied to the whole frame, so cells without POI rows get 0 features.
mobility_data = pd.merge(
    mobility_data, poi_aggregated,
    on=['x', 'y'], how='left'
).fillna(0)

# Sort the entire dataset once by user and time
mobility_data = mobility_data.sort_values(['uid', 'd', 't']).reset_index(drop=True)

# Precompute POI features as a NumPy array.
# The snapshot is taken after the sort/reset_index so that row k of the array
# describes row k of mobility_data; taking it before the sort leaves the two
# in different row orders.
poi_features = ['total_poi_count', 'avg_frequent_support']
poi_features_array = mobility_data[poi_features].to_numpy()

# Convert task2_patterns to a set of tuples for fast lookup
task2_patterns_set = set(map(tuple, task2_patterns))

# Exact-match test of a window against the frequent patterns
def match_pattern(sequence, patterns_set):
    """Return 1 if ``sequence`` equals one of the stored patterns, else 0.

    The sequence is converted to a tuple of per-step tuples and looked up in
    ``patterns_set``, so only a pattern identical to the whole sequence counts.
    NOTE(review): sub-sequence containment is not checked - confirm intended.
    """
    key = tuple(tuple(step) for step in sequence)
    if key in patterns_set:
        return 1
    return 0

# Enrich one sequence; takes a single tuple so it can be handed to a worker pool
def process_sequence(data):
    """Append POI features and the match flag to every step of a sequence.

    Args:
        data: Tuple ``(seq, match, poi_features_array)``. ``poi_features_array``
            is indexed by the step's position in ``seq``, so it must provide
            one feature row per step of this sequence.

    Returns:
        A list with one array per step: the step values, followed by that
        position's feature row, followed by ``match``.
    """
    seq, match, poi_features_array = data
    enriched_seq = []
    for position, step in enumerate(seq):
        enriched_seq.append(np.hstack((step, poi_features_array[position], match)))
    return enriched_seq

# Number of consecutive steps in each LSTM input window
sequence_length = 10

# Collects one (window, match_flag) pair per generated window
enhanced_sequences = []

# One group per user id
grouped = mobility_data.groupby(by='uid')

In [17]:
# Slide a window of sequence_length steps over each user's (x, y) trajectory
for _, group in tqdm(grouped, desc="Processing User Groups"):
    group_array = group[['x', 'y']].to_numpy()

    # NOTE(review): a user with exactly sequence_length rows would yield one
    # window but is skipped by this guard - confirm that is intended.
    num_sequences = len(group_array) - sequence_length
    if num_sequences <= 0:
        continue

    # The window spans all columns, so the second window axis has length 1; [:, 0] drops it
    sequences = np.lib.stride_tricks.sliding_window_view(
        group_array, (sequence_length, group_array.shape[1])
    )[:, 0]
    matches = [match_pattern(window, task2_patterns_set) for window in sequences]
    enhanced_sequences.extend(zip(sequences, matches))

Processing User Groups: 100%|██████████| 99773/99773 [03:11<00:00, 521.42it/s]


In [18]:
# Scale coordinates and POI features by their column maxima.
# NOTE(review): the windows in enhanced_sequences and poi_features_array were
# extracted from mobility_data in earlier cells, so this scaling does not appear
# to reach them - TODO confirm the intended cell order.
max_x = mobility_data['x'].max()
max_y = mobility_data['y'].max()
mobility_data['x'] = mobility_data['x'].div(max_x)
mobility_data['y'] = mobility_data['y'].div(max_y)

poi_features = ['total_poi_count', 'avg_frequent_support']
mobility_data[poi_features] = mobility_data[poi_features].div(
    mobility_data[poi_features].max()
)

In [19]:
batch_size = 1000000  # Adjust based on available memory
final_sequences = []  # Temporary storage for the current batch
batch_index = 0  # To track batch numbers

# POI features are looked up by the step's own (x, y). The step's position inside
# the window (0..sequence_length-1) is not a row index into the mobility table,
# so indexing a per-row feature array with it would give every window the
# features of the first few table rows.
# Assumes the windows hold raw grid coordinates (they are extracted before the
# normalisation cell) - TODO confirm if the cell order changes.
poi_lookup = {
    (x, y): features
    for x, y, features in zip(
        poi_aggregated['x'].tolist(),
        poi_aggregated['y'].tolist(),
        poi_aggregated[poi_features].to_numpy(),
    )
}
# Cells without POI rows get zeros, matching the fillna(0) used when merging
no_poi = np.zeros(len(poi_features))

for idx, (seq, match) in enumerate(tqdm(enhanced_sequences, desc="Creating LSTM Sequences")):
    # Each step becomes [x, y, total_poi_count, avg_frequent_support, match]
    enriched_seq = [
        np.hstack([step, poi_lookup.get((step[0], step[1]), no_poi), match])
        for step in seq
    ]
    final_sequences.append(enriched_seq)

    # Flush a full batch to disk so that only one batch is held in memory
    if (idx + 1) % batch_size == 0:
        # Convert batch to NumPy array
        sequences = np.array(final_sequences)

        # Save batch to disk
        np.savez(f'city_A_batch_{batch_index}.npz', sequences=sequences)
        print(f"Batch {batch_index} saved with {len(final_sequences)} sequences.")

        # Reset for the next batch
        final_sequences = []
        batch_index += 1

# Save the remaining sequences as the last batch
if final_sequences:
    sequences = np.array(final_sequences)
    np.savez(f'city_A_batch_{batch_index}.npz', sequences=sequences)
    print(f"Final batch {batch_index} saved with {len(final_sequences)} sequences.")

Creating LSTM Sequences:   2%|▏         | 1009490/44382897 [00:29<1:39:28, 7267.13it/s]

Batch 0 saved with 1000000 sequences.


Creating LSTM Sequences:   5%|▍         | 2005535/44382897 [00:57<1:57:00, 6035.89it/s]

Batch 1 saved with 1000000 sequences.


Creating LSTM Sequences:   7%|▋         | 3007823/44382897 [01:26<1:46:36, 6468.48it/s] 

Batch 2 saved with 1000000 sequences.


Creating LSTM Sequences:   9%|▉         | 4008766/44382897 [01:54<1:20:03, 8404.61it/s]

Batch 3 saved with 1000000 sequences.


Creating LSTM Sequences:  11%|█▏        | 5007325/44382897 [02:22<1:42:17, 6415.92it/s] 

Batch 4 saved with 1000000 sequences.


Creating LSTM Sequences:  14%|█▎        | 6009035/44382897 [02:51<1:13:16, 8727.77it/s]

Batch 5 saved with 1000000 sequences.


Creating LSTM Sequences:  16%|█▌        | 7001513/44382897 [03:18<2:12:50, 4690.05it/s]

Batch 6 saved with 1000000 sequences.


Creating LSTM Sequences:  18%|█▊        | 8009134/44382897 [03:47<1:09:15, 8753.39it/s]

Batch 7 saved with 1000000 sequences.


Creating LSTM Sequences:  20%|██        | 9008313/44382897 [04:15<1:09:01, 8540.53it/s]

Batch 8 saved with 1000000 sequences.


Creating LSTM Sequences:  23%|██▎       | 10005595/44382897 [04:43<1:26:14, 6643.17it/s]

Batch 9 saved with 1000000 sequences.


Creating LSTM Sequences:  25%|██▍       | 11006910/44382897 [05:10<1:29:12, 6235.56it/s]

Batch 10 saved with 1000000 sequences.


Creating LSTM Sequences:  27%|██▋       | 12006427/44382897 [05:39<1:24:42, 6370.03it/s]

Batch 11 saved with 1000000 sequences.


Creating LSTM Sequences:  29%|██▉       | 13005257/44382897 [06:08<1:20:51, 6467.51it/s]

Batch 12 saved with 1000000 sequences.


Creating LSTM Sequences:  32%|███▏      | 14005122/44382897 [06:36<1:16:48, 6590.99it/s]

Batch 13 saved with 1000000 sequences.


Creating LSTM Sequences:  34%|███▍      | 15007848/44382897 [07:04<1:15:40, 6469.91it/s]

Batch 14 saved with 1000000 sequences.


Creating LSTM Sequences:  36%|███▌      | 16009571/44382897 [07:32<55:30, 8520.19it/s]  

Batch 15 saved with 1000000 sequences.


Creating LSTM Sequences:  38%|███▊      | 17007879/44382897 [08:00<1:08:29, 6661.16it/s]

Batch 16 saved with 1000000 sequences.


Creating LSTM Sequences:  41%|████      | 18005607/44382897 [08:28<1:09:51, 6292.42it/s]

Batch 17 saved with 1000000 sequences.


Creating LSTM Sequences:  43%|████▎     | 19007698/44382897 [08:55<1:04:11, 6587.67it/s]

Batch 18 saved with 1000000 sequences.


Creating LSTM Sequences:  45%|████▌     | 20009012/44382897 [09:24<48:29, 8377.31it/s]  

Batch 19 saved with 1000000 sequences.


Creating LSTM Sequences:  47%|████▋     | 21005985/44382897 [09:51<57:20, 6794.30it/s]  

Batch 20 saved with 1000000 sequences.


Creating LSTM Sequences:  50%|████▉     | 22004690/44382897 [10:19<58:11, 6409.88it/s]  

Batch 21 saved with 1000000 sequences.


Creating LSTM Sequences:  52%|█████▏    | 23006772/44382897 [10:46<53:44, 6628.91it/s]  

Batch 22 saved with 1000000 sequences.


Creating LSTM Sequences:  54%|█████▍    | 24004093/44382897 [11:15<59:41, 5690.15it/s]  

Batch 23 saved with 1000000 sequences.


Creating LSTM Sequences:  56%|█████▋    | 25005020/44382897 [11:42<56:37, 5704.29it/s]  

Batch 24 saved with 1000000 sequences.


Creating LSTM Sequences:  59%|█████▊    | 26006325/44382897 [12:10<47:45, 6413.89it/s]  

Batch 25 saved with 1000000 sequences.


Creating LSTM Sequences:  61%|██████    | 27004405/44382897 [12:39<47:26, 6105.05it/s]  

Batch 26 saved with 1000000 sequences.


Creating LSTM Sequences:  63%|██████▎   | 28006009/44382897 [13:07<48:25, 5636.94it/s]  

Batch 27 saved with 1000000 sequences.


Creating LSTM Sequences:  65%|██████▌   | 29002844/44382897 [13:35<55:11, 4644.61it/s] 

Batch 28 saved with 1000000 sequences.


Creating LSTM Sequences:  68%|██████▊   | 30004925/44382897 [14:04<37:45, 6347.76it/s] 

Batch 29 saved with 1000000 sequences.


Creating LSTM Sequences:  70%|██████▉   | 31005446/44382897 [14:31<34:40, 6428.49it/s] 

Batch 30 saved with 1000000 sequences.


Creating LSTM Sequences:  72%|███████▏  | 32006177/44382897 [14:59<30:00, 6872.78it/s] 

Batch 31 saved with 1000000 sequences.


Creating LSTM Sequences:  74%|███████▍  | 33008477/44382897 [15:26<27:55, 6787.65it/s] 

Batch 32 saved with 1000000 sequences.


Creating LSTM Sequences:  77%|███████▋  | 34007761/44382897 [15:54<25:03, 6900.59it/s] 

Batch 33 saved with 1000000 sequences.


Creating LSTM Sequences:  79%|███████▉  | 35006099/44382897 [16:21<23:23, 6680.85it/s] 

Batch 34 saved with 1000000 sequences.


Creating LSTM Sequences:  81%|████████  | 36005854/44382897 [16:49<21:03, 6628.34it/s] 

Batch 35 saved with 1000000 sequences.


Creating LSTM Sequences:  83%|████████▎ | 37005639/44382897 [17:17<18:57, 6484.27it/s] 

Batch 36 saved with 1000000 sequences.


Creating LSTM Sequences:  86%|████████▌ | 38004742/44382897 [17:45<16:14, 6544.15it/s] 

Batch 37 saved with 1000000 sequences.


Creating LSTM Sequences:  88%|████████▊ | 39007478/44382897 [18:14<14:10, 6318.86it/s] 

Batch 38 saved with 1000000 sequences.


Creating LSTM Sequences:  90%|█████████ | 40006823/44382897 [18:41<11:08, 6544.78it/s] 

Batch 39 saved with 1000000 sequences.


Creating LSTM Sequences:  92%|█████████▏| 41008233/44382897 [19:09<08:34, 6562.13it/s] 

Batch 40 saved with 1000000 sequences.


Creating LSTM Sequences:  95%|█████████▍| 42006356/44382897 [19:37<06:00, 6590.51it/s] 

Batch 41 saved with 1000000 sequences.


Creating LSTM Sequences:  97%|█████████▋| 43007482/44382897 [20:05<03:35, 6373.35it/s] 

Batch 42 saved with 1000000 sequences.


Creating LSTM Sequences:  99%|█████████▉| 44006045/44382897 [20:33<00:57, 6536.58it/s] 

Batch 43 saved with 1000000 sequences.


Creating LSTM Sequences: 100%|██████████| 44382897/44382897 [20:42<00:00, 35717.99it/s]


Final batch 44 saved with 382897 sequences.


In [20]:
# Number of batch files written by the batching cell: ceil(total / batch_size).
# Derived from the data rather than from batch_index, which ends one past the
# last file when the total is an exact multiple of batch_size (no remainder
# batch is written in that case, so loading index batch_index would fail).
num_batches = -(-len(enhanced_sequences) // batch_size)

batch_arrays = []
for i in range(num_batches):
    with np.load(f'city_A_batch_{i}.npz') as data:
        # Indexing the archive reads the array into memory before the file closes
        batch_arrays.append(data['sequences'])

# Join whole batches along the first axis instead of extending a Python list
# with one small array per sequence.
combined_sequences = np.concatenate(batch_arrays) if batch_arrays else np.array([])
np.savez('city_A_processed_data_optimized.npz', sequences=combined_sequences)
print("All batches combined and saved.")

All batches combined and saved.


In [21]:
# Save the combined array. At this point final_sequences only holds the last
# partial batch left over from the batching loop, so saving it here would
# replace the combined file written by the previous cell with a fraction of the data.
np.savez('city_A_processed_data_optimized.npz', sequences=combined_sequences)

In [22]:
# for city b

# Raw mobility records for city B
mobility_data = pd.read_csv("hiroshima_challengedata.csv")

# POI distribution table for city B
poi_data = pd.read_csv("POIdata_cityB.csv")

# POI category names (headerless file); ids are assigned 1..N in file order
poi_categories = pd.read_csv(
    "POI_datacategories.csv", header=None, names=['category_name']
).assign(category_id=lambda frame: range(1, len(frame) + 1))

# Task 1 output: frequent itemsets
task1_output = pd.read_csv("frequent_itemsets_cityB.csv")

# Task 2 output: every 'Patterns' cell is evaluated back into a Python object.
# SECURITY NOTE(review): eval() runs whatever expression the CSV contains, so only
# load trusted files; ast.literal_eval would be safer if the cells are plain literals.
task2_patterns = (
    pd.read_csv('hiroshima_freq_subseq.csv')['Patterns']
    .apply(eval)
    .tolist()
)


In [23]:
# Attach the category name to every POI row; the left join keeps POI rows
# whose category id has no entry in poi_categories.
poi_data = poi_data.merge(
    poi_categories,
    how='left',
    left_on='category',
    right_on='category_id',
)

# Keep rows whose day index d is at most 30. Values that cannot be parsed as
# numbers become NaN and are dropped by the comparison.
mobility_data['d'] = pd.to_numeric(mobility_data['d'], errors='coerce')
mobility_data = mobility_data.loc[mobility_data['d'].le(30)]

# Drop rows whose x and y are both the -999 placeholder.
# NOTE(review): a row with only one coordinate equal to -999 is kept - confirm
# that this cannot occur in the data.
mobility_data = mobility_data[
    (mobility_data['x'] != -999) | (mobility_data['y'] != -999)
]

# Calculate frequent support for each POI using Task 1
def calculate_frequent_support(poi_category, task1_data):
    """Sum the support of all Task 1 itemsets that mention a POI category.

    Args:
        poi_category: Category name to look for. A missing value (None/NaN)
            is accepted and yields 0.0.
        task1_data: DataFrame with an 'itemsets' column (each entry must
            support the ``in`` operator) and a numeric 'support' column.

    Returns:
        The summed 'support' of the rows whose 'itemsets' entry contains
        ``poi_category``; 0.0 for a missing category.
    """
    # A left merge leaves the category name as NaN for unknown ids, and
    # `nan in "text"` raises TypeError, so treat a missing name as "no support".
    if pd.isna(poi_category):
        return 0.0

    # NOTE(review): when 'itemsets' holds strings (as read from CSV) this is a
    # substring test, so "Bar" would also match "Barber" - confirm acceptable.
    mentions_category = task1_data['itemsets'].apply(
        lambda itemset: poi_category in itemset
    )
    return task1_data.loc[mentions_category, 'support'].sum()

# One Task 1 lookup per POI row, keyed by the row's category name
poi_data['frequent_support'] = poi_data['category_name'].apply(
    calculate_frequent_support, args=(task1_output,)
)

# Collapse the POI table to one row per grid cell:
# summed POI count and mean frequent support over the cell's categories.
poi_aggregated = poi_data.groupby(['x', 'y']).agg(
    total_poi_count=('POI_count', 'sum'),
    avg_frequent_support=('frequent_support', 'mean')
).reset_index()

# Attach the per-cell POI features to every mobility row.
# fillna(0) is applied to the whole frame, so cells without POI rows get 0 features.
mobility_data = pd.merge(
    mobility_data, poi_aggregated,
    on=['x', 'y'], how='left'
).fillna(0)

# Sort the entire dataset once by user and time
mobility_data = mobility_data.sort_values(['uid', 'd', 't']).reset_index(drop=True)

# Precompute POI features as a NumPy array.
# The snapshot is taken after the sort/reset_index so that row k of the array
# describes row k of mobility_data; taking it before the sort leaves the two
# in different row orders.
poi_features = ['total_poi_count', 'avg_frequent_support']
poi_features_array = mobility_data[poi_features].to_numpy()

# Convert task2_patterns to a set of tuples for fast lookup
task2_patterns_set = set(map(tuple, task2_patterns))

# Exact-match test of a window against the frequent patterns
def match_pattern(sequence, patterns_set):
    """Return 1 if ``sequence`` equals one of the stored patterns, else 0.

    The sequence is converted to a tuple of per-step tuples and looked up in
    ``patterns_set``, so only a pattern identical to the whole sequence counts.
    NOTE(review): sub-sequence containment is not checked - confirm intended.
    """
    key = tuple(tuple(step) for step in sequence)
    if key in patterns_set:
        return 1
    return 0

# Enrich one sequence; takes a single tuple so it can be handed to a worker pool
def process_sequence(data):
    """Append POI features and the match flag to every step of a sequence.

    Args:
        data: Tuple ``(seq, match, poi_features_array)``. ``poi_features_array``
            is indexed by the step's position in ``seq``, so it must provide
            one feature row per step of this sequence.

    Returns:
        A list with one array per step: the step values, followed by that
        position's feature row, followed by ``match``.
    """
    seq, match, poi_features_array = data
    enriched_seq = []
    for position, step in enumerate(seq):
        enriched_seq.append(np.hstack((step, poi_features_array[position], match)))
    return enriched_seq

# Number of consecutive steps in each LSTM input window
sequence_length = 10

# Collects one (window, match_flag) pair per generated window
enhanced_sequences = []

# One group per user id
grouped = mobility_data.groupby(by='uid')

In [24]:
# Slide a window of sequence_length steps over each user's (x, y) trajectory
for _, group in tqdm(grouped, desc="Processing User Groups"):
    group_array = group[['x', 'y']].to_numpy()

    # NOTE(review): a user with exactly sequence_length rows would yield one
    # window but is skipped by this guard - confirm that is intended.
    num_sequences = len(group_array) - sequence_length
    if num_sequences <= 0:
        continue

    # The window spans all columns, so the second window axis has length 1; [:, 0] drops it
    sequences = np.lib.stride_tricks.sliding_window_view(
        group_array, (sequence_length, group_array.shape[1])
    )[:, 0]
    matches = [match_pattern(window, task2_patterns_set) for window in sequences]
    enhanced_sequences.extend(zip(sequences, matches))

Processing User Groups: 100%|██████████| 24906/24906 [00:42<00:00, 591.89it/s]


In [25]:
# Scale coordinates and POI features by their column maxima.
# NOTE(review): the windows in enhanced_sequences and poi_features_array were
# extracted from mobility_data in earlier cells, so this scaling does not appear
# to reach them - TODO confirm the intended cell order.
max_x = mobility_data['x'].max()
max_y = mobility_data['y'].max()
mobility_data['x'] = mobility_data['x'].div(max_x)
mobility_data['y'] = mobility_data['y'].div(max_y)

poi_features = ['total_poi_count', 'avg_frequent_support']
mobility_data[poi_features] = mobility_data[poi_features].div(
    mobility_data[poi_features].max()
)

In [26]:
# Build the LSTM inputs: every step becomes
# [x, y, total_poi_count, avg_frequent_support, match].
#
# POI features are looked up by the step's own (x, y). The step's position inside
# the window (0..sequence_length-1) is not a row index into the mobility table,
# so indexing a per-row feature array with it would give every window the
# features of the first few table rows.
# Assumes the windows hold raw grid coordinates (they are extracted before the
# normalisation cell) - TODO confirm if the cell order changes.
poi_lookup = {
    (x, y): features
    for x, y, features in zip(
        poi_aggregated['x'].tolist(),
        poi_aggregated['y'].tolist(),
        poi_aggregated[poi_features].to_numpy(),
    )
}
# Cells without POI rows get zeros, matching the fillna(0) used when merging
no_poi = np.zeros(len(poi_features))

final_sequences = []
for seq, match in tqdm(enhanced_sequences, desc="Creating LSTM Sequences"):
    enriched_seq = [
        np.hstack([step, poi_lookup.get((step[0], step[1]), no_poi), match])
        for step in seq
    ]
    final_sequences.append(enriched_seq)

final_sequences = np.array(final_sequences)

Creating LSTM Sequences: 100%|██████████| 9859291/9859291 [04:12<00:00, 39047.83it/s]


In [27]:
# Persist the city B sequences as an .npz archive under the key 'sequences'
np.savez('city_B_processed_data_optimized.npz', sequences=final_sequences)

In [28]:

# for city c

# Raw mobility records for city C
mobility_data = pd.read_csv("sapporo_challengedata.csv")

# POI distribution table for city C
poi_data = pd.read_csv("POIdata_cityC.csv")

# POI category names (headerless file); ids are assigned 1..N in file order
poi_categories = pd.read_csv(
    "POI_datacategories.csv", header=None, names=['category_name']
).assign(category_id=lambda frame: range(1, len(frame) + 1))

# Task 1 output: frequent itemsets
task1_output = pd.read_csv("frequent_itemsets_cityC.csv")

# Task 2 output: every 'Patterns' cell is evaluated back into a Python object.
# SECURITY NOTE(review): eval() runs whatever expression the CSV contains, so only
# load trusted files; ast.literal_eval would be safer if the cells are plain literals.
task2_patterns = (
    pd.read_csv('sapporo_freq_subseq.csv')['Patterns']
    .apply(eval)
    .tolist()
)


In [29]:
# Attach the category name to every POI row; the left join keeps POI rows
# whose category id has no entry in poi_categories.
poi_data = poi_data.merge(
    poi_categories,
    how='left',
    left_on='category',
    right_on='category_id',
)

# Keep rows whose day index d is at most 30. Values that cannot be parsed as
# numbers become NaN and are dropped by the comparison.
mobility_data['d'] = pd.to_numeric(mobility_data['d'], errors='coerce')
mobility_data = mobility_data.loc[mobility_data['d'].le(30)]

# Drop rows whose x and y are both the -999 placeholder.
# NOTE(review): a row with only one coordinate equal to -999 is kept - confirm
# that this cannot occur in the data.
mobility_data = mobility_data[
    (mobility_data['x'] != -999) | (mobility_data['y'] != -999)
]

# Calculate frequent support for each POI using Task 1
def calculate_frequent_support(poi_category, task1_data):
    """Sum the support of all Task 1 itemsets that mention a POI category.

    Args:
        poi_category: Category name to look for. A missing value (None/NaN)
            is accepted and yields 0.0.
        task1_data: DataFrame with an 'itemsets' column (each entry must
            support the ``in`` operator) and a numeric 'support' column.

    Returns:
        The summed 'support' of the rows whose 'itemsets' entry contains
        ``poi_category``; 0.0 for a missing category.
    """
    # A left merge leaves the category name as NaN for unknown ids, and
    # `nan in "text"` raises TypeError, so treat a missing name as "no support".
    if pd.isna(poi_category):
        return 0.0

    # NOTE(review): when 'itemsets' holds strings (as read from CSV) this is a
    # substring test, so "Bar" would also match "Barber" - confirm acceptable.
    mentions_category = task1_data['itemsets'].apply(
        lambda itemset: poi_category in itemset
    )
    return task1_data.loc[mentions_category, 'support'].sum()

# One Task 1 lookup per POI row, keyed by the row's category name
poi_data['frequent_support'] = poi_data['category_name'].apply(
    calculate_frequent_support, args=(task1_output,)
)

# Collapse the POI table to one row per grid cell:
# summed POI count and mean frequent support over the cell's categories.
poi_aggregated = poi_data.groupby(['x', 'y']).agg(
    total_poi_count=('POI_count', 'sum'),
    avg_frequent_support=('frequent_support', 'mean')
).reset_index()

# Attach the per-cell POI features to every mobility row.
# fillna(0) is applied to the whole frame, so cells without POI rows get 0 features.
mobility_data = pd.merge(
    mobility_data, poi_aggregated,
    on=['x', 'y'], how='left'
).fillna(0)

# Sort the entire dataset once by user and time
mobility_data = mobility_data.sort_values(['uid', 'd', 't']).reset_index(drop=True)

# Precompute POI features as a NumPy array.
# The snapshot is taken after the sort/reset_index so that row k of the array
# describes row k of mobility_data; taking it before the sort leaves the two
# in different row orders.
poi_features = ['total_poi_count', 'avg_frequent_support']
poi_features_array = mobility_data[poi_features].to_numpy()

# Convert task2_patterns to a set of tuples for fast lookup
task2_patterns_set = set(map(tuple, task2_patterns))

# Exact-match test of a window against the frequent patterns
def match_pattern(sequence, patterns_set):
    """Return 1 if ``sequence`` equals one of the stored patterns, else 0.

    The sequence is converted to a tuple of per-step tuples and looked up in
    ``patterns_set``, so only a pattern identical to the whole sequence counts.
    NOTE(review): sub-sequence containment is not checked - confirm intended.
    """
    key = tuple(tuple(step) for step in sequence)
    if key in patterns_set:
        return 1
    return 0

# Enrich one sequence; takes a single tuple so it can be handed to a worker pool
def process_sequence(data):
    """Append POI features and the match flag to every step of a sequence.

    Args:
        data: Tuple ``(seq, match, poi_features_array)``. ``poi_features_array``
            is indexed by the step's position in ``seq``, so it must provide
            one feature row per step of this sequence.

    Returns:
        A list with one array per step: the step values, followed by that
        position's feature row, followed by ``match``.
    """
    seq, match, poi_features_array = data
    enriched_seq = []
    for position, step in enumerate(seq):
        enriched_seq.append(np.hstack((step, poi_features_array[position], match)))
    return enriched_seq

# Number of consecutive steps in each LSTM input window
sequence_length = 10

# Collects one (window, match_flag) pair per generated window
enhanced_sequences = []

# One group per user id
grouped = mobility_data.groupby(by='uid')

In [30]:
# Slide a window of sequence_length steps over each user's (x, y) trajectory
for _, group in tqdm(grouped, desc="Processing User Groups"):
    group_array = group[['x', 'y']].to_numpy()

    # NOTE(review): a user with exactly sequence_length rows would yield one
    # window but is skipped by this guard - confirm that is intended.
    num_sequences = len(group_array) - sequence_length
    if num_sequences <= 0:
        continue

    # The window spans all columns, so the second window axis has length 1; [:, 0] drops it
    sequences = np.lib.stride_tricks.sliding_window_view(
        group_array, (sequence_length, group_array.shape[1])
    )[:, 0]
    matches = [match_pattern(window, task2_patterns_set) for window in sequences]
    enhanced_sequences.extend(zip(sequences, matches))

Processing User Groups: 100%|██████████| 19950/19950 [00:35<00:00, 560.96it/s]


In [31]:
# Scale coordinates and POI features by their column maxima.
# NOTE(review): the windows in enhanced_sequences and poi_features_array were
# extracted from mobility_data in earlier cells, so this scaling does not appear
# to reach them - TODO confirm the intended cell order.
max_x = mobility_data['x'].max()
max_y = mobility_data['y'].max()
mobility_data['x'] = mobility_data['x'].div(max_x)
mobility_data['y'] = mobility_data['y'].div(max_y)

poi_features = ['total_poi_count', 'avg_frequent_support']
mobility_data[poi_features] = mobility_data[poi_features].div(
    mobility_data[poi_features].max()
)

In [32]:
# Build the LSTM inputs: every step becomes
# [x, y, total_poi_count, avg_frequent_support, match].
#
# POI features are looked up by the step's own (x, y). The step's position inside
# the window (0..sequence_length-1) is not a row index into the mobility table,
# so indexing a per-row feature array with it would give every window the
# features of the first few table rows.
# Assumes the windows hold raw grid coordinates (they are extracted before the
# normalisation cell) - TODO confirm if the cell order changes.
poi_lookup = {
    (x, y): features
    for x, y, features in zip(
        poi_aggregated['x'].tolist(),
        poi_aggregated['y'].tolist(),
        poi_aggregated[poi_features].to_numpy(),
    )
}
# Cells without POI rows get zeros, matching the fillna(0) used when merging
no_poi = np.zeros(len(poi_features))

final_sequences = []
for seq, match in tqdm(enhanced_sequences, desc="Creating LSTM Sequences"):
    enriched_seq = [
        np.hstack([step, poi_lookup.get((step[0], step[1]), no_poi), match])
        for step in seq
    ]
    final_sequences.append(enriched_seq)

final_sequences = np.array(final_sequences)

Creating LSTM Sequences: 100%|██████████| 7517188/7517188 [03:17<00:00, 38102.04it/s]


In [31]:
# Persist the city C sequences as an .npz archive under the key 'sequences'
np.savez('city_C_processed_data_optimized.npz', sequences=final_sequences)