In [None]:
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree
from pyproj import Proj
import torch
from torch.utils.data import Dataset

# Constants
RAW_SEQ_LEN = 50  # stride used by split_indices_for_next_prediction when sampling row positions
PAD = 0  # NOTE(review): not referenced in the visible cells; presumably the sequence padding value — confirm
K = 5  # NOTE(review): not referenced in the visible cells; possibly a nearest-neighbour count — confirm
DISTANCE_THRESHOLD = 100  # NOTE(review): not referenced in the visible cells; units are unconfirmed

def split_indices_for_next_prediction(df, seq_len=None):
    """Split the row positions of ``df`` into train/validation/test index frames.

    The rows are divided positionally into the first 70 %, the next 15 % and
    the final 15 %. Within each range every ``seq_len``-th position is kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``len(df)`` is used; no column is read.
    seq_len : int, optional
        Stride between kept positions. Defaults to the module-level
        ``RAW_SEQ_LEN``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_indices, val_indices, test_indices)``. Each frame has the
        columns ``first_idx`` and ``last_idx`` and keeps the pre-stride row
        labels (0, seq_len, 2 * seq_len, ...).

    Notes
    -----
    NOTE(review): ``last_idx`` always equals ``first_idx``, exactly as in the
    original code. If ``last_idx`` is meant to mark the end of a window this
    needs a follow-up change; confirm with the consumer of these frames.
    NOTE(review): the split is purely positional and is not done per user.
    """
    step = RAW_SEQ_LEN if seq_len is None else seq_len
    total_len = len(df)
    train_end = int(total_len * 0.7)
    val_end = int(total_len * 0.85)

    def _strided_positions(start, stop):
        # One shared builder guarantees identical column names for all three
        # splits (the validation split previously used 'last_index').
        positions = range(start, stop)
        frame = pd.DataFrame({'first_idx': positions, 'last_idx': positions})
        return frame.iloc[::step]

    train_indices = _strided_positions(0, train_end)
    val_indices = _strided_positions(train_end, val_end)
    test_indices = _strided_positions(val_end, total_len)

    return train_indices, val_indices, test_indices

# 1. Load the Data
# All three inputs live under one dataset directory. Point DATA_DIR at your
# local copy instead of editing each path individually.
DATA_DIR = "/Users/mehul/Downloads/novateur.phase2.trial4"
poi_cat_vecs_df = pd.read_parquet(f"{DATA_DIR}/poi_cat_vecs.parquet")
poi_df = pd.read_csv(f"{DATA_DIR}/poi.csv")
# Only the group=0 partition of the stay/POI table is read here.
stay_poi_df = pd.read_parquet(f"{DATA_DIR}/stay_poi_dfs/group=0/stay_poi.parquet")

# 2. Project Coordinates to UTM
def project_to_utm(df, lat_col='latitude', lon_col='longitude'):
    """Add projected ``x`` and ``y`` columns to ``df`` and return it.

    A single UTM zone is chosen from the mean longitude of the frame, and
    every row is projected with that zone on the WGS84 ellipsoid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lat_col`` and ``lon_col``. The frame is modified in
        place: columns ``x`` and ``y`` are added or overwritten.
    lat_col, lon_col : str
        Names of the latitude and longitude columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    mean_lon = df[lon_col].mean()
    # The modulo wraps a mean longitude of exactly +180 back to zone 1 rather
    # than producing zone 61, which does not exist.
    zone_number = int((mean_lon + 180) // 6) % 60 + 1
    utm_proj = Proj(proj='utm', zone=zone_number, ellps='WGS84')
    df['x'], df['y'] = utm_proj(df[lon_col].values, df[lat_col].values)
    return df

# Attach projected x/y columns to the stay table and to the POI table.
stay_poi_df = project_to_utm(stay_poi_df, lat_col='latitude', lon_col='longitude')
poi_df = project_to_utm(poi_df, lat_col='latitude', lon_col='longitude')

# 3. Calculate Duration (in Hours)
# Parse both timestamp columns in one pass, then store the stay length.
stay_poi_df[['arrival_time', 'departure_time']] = (
    stay_poi_df[['arrival_time', 'departure_time']].apply(pd.to_datetime)
)
stay_poi_df['duration'] = (
    stay_poi_df['departure_time']
    .sub(stay_poi_df['arrival_time'])
    .dt.total_seconds()
    .div(3600)  # seconds -> hours
)



In [19]:
poi_cat_vecs_df.iloc[0]

BuyGoods           True
ChildCare         False
DropOff           False
EatOut            False
Errands           False
Exercise          False
HealthCare        False
Home              False
Recreation         True
Religious         False
School            False
Services           True
SomethingElse     False
Transportation    False
Visit             False
Work              False
Name: 0, dtype: bool

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import pytz
from pyproj import Proj

In [18]:
### Coordinate Projection

# Map a longitude in degrees to its UTM zone number.
def get_utm_zone(longitude):
    """Return the UTM zone number (1-60) for ``longitude`` given in degrees."""
    shifted = longitude + 180      # shift so the westernmost band starts at 0
    band = np.floor(shifted / 6)   # each zone is 6 degrees wide
    wrapped = band % 60            # +180 folds back onto the first band
    return int(wrapped + 1)        # zones are numbered from 1

# Project latitude and longitude in the dataframe to UTM x,y coordinates.
def project_latlon_to_x_y(df, lat_col='latitude', lon_col='longitude'):
    """Add mean-centred UTM ``x``/``y`` columns to ``df`` and return it.

    One UTM zone is selected from the mean longitude of the frame (via
    ``get_utm_zone``) and used for every row. The projected coordinates are
    then shifted so that each of ``x`` and ``y`` has zero mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lat_col`` and ``lon_col``. The frame is modified in
        place: columns ``x`` and ``y`` are added or overwritten.
    lat_col, lon_col : str
        Names of the latitude and longitude columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    if df.empty:
        # No rows means no centroid to pick a zone from; the mean would be NaN
        # and int(NaN) raises. Return the frame with empty x/y columns instead.
        df['x'] = np.array([], dtype=float)
        df['y'] = np.array([], dtype=float)
        return df

    # Use the centroid of the coordinates to decide the projection zone.
    # The column must be looked up with df[lon_col]; attribute access
    # (df.lon_col) would search for a column literally named "lon_col".
    lon_c = df[lon_col].mean()
    zone_number = get_utm_zone(lon_c)
    proj_utm = Proj(proj='utm', zone=zone_number, ellps='WGS84')

    # Project the lon/lat arrays to UTM x and y coordinates
    x, y = proj_utm(df[lon_col].values, df[lat_col].values)
    df['x'] = x - np.mean(x)  # center the x-values
    df['y'] = y - np.mean(y)  # center the y-values
    return df


In [19]:
def clean_stay_poi(df, min_duration_minutes=5):
    """Return a cleaned, projected copy of a stay table.

    Steps, all applied to a copy so the caller's frame is left untouched:

    1. Parse ``start_timestamp`` and ``stop_timestamp`` with ``pd.to_datetime``.
    2. Coerce ``latitude`` and ``longitude`` to numeric; unparseable values
       become NaN.
    3. Compute ``duration_minutes`` as stop minus start.
    4. Drop rows missing any of the four key fields, and rows shorter than
       ``min_duration_minutes``.
    5. Add ``x``/``y`` columns via ``project_latlon_to_x_y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude``, ``longitude``, ``start_timestamp`` and
        ``stop_timestamp``.
    min_duration_minutes : float, optional
        Minimum stay length to keep, in minutes. Defaults to 5.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows with their original index
        labels, plus ``duration_minutes`` and the projected columns.
    """
    key_cols = ['latitude', 'longitude', 'start_timestamp', 'stop_timestamp']

    # Work on a copy: the caller's frame would otherwise end up half-cleaned
    # (converted and de-nulled, but neither filtered nor projected).
    cleaned = df.copy()
    cleaned['start_timestamp'] = pd.to_datetime(cleaned['start_timestamp'])
    cleaned['stop_timestamp'] = pd.to_datetime(cleaned['stop_timestamp'])

    # Convert latitude/longitude to float
    cleaned['latitude'] = pd.to_numeric(cleaned['latitude'], errors='coerce')
    cleaned['longitude'] = pd.to_numeric(cleaned['longitude'], errors='coerce')

    # Compute duration in minutes (NaN where either timestamp is missing)
    elapsed = cleaned['stop_timestamp'] - cleaned['start_timestamp']
    cleaned['duration_minutes'] = elapsed.dt.total_seconds() / 60

    # Keep rows that have every key field and are long enough. Selecting once
    # and copying gives the projection step an independent frame to write
    # x/y into, which avoids SettingWithCopyWarning.
    has_keys = cleaned[key_cols].notna().all(axis=1)
    long_enough = cleaned['duration_minutes'] >= min_duration_minutes
    cleaned = cleaned.loc[has_keys & long_enough].copy()

    # Project lat/lon into UTM coordinates
    return project_latlon_to_x_y(cleaned, lat_col='latitude', lon_col='longitude')


In [17]:
demographics_df.iloc[1]

Unnamed: 0                        1
id                                1
household_id                      1
household_member_id               0
age                              83
sex                          FEMALE
household_role                 HEAD
emp_status                     NONE
industry                       NONE
income_percentile              78.3
religion                     SHINTO
num_cars_in_household             1
spouse_id                        -1
parent_ids                       []
child_ids                        []
home_latitude             35.311655
home_longitude           139.316789
agent                         44586
gender                            1
married                       False
children                      False
__index_level_0__                 1
Name: 1, dtype: object

In [18]:
stay_df.head()

Unnamed: 0,agent,id,latitude,longitude,start_timestamp,stop_timestamp
0,0,0,35.551188,139.568407,2024-10-15 15:00:00+00:00,2024-10-15 20:25:00+00:00
1,0,1,35.523382,139.590033,2024-10-15 20:30:40+00:00,2024-10-15 20:53:00+00:00
2,0,2,35.60559,139.515227,2024-10-15 21:04:50+00:00,2024-10-16 13:55:40+00:00
3,0,3,35.551188,139.568407,2024-10-16 14:03:40+00:00,2024-10-16 21:47:40+00:00
4,0,4,35.60559,139.515227,2024-10-16 21:55:40+00:00,2024-10-17 03:55:20+00:00


In [19]:
stay_poi_df.head()

Unnamed: 0,agent,id,latitude,longitude,start_timestamp,stop_timestamp,poi_id,act,act_name
0,0,0,35.551188,139.568407,2024-10-15 15:00:00+00:00,2024-10-15 20:25:00+00:00,162296.0,1,Home
1,0,1,35.523382,139.590033,2024-10-15 20:30:40+00:00,2024-10-15 20:53:00+00:00,46682.0,10,Exercise
2,0,2,35.60559,139.515227,2024-10-15 21:04:50+00:00,2024-10-16 13:55:40+00:00,52496.0,2,Work
3,0,3,35.551188,139.568407,2024-10-16 14:03:40+00:00,2024-10-16 21:47:40+00:00,162296.0,1,Home
4,0,4,35.60559,139.515227,2024-10-16 21:55:40+00:00,2024-10-17 03:55:20+00:00,52496.0,2,Work


In [20]:
trip_df.head()

Unnamed: 0,agent,id,start_timestamp,origin_latitude,origin_longitude,stop_timestamp,destination_latitude,destination_longitude
0,0,0,2024-10-15 20:25:00+00:00,35.551188,139.568407,2024-10-15 20:30:40+00:00,35.523382,139.590033
1,0,1,2024-10-15 20:53:00+00:00,35.523382,139.590033,2024-10-15 21:04:50+00:00,35.60559,139.515227
2,0,2,2024-10-16 13:55:40+00:00,35.60559,139.515227,2024-10-16 14:03:40+00:00,35.551188,139.568407
3,0,3,2024-10-16 21:47:40+00:00,35.551188,139.568407,2024-10-16 21:55:40+00:00,35.60559,139.515227
4,0,4,2024-10-17 03:55:20+00:00,35.60559,139.515227,2024-10-17 03:57:40+00:00,35.603707,139.509709
