In [4]:
import pandas as pd
# Column labels applied to the tab-separated check-in file. They are passed
# via `names=`, so pandas reads the first line of the file as data.
DF_COLUMNS = [
    'User ID', 'Venue ID', 'Venue Category ID', 'Venue Category Name',
    'Latitude', 'Longitude', 'Timezone Offset', 'UTC Time',
]

# Load the raw check-ins, order them by user and drop exact duplicate rows.
df = (
    pd.read_csv(
        'data/FNYC/raw_files/dataset_tsmc2014/dataset_TSMC2014_NYC.txt',
        sep='\t',
        encoding='latin-1',
        names=DF_COLUMNS,
    )
    .sort_values(by=['User ID'])
    .drop_duplicates()
)

# Number of remaining check-in records
df.shape[0]

227178

In [7]:

def collect_unique_venue_locations(dataframe):
    """
    Collect all different locations assigned to each unique venue across all records.

    The distinct (Venue ID, Latitude, Longitude) rows are extracted first and
    only then grouped by venue. This avoids ``groupby(...).apply`` operating on
    the grouping column, which pandas has deprecated, and it returns an empty
    dict for an empty dataframe.

    Args:
        dataframe (pd.DataFrame): The input dataframe with venue check-in data.
            Must contain the columns 'Venue ID', 'Latitude' and 'Longitude'.

    Returns:
        dict: A dictionary where keys are Venue IDs and values are lists of
            unique locations ([latitude, longitude]), in order of first appearance.
    """
    coordinate_columns = ['Latitude', 'Longitude']

    # De-duplicating on (venue, lat, lon) keeps the first occurrence of each
    # location per venue, which matches a per-group drop_duplicates().
    unique_rows = dataframe[['Venue ID'] + coordinate_columns].drop_duplicates()

    venue_locations = {
        venue_id: rows[coordinate_columns].values.tolist()
        for venue_id, rows in unique_rows.groupby('Venue ID')
    }
    return venue_locations

# Map every venue in the check-in table to its distinct coordinates
unique_venue_locations = collect_unique_venue_locations(df)

# Keep only the venues recorded at two or more distinct coordinates
venues_with_multiple_locations = dict(
    (venue, locations)
    for venue, locations in unique_venue_locations.items()
    if len(locations) >= 2
)

print(len(venues_with_multiple_locations))

3484


  .apply(lambda group: group[['Latitude', 'Longitude']].drop_duplicates().reset_index(drop=True).values.tolist())


In [14]:
from itertools import combinations
from math import radians, sin, cos, sqrt, atan2

def collect_unique_venue_locations(dataframe):
    """
    Collect all different locations assigned to each unique venue across all records.

    The distinct (Venue ID, Latitude, Longitude) rows are extracted first and
    only then grouped by venue. This avoids ``groupby(...).apply`` operating on
    the grouping column, which pandas has deprecated, and it returns an empty
    dict for an empty dataframe.

    Args:
        dataframe (pd.DataFrame): The input dataframe with venue check-in data.
            Must contain the columns 'Venue ID', 'Latitude' and 'Longitude'.

    Returns:
        dict: A dictionary where keys are Venue IDs and values are lists of
            unique locations ([latitude, longitude]), in order of first appearance.
    """
    coordinate_columns = ['Latitude', 'Longitude']

    # De-duplicating on (venue, lat, lon) keeps the first occurrence of each
    # location per venue, which matches a per-group drop_duplicates().
    unique_rows = dataframe[['Venue ID'] + coordinate_columns].drop_duplicates()

    venue_locations = {
        venue_id: rows[coordinate_columns].values.tolist()
        for venue_id, rows in unique_rows.groupby('Venue ID')
    }
    return venue_locations

def haversine_distance(loc1, loc2):
    """
    Calculate the great-circle distance between two points on the Earth using the Haversine formula.

    Args:
        loc1 (tuple): (latitude, longitude) of the first location in decimal degrees.
        loc2 (tuple): (latitude, longitude) of the second location in decimal degrees.

    Returns:
        float: Distance between the two points in meters (the kilometer result
            is multiplied by 1000). NaN coordinates yield NaN.
    """
    # Radius of the Earth in kilometers
    R = 6371.0

    # Convert latitude and longitude from degrees to radians
    lat1, lon1 = radians(loc1[0]), radians(loc1[1])
    lat2, lon2 = radians(loc2[0]), radians(loc2[1])

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Mathematically a lies in [0, 1], but rounding can push it marginally
    # outside that range (e.g. near-antipodal points), which would make sqrt()
    # raise a domain error. Comparisons are used instead of min()/max() so that
    # a NaN value is left untouched and propagates to the result.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    c = 2 * atan2(sqrt(a), sqrt(1 - a))

    # Distance: kilometers converted to meters
    return R * c * 1000

def calculate_longest_distance(venue_locations):
    """
    Find, for every venue, the largest pairwise great-circle distance among its locations.

    Args:
        venue_locations (dict): Maps each Venue ID to a list of its unique
            locations (latitude, longitude).

    Returns:
        dict: Maps each Venue ID to the maximum distance returned by
            haversine_distance over all pairs of its locations, or 0 when the
            venue has fewer than two locations.
    """
    return {
        venue_id: max(
            (
                haversine_distance(first, second)
                for first, second in combinations(coords, 2)
            ),
            # No pairs exist for venues with fewer than two locations
            default=0,
        )
        for venue_id, coords in venue_locations.items()
    }

def get_sorted_venues_by_distance(longest_distances):
    """
    Rank venues from the largest to the smallest longest-distance value.

    Args:
        longest_distances (dict): Maps each Venue ID to its longest distance value.

    Returns:
        list: (Venue ID, Longest Distance) tuples in descending order of distance.
            Venues with equal distances keep their order from the input dictionary.
    """
    ranked = list(longest_distances.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

# Pipeline: distinct locations per venue -> longest pairwise distance -> ranking
unique_venue_locations = collect_unique_venue_locations(dataframe=df)
longest_distances = calculate_longest_distance(venue_locations=unique_venue_locations)
sorted_venues = get_sorted_venues_by_distance(longest_distances=longest_distances)

  .apply(lambda group: group[['Latitude', 'Longitude']].drop_duplicates().values.tolist())


In [15]:
# Print the ten venues whose recorded locations lie furthest apart.
# Slicing replaces the manual counter/break and does not leave a stray
# counter variable behind in the kernel namespace.
for venue, distance in sorted_venues[:10]:
    print(venue, distance)

4ddad40bd22d4dbc8c0d4f91 20875.184303474245
4be2144d21d5a59302ca1511 19864.869301420516
4f386335e4b08f009a8525de 16384.124820247493
4c7170dafa49a1cd60e6a8e3 15937.920618432663
4a8c0960f964a520e50c20e3 14393.90774926774
4f1708fae4b0044a28cbe14f 11763.24908652051
4e0e0b3caeb7a5b33ee5dac1 7748.079960620895
4e7b9f93b61c001c6b38f13f 7233.508154016825
4e51dcc76284416669b03aec 6345.832450303119
4dc7d9d81f6ef43b8a4e609a 5353.036771362292


In [1]:
# Run the training script through the shell; its console output is recorded below.
!python train.py

Dataset already downloaded!
Dataset already extracted!
Dateset statistics before filtering:
Number of users: 1083, with min = 100, max = 2693, and avg: 209.76731301939057
Number of venues: 38333, with min = 1, max = 1145, and avg: 5.926434142905591
Number of venue categories: 400
Dateset statistics after filtering:
Number of users: 1081, with min = 20, max = 2511, and avg: 136.6096207215541
Number of venues: 5128, with min = 10, max = 1145, and avg: 28.797776911076443
Number of venue categories: 320
Spatial graph sparsity: 0.9934740076810561
Temporal graph sparsity: 0.9182665190772997
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name          | Type             | Params | Mode 
------------------------------------------------------------
0  | user_emb      | Embedding        | 553 K  | train
1  | poi_emb       | Embedding        | 2.6 M  | train
2  | emb_dropout   | Dr

In [1]:
from src.dataset import FoursquareNYC
from src.models import TrajLSTM

# Create the Foursquare NYC dataset object with a single worker.
ds = FoursquareNYC(num_workers=1)
# Run setup for the 'fit' stage, then for the 'test' stage.
# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'exit' is not defined" — presumably raised from inside
# setup(); confirm and remove the offending call in src.dataset.
ds.setup('fit')
ds.setup('test')
# Model construction is currently disabled; it would pass the user and POI
# counts from ds.STATS to TrajLSTM.
# model = TrajLSTM(num_user=ds.STATS['num_user'],
#                  num_pois=ds.STATS['num_pois'])

Dataset already downloaded!
Dataset already extracted!
Dateset statistics before filtering:
Number of records: 227178
Number of users: 1083, with min = 100, max = 2693, and avg: 209.76731301939057
Number of venues: 38333, with min = 1, max = 1145, and avg: 5.926434142905591
Number of venue categories: 400
Dateset statistics after filtering:
Number of records: 147675
Number of users: 1081, with min = 20, max = 2511, and avg: 136.6096207215541
Number of venues: 5128, with min = 10, max = 1145, and avg: 28.797776911076443
Number of venue categories: 320
1796 19864.869301420516
3745 16384.124820247493
3469 15937.920618432663
3973 14393.90774926774
3327 7233.508154016825
1 0
2 0
3 0
4 0
5 0
num venues with more than 200 min check-ins 0


NameError: name 'exit' is not defined

In [2]:
# Display the dataset statistics mapping exposed as ds.STATS.
ds.STATS

{'num_user': 1081,
 'num_pois': 5128,
 'num_poi_cat': 320,
 'num_time_slots': 56,
 'num_gh_P5': 115,
 'num_gh_P6': 1175,
 'num_gh_P7': 3548}

In [2]:
# Inspect the first row of ds.user_train_trajectories (judging by the recorded
# output, one row per user with list-valued fields — confirm).
ds.user_train_trajectories.iloc[0]

User ID                                                              1
Venue ID             [2, 15, 12, 13, 23, 16, 14, 20, 7, 7, 13, 25, ...
Venue Category ID    [2, 14, 12, 10, 20, 15, 13, 17, 7, 7, 10, 6, 1...
Geohash P5 ID        [1, 2, 3, 3, 2, 2, 2, 2, 2, 2, 3, 4, 3, 5, 6, ...
Geohash P6 ID        [1, 2, 3, 3, 4, 5, 6, 7, 7, 7, 3, 8, 9, 10, 11...
Geohash P7 ID        [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 4, 10, 11, 12, ...
Local Time           [2012-04-07 13:42:24, 2012-04-08 14:20:29, 201...
Time Slot            [45, 53, 5, 29, 40, 41, 45, 47, 47, 47, 30, 31...
Unix Timestamp       [1333806144, 1333894829, 1333974052, 133423676...
Name: 0, dtype: object

In [None]:

# Create the train and test dataloaders from the dataset object.
dl_train = ds.train_dataloader()
dl_test = ds.test_dataloader()
# Pull one batch from the training loader; it unpacks into three items.
x, y, lens = next(iter(dl_train))

# NOTE(review): no statement in this cell prints anything, so the recorded
# torch.Size output below looks stale — confirm after re-running.
# model.validation_step(batch)

torch.Size([32, 63])


In [None]:
import ipywidgets as widgets
import numpy as np
import torch
from IPython.display import display
# Output widget; not referenced by the live code in this cell.
widget = widgets.Output()

# Run dataset setup for both stages, build the loaders and take one training batch.
ds.setup(stage='fit')
ds.setup(stage='test')
dl_train = ds.train_dataloader()
dl_test = ds.test_dataloader()
x, y, lens = next(iter(dl_train))

# The first two entries of x are used as user ids and POI sequences.
user_ids, pois = x[0], x[1]
seq_len = pois.size(1)

# Boolean mask: True where the position index is below the matching entry of `lens`.
mask = torch.arange(seq_len)[None, :] < lens[:, None]
# Repeat each user id along dimension 1 so it has one entry per sequence position.
expanded_user_ids = user_ids[:, None].repeat(1, seq_len)

print(lens)
print(expanded_user_ids.shape)
print(pois.shape)
# User ids with positions outside the mask zeroed
print(expanded_user_ids * mask)
