In [37]:
from model.recommender import UserKNN, MatrixFactorization
from model import user
from datetime import timedelta
import glob
import tqdm
import os
import pandas as pd
import numpy as np

def filtering_training_data(training_dataset, min_activity):
    """Drop the rows of users that are not active enough.

    Activity is measured as the number of rows a user has in
    ``training_dataset`` (every row counts, including repeated visits to the
    same venue).

    Args:
        training_dataset: DataFrame with a ``uid`` column.
        min_activity: Minimum number of rows a user must have to be kept.

    Returns:
        The subset of ``training_dataset`` (original index preserved) holding
        only rows of users with at least ``min_activity`` rows.
    """
    uid_column = training_dataset['uid']
    # Attach to every row the total number of rows its user has.
    rows_per_user = uid_column.map(uid_column.value_counts())
    # Rows whose uid is missing get NaN here and are therefore dropped.
    return training_dataset[rows_per_user >= min_activity]

def create_interaction_matrix(data):
    """Build the user x venue visit-count matrix.

    Args:
        data: DataFrame with ``uid`` and ``venueID`` columns, one row per visit.

    Returns:
        DataFrame indexed by ``uid`` with one column per ``venueID``; each cell
        holds how many rows ``data`` contains for that (user, venue) pair, and
        0 for pairs that never occur.
    """
    # Number of rows for each (uid, venueID) pair that occurs in the data.
    visits_per_pair = data.groupby(['uid', 'venueID']).size()
    # Spread venues across the columns; unseen pairs come out as NaN.
    wide_counts = visits_per_pair.unstack('venueID')
    return wide_counts.fillna(0)


def calculate_metrics(recommended_venues, real_visited_venues):
    """Score one ranked recommendation list against the venues actually visited.

    The cutoff K of every metric is the length of ``recommended_venues``.

    Args:
        recommended_venues: Iterable of venue ids, best recommendation first.
        real_visited_venues: Iterable of venue ids the user really visited
            (duplicates are ignored).

    Returns:
        dict with the keys ``'ndcg'``, ``'precision'``, ``'recall'``, ``'mrr'``
        and ``'hitrate'``. Every metric is 0 when it is undefined (empty
        recommendation list or no visited venues).
    """
    ranked = list(recommended_venues)
    relevant = set(real_visited_venues)

    # Binary relevance of each ranked position.
    hits = [1 if venue in relevant else 0 for venue in ranked]
    n_hits = sum(hits)

    precision = n_hits / len(ranked) if ranked else 0
    recall = n_hits / len(relevant) if relevant else 0

    # NDCG: the ideal ranking can place at most K relevant venues, so the ideal
    # DCG is truncated at the list length. Summing it over *all* relevant
    # venues would keep a perfect list below 1 whenever the user has more
    # relevant venues than there are recommendation slots.
    dcg = sum(hit / np.log2(rank + 1) for rank, hit in enumerate(hits, 1))
    ideal_length = min(len(relevant), len(ranked))
    idcg = sum(1 / np.log2(rank + 1) for rank in range(1, ideal_length + 1))
    ndcg = dcg / idcg if idcg > 0 else 0

    # MRR: reciprocal of the (1-based) rank of the first hit, 0 without hits.
    first_hit_rank = next((rank for rank, hit in enumerate(hits, 1) if hit), None)
    mrr = 1 / first_hit_rank if first_hit_rank is not None else 0

    # Hit rate: did at least one recommendation match?
    hit_rate = 1 if n_hits else 0

    return {
        'ndcg': ndcg,
        'precision': precision,
        'recall': recall,
        'mrr': mrr,
        'hitrate': hit_rate
    }
    

In [51]:
# DATASET HYPER-PARAMETERS #
# City whose visit log is loaded; it selects the CSV file below.
city = "chicago"
# NOTE(review): this globs the current working directory, not ./data/processed,
# and the result is not used in the visible cells - confirm it is still needed.
city_files = glob.glob('*_visits.csv')
data_file = f"./data/processed/{city}_visits.csv"
train_window = 60  # Training window in days
# NOTE(review): k_days, threshold and max_simulation_days are not referenced
# by the visible cells; presumably used by a simulation elsewhere - confirm.
k_days = 7  # Epoch length in days
threshold = 100  # Performance degradation threshold (adjust as needed)
max_simulation_days = 90  # Maximum number of simulation days
min_activity = 5  # Minimum number of training rows a user needs to be kept
topK = 20  # Number of recommendations requested per user at evaluation time

# RECSYS HYPER-PARAMETERS #
# MF
num_latent_factors = 32
# UserKNN (not instantiated in the visible cells)
num_nearest_neighbors = 5


In [26]:
# DATA LOADING
# Read the visit log and normalise the column types used by the rest of the notebook.
dataset = pd.read_csv(data_file)
print(f'Dataset has {len(dataset)} records.')
dataset['time'] = pd.to_datetime(dataset['time'])
dataset['uid'] = dataset['uid'].astype(int)
dataset['venueID'] = dataset['venueID'].astype(str)
# Drop rows that are exact duplicates across all columns
dataset = dataset.drop_duplicates()
print(f'Dataset has {len(dataset)} records after removing duplicates.')
# Set training period: the first `train_window` days after the earliest record
start_date = dataset['time'].min()
training_duration = timedelta(days=train_window)
# t is the train/test split instant (exclusive upper bound of the training period)
t = start_date + training_duration
# Training data: every record strictly before t
train_data = dataset[dataset['time'] < t]
# Filtering out users with fewer than `min_activity` training rows
# (rows are counted, so repeated visits to the same venue all count)
train_data = filtering_training_data(train_data, min_activity)
# Simulation data == test data: every record at or after t
test_data = dataset[dataset['time'] >= t]
# Excluding users not present in the (filtered) training dataset
test_data = test_data[test_data['uid'].isin(train_data['uid'].unique())]
# Excluding venues that never occur in the training dataset
test_data = test_data[test_data['venueID'].isin(train_data['venueID'].unique())]
# Prepare interaction matrix (users x venues visit counts) from the training data
X_train = create_interaction_matrix(train_data)


Dataset has 167702 records.
Dataset has 167641 records after removing duplicates.


In [27]:
# Instantiate the matrix-factorization recommender with the configured number
# of latent factors and fit it on the training interaction matrix.
recommender = MatrixFactorization(num_latent_factors=num_latent_factors)
recommender.fit(X_train)


In [28]:
# Keep only *new* (user, venue) visits in the test set: any test row whose
# user already visited that venue during training is removed.
print(f'Test dataset cardinality: {len(test_data)}')

# Build the 'uid-venueID' keys as standalone Series instead of temporary
# columns: train_data and test_data are slices of `dataset`, so assigning a
# column to them triggers SettingWithCopyWarning, and an interrupted cell would
# leave a stray 'key' column behind.
train_keys = set(train_data['uid'].astype(str) + '-' + train_data['venueID'])
test_keys = test_data['uid'].astype(str) + '-' + test_data['venueID']

# Filtering test_data to exclude rows with keys that are in train_keys
test_data = test_data[~test_keys.isin(train_keys)]
print(f'Test dataset cardinality: {len(test_data)}')


Test dataset cardinality: 56174
Test dataset cardinality: 29124


In [52]:
# Per-user evaluation of the fitted recommender on the held-out visits.
hitrate_list = list()
mrr_list = list()

# Group both frames once up front instead of re-scanning them with a boolean
# mask for every user inside the loop.
test_venues_by_user = {
    user_id: venues.values
    for user_id, venues in test_data.groupby('uid', sort=False)['venueID']
}
train_venues_by_user = {
    user_id: venues.unique()
    for user_id, venues in train_data.groupby('uid', sort=False)['venueID']
}
# Fallback for a user without training rows (same value the per-user mask would yield).
no_train_venues = np.array([], dtype=object)

for uid in tqdm.tqdm(test_data['uid'].unique(), 'Users'):
    # Get visited venueIDs
    test_venueIDs = test_venues_by_user[uid]
    # Create user object from the distinct venues the user visited during training
    # NOTE(review): the meaning of the third User argument (None) is not visible here - confirm.
    user_object = user.User(uid, pd.Series(data={'venueID': train_venues_by_user.get(uid, no_train_venues)}), None)
    recommended_venues = recommender.return_topK(user_object, topK)
    metrics_dict = calculate_metrics(recommended_venues, test_venueIDs)
    hitrate_list.append(metrics_dict['hitrate'])
    mrr_list.append(metrics_dict['mrr'])
    

Users: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1024/1024 [00:09<00:00, 111.35it/s]


In [53]:
# Mean per-user hit rate: the share of evaluated users for whom at least one
# recommended venue was actually visited in the test period.
np.mean(hitrate_list)


0.2626953125