In [50]:
import polars as pl 
import numpy as np
import torch
import implicit # For BPR

In [65]:
# Expected number of rows in the train / validation / test splits.
train_len, val_len, test_len = 868_798, 113_979, 113_979
# Expected number of rows across all three splits combined.
total_len = sum((train_len, val_len, test_len))

## Data Loading

In [103]:
def get_train_valid_eval():
    """
    Builds the leave-last-two-out split from data/sorted_reviews.parquet.

    Every row is tagged with idx, its row position in the scanned file, before
    splitting. The split depends on the file's row order: the last two rows of
    each mapped_user_id group are held out.
    NOTE(review): presumably the file is ordered chronologically within each
    user so that "last" means "most recent" -- confirm against whatever
    produced the parquet file.

    Returns three polars LazyFrames (nothing is materialized for the caller):
    - X_train: All interactions except the last two per user
    - y_valid: Second to last interaction per user
    - y_test: Last interaction per user

    Raises AssertionError when a split's row count differs from the
    module-level total_len / train_len / val_len / test_len constants.
    """
    # Columns removed up front; each name is listed exactly once.
    exclude_cols = ["hours", "products", "found_funny", "text"]
    full = (
        pl.scan_parquet("data/sorted_reviews.parquet")
        .drop(exclude_cols)
        .with_row_index("idx")
    )

    # Hold out the last two rows of every user; the anti-join on idx keeps
    # everything that was not held out.
    y = full.group_by("mapped_user_id").tail(2)
    X_train = full.join(y, on="idx", how="anti")

    # Within the held-out pair, the first row is the validation target and the
    # last row is the test target. A user with a single interaction would end
    # up in both frames; the size checks below would then no longer add up.
    y_valid = y.group_by("mapped_user_id").head(1)
    y_test = y.group_by("mapped_user_id").tail(1)

    # Each check executes the lazy query once; report which split is off.
    for name, frame, expected in (
        ("full", full, total_len),
        ("X_train", X_train, train_len),
        ("y_valid", y_valid, val_len),
        ("y_test", y_test, test_len),
    ):
        actual = len(frame.collect())
        assert expected == actual, f"{name}: expected {expected} rows, got {actual}"

    return X_train, y_valid, y_test

In [104]:
def format_train_eval_data(df):
    """
    Formats the data to have exactly the columns: idx, user_id, product_id

    mapped_user_id and mapped_product_id are exposed under the shorter names
    user_id and product_id; every other column of df is discarded.
    """
    # A single select both renames and projects, so no column has to be
    # dropped by name and the input only needs the three columns read here.
    return df.select(
        pl.col("idx"),
        pl.col("mapped_user_id").alias("user_id"),
        pl.col("mapped_product_id").alias("product_id"),
    )

In [71]:
def get_clean_train_valid_eval():
    """
    Loads the three splits and reduces each one to idx, user_id, product_id
    - X_train: All interactions except the last two per user
    - y_valid: Second to last interaction per user
    - y_test: Last interaction per user

    Asserts that the three formatted splits together contain total_len rows.
    """
    formatted = tuple(
        format_train_eval_data(split) for split in get_train_valid_eval()
    )
    combined_rows = sum(len(split.collect()) for split in formatted)
    assert total_len == combined_rows
    X_train, y_valid, y_test = formatted
    return X_train, y_valid, y_test

In [106]:
def join_train_valid(X_train, y_valid):
    """
    Stacks the validation rows under the training rows (vertical concat) and
    orders the result by idx
    - X_test: All interactions except the last one per user

    Asserts that the result contains train_len + val_len rows.
    """
    stacked = pl.concat([X_train, y_valid], how="vertical")
    X_test = stacked.sort("idx")
    expected_rows = train_len + val_len
    assert expected_rows == len(X_test.collect())
    return X_test

## Experiments

### Experiment 1: Popularity Baseline

In [130]:
def get_k_most_popular_items(X, k=50):
    """
    Returns the k most popular items in X

    Popularity is the number of interactions recorded for a product_id.
    Items with equal counts are ordered by ascending product_id so that the
    selection is reproducible from run to run.

    Returns a frame with the columns product_id and count, most popular first.
    """
    interaction_counts = X.group_by("product_id").agg(
        pl.count("product_id").alias("count")
    )
    return (
        interaction_counts
        # group_by emits groups in no guaranteed order, so sorting on count
        # alone would let ties at the cut-off pick different items each run.
        .sort(["count", "product_id"], descending=[True, False])
        .head(k)
    )

In [136]:
def predict_based_on_item_popularity(X_train, y_valid, y_test, k=50):
    """
    Recommends the same k most popular items to every user.

    Popularity is measured on train + validation. The result is the subset of
    y_test rows whose product_id is one of those k items (inner join), with the
    item's count column attached.
    """
    history = join_train_valid(X_train, y_valid)
    top_k_items = get_k_most_popular_items(history, k)
    return y_test.join(top_k_items, on="product_id", how="inner")

In [170]:
def popularity_baseline(k_vals = (3, 5, 10, 25, 50)):
    """
    Evaluates the popularity recommender at several cut-offs.

    For every k in k_vals:
    - hits_at_k: share of y_test rows whose product is among the k most
      popular items of train + validation
    - precision_at_k: hits_at_k divided by k

    k_vals may be any iterable of positive integers.
    Returns a polars DataFrame with the columns k, hits_at_k, precision_at_k.
    """
    X_train, y_valid, y_test = get_clean_train_valid_eval()
    k_vals = list(k_vals)

    # y_test does not depend on k, so execute its lazy query only once.
    number_of_users = len(y_test.collect())

    hits_at_k = []
    precision_at_k = []

    for k in k_vals:
        y_pred = predict_based_on_item_popularity(X_train, y_valid, y_test, k = k)
        hits = len(y_pred.join(y_test, on=["user_id", "product_id"]).collect())

        hits_at_k.append(hits / number_of_users)

        # Number of hits divided by number of predictions (k per user)
        precision_at_k.append(hits / k / number_of_users)

    results = pl.DataFrame(
        {
            "k": k_vals,
            "hits_at_k": hits_at_k,
            "precision_at_k": precision_at_k
        }
    )

    return results

In [171]:
# Evaluate the popularity baseline at its default cut-offs and print the results table.
print(popularity_baseline())

shape: (5, 3)
┌─────┬───────────┬────────────────┐
│ k   ┆ hits_at_k ┆ precision_at_k │
│ --- ┆ ---       ┆ ---            │
│ i64 ┆ f64       ┆ f64            │
╞═════╪═══════════╪════════════════╡
│ 3   ┆ 0.027584  ┆ 0.009195       │
│ 5   ┆ 0.035945  ┆ 0.007189       │
│ 10  ┆ 0.056581  ┆ 0.005658       │
│ 25  ┆ 0.111652  ┆ 0.004466       │
│ 50  ┆ 0.166776  ┆ 0.003336       │
└─────┴───────────┴────────────────┘


### Experiment 2: Item2Vec


### Experiment 3: Bayesian Personalized Ranking