In [1]:
import polars as pl 
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import implicit # For BPR
from scipy.sparse import coo_matrix, csr_matrix

  from .autonotebook import tqdm as notebook_tqdm


## Helpers

In [2]:
# Expected row counts of the train / validation / test splits; they are
# asserted against the parquet data when the splits are loaded.
train_len = 715613
val_len = 92112
test_len = 92112
total_len = train_len + val_len + test_len
assert total_len == 899837
num_users = 113_979
# This cell previously assigned `num_items = 8_609` and then immediately
# overwrote it with 64, so the first value was never visible to any later code.
# The two meanings now have their own names.
# NOTE(review): 8_609 is presumably the number of distinct products - confirm.
num_catalog_items = 8_609
k_vals = [3, 5, 10, 25, 50]
# Number of most-recent interactions kept per user when building sequences.
max_seq_len = 64
# Kept with its effective value (64) because later cells use `num_items` as a
# default argument for the per-user history length.
num_items = max_seq_len
SEED = 42

def build_results_df(model_name, k_vals, hits_at_k, precision_at_k):
    """
    Builds a long-format results table for a single model.

    The frame has one row per entry of k_vals and the columns
    model, k, hits_at_k and precision_at_k; the model name is repeated on every row.
    """
    model_column = [model_name] * len(k_vals)
    columns = {
        "model": model_column,
        "k": k_vals,
        "hits_at_k": hits_at_k,
        "precision_at_k": precision_at_k,
    }
    return pl.DataFrame(columns)

## Data Loading

In [30]:
def load_full_data():
    """
    Prints the full reviews dataset and returns it as a LazyFrame.

    data/reviews.parquet is scanned lazily and only materialised for the
    printed preview. No columns are dropped and no sorting is applied here.

    Returns:
        pl.LazyFrame over data/reviews.parquet (previously the function
        returned None despite documenting a return value).
    """
    df = pl.scan_parquet("data/reviews.parquet")
    print(df.collect())
    return df

def load_train_val_test_data():
    """
    Splits data/sorted_reviews.parquet into train / validation / test frames
    using a leave-last-two-out scheme per user.

    Every row receives an `idx` column holding its position in the file, so the
    splits can later be re-assembled in the original order.
    NOTE(review): the file is presumably pre-sorted by user and review date - confirm.

    Returns (all LazyFrames):
    - X_train: All interactions except the last two per user
    - y_valid: Second to last interaction per user
    - y_test: Last interaction per user

    Raises:
        AssertionError: if a split's row count differs from the module-level
        total_len / train_len / val_len / test_len constants.
    """
    exclude_cols = ["hours", "products", "found_funny", "text"]
    full = pl.scan_parquet("data/sorted_reviews.parquet").drop(exclude_cols).with_row_index("idx")

    # Last two rows of every user, in file order
    y = full.group_by("mapped_user_id").tail(2)
    X_train = full.join(y, on="idx", how="anti") # Up to two interactions from last per user

    y_valid = y.group_by("mapped_user_id").head(1) # 1 interaction from last per user
    y_test = y.group_by("mapped_user_id").tail(1) # Last interaction per user

    # A user with a single row would end up in both y_valid and y_test; the
    # size checks below fail in that case because the splits would no longer
    # add up to the full dataset.
    expected_rows = (
        ("full", full, total_len),
        ("X_train", X_train, train_len),
        ("y_valid", y_valid, val_len),
        ("y_test", y_test, test_len),
    )
    for split_name, frame, expected in expected_rows:
        # Count inside the query instead of materialising every column
        actual = frame.select(pl.len()).collect().item()
        assert actual == expected, f"{split_name}: expected {expected} rows, found {actual}"

    return X_train, y_valid, y_test

def format_train_eval_data(df, cols_to_keep=("idx", "user_id", "product_id")):
    """
    Exposes the mapped ids under the names user_id / product_id and keeps only
    the requested columns.

    Args:
        df: frame containing at least mapped_user_id, mapped_product_id and
            every other column named in cols_to_keep.
        cols_to_keep: output columns, in output order. Defaults to
            idx, user_id, product_id.

    Returns:
        The frame restricted to cols_to_keep.
    """
    # The default is an immutable tuple; polars receives a fresh list on every call.
    selected = list(cols_to_keep)
    return (
        df
        # with_columns (rather than rename) so that a pre-existing user_id /
        # product_id column is overwritten by the mapped ids.
        .with_columns(
            pl.col("mapped_user_id").alias("user_id"),
            pl.col("mapped_product_id").alias("product_id"),
        )
        # select already discards everything that was not requested; the former
        # explicit drop made it impossible to keep e.g. review_date.
        .select(selected)
    )

def get_clean_train_valid_eval():
    """
    Loads the three splits and formats each one for training and evaluation
    - X_train: All interactions except the last two per user
    - y_valid: Second to last interaction per user
    - y_test: Last interaction per user

    Asserts that the formatted splits together contain total_len rows.
    """
    raw_splits = load_train_val_test_data()
    X_train, y_valid, y_test = (format_train_eval_data(split) for split in raw_splits)

    row_counts = [len(split.collect()) for split in (X_train, y_valid, y_test)]
    assert total_len == sum(row_counts)

    return X_train, y_valid, y_test

def join_x_y(X, y):
    """
    Stacks the rows of y underneath X and re-sorts the result by idx

    Used to extend a history frame with held-out interactions (e.g. train +
    validation). Asserts that the result has exactly as many rows as both
    inputs together.
    """
    stacked = pl.concat([X, y], how="vertical")
    X_join = stacked.sort("idx")

    expected_rows = len(X.collect()) + len(y.collect())
    assert len(X_join.collect()) == expected_rows
    return X_join

## Baseline Experiments

### Experiment 1: Popularity

In [26]:
def get_k_most_popular_items(X, k=50): 
    """
    Returns the k most popular items in X

    Popularity is the number of rows per product_id. The result has the
    columns product_id and count, ordered from most to least popular.
    Items with equal counts are ordered by ascending product_id so that the
    selected top-k is reproducible between runs (group_by gives no ordering
    guarantee on its own).
    """
    popular_items = (
        X
        .group_by("product_id")
        .agg(pl.count("product_id").alias("count"))
        .sort(["count", "product_id"], descending=[True, False])
        .head(k)
    )
    return popular_items
    
def predict_based_on_item_popularity(X_train, y_valid, y_test, k=50): 
    """
    Keeps the rows of y_test whose product is one of the k most popular items

    Popularity is measured on X_train and y_valid combined. Because every user
    is recommended the same top-k list, the returned frame contains only the
    test interactions that this list covers, with the popularity count attached.
    """
    history = join_x_y(X_train, y_valid)
    top_k_items = get_k_most_popular_items(history, k)
    return y_test.join(top_k_items, on="product_id")

def popularity_baseline(k_vals = k_vals):
    """
    Evaluates the popularity recommender for every k in k_vals

    For each k:
    - hits_at_k is the share of test users whose held-out item is among the
      k most popular items
    - precision_at_k is hits_at_k divided by k

    Returns a DataFrame with the columns model, k, hits_at_k, precision_at_k
    """
    X_train, y_valid, y_test = get_clean_train_valid_eval()

    # Does not depend on k, so it is materialised once instead of once per iteration
    number_of_users = len(y_test.collect())

    hits_at_k = []
    precision_at_k = []

    for k in k_vals:
        y_pred = predict_based_on_item_popularity(X_train, y_valid, y_test, k = k)
        hits = len(y_pred.join(y_test, on=["user_id", "product_id"]).collect())

        hits_at_k.append(hits / number_of_users)

        # Number of hits divided by number of predictions (k per user)
        precision_at_k.append(hits / k / number_of_users)

    results = build_results_df("popularity", k_vals, hits_at_k, precision_at_k)

    return results

In [27]:
# Run the popularity baseline with the default k values and save its metrics table as CSV.
# NOTE(review): assumes the results/ directory already exists - confirm before a fresh run.
popularity_baseline().write_csv("results/popularity_baseline.csv")

### Experiment 2: Markov Chain 

In [28]:
def get_transition_matrix_from_markov_chain(X):
    """
    Builds a transition matrix from transition counts
    OLD Method, stopped using it since it cannot be Lazy

    X is expected to provide the columns prev_product_id, product_id and
    transitions. The result is a wide frame with one row per next_product_id
    and one column per prev_product_id, handed back as a LazyFrame.
    """
    return (
            # pivot needs an eager DataFrame, hence the collect
            X.collect()
            .pivot(
                on="prev_product_id",
                index="product_id",
                values="transitions",
                aggregate_function="sum"
            )
            # Pairs that never occurred become 0 instead of null
            .fill_null(0)
            .rename({"product_id": "next_product_id"})
            # Intended to divide every count column by its column total.
            # NOTE(review): relies on how polars expands pl.all() together with
            # exclude() on both sides of the division - verify on the installed version.
            .with_columns(pl.all().exclude("next_product_id") / pl.all().sum())
            .lazy()
        )

def get_matrix_shifted_by_3(X):
    """
    Adds the product and user of the 1st, 2nd and 3rd preceding row to every row

    New columns: prev_{1,2,3}_product_id and prev_{1,2,3}_user_id. Only rows
    that have a row three positions earlier belonging to the same user are kept.
    The shift runs over the whole frame, so this assumes each user's rows are
    contiguous and in chronological order.
    """
    lagged_columns = [
        pl.col(source).shift(lag).alias(f"prev_{lag}_{source}")
        for lag in (1, 2, 3)
        for source in ("product_id", "user_id")
    ]
    has_third_predecessor = pl.col("prev_3_product_id").is_not_null()
    same_user_three_back = pl.col("user_id") == pl.col("prev_3_user_id")

    return (
        X
        .with_columns(lagged_columns)
        .filter(has_third_predecessor)
        .filter(same_user_three_back)
    )

def get_previous_n_transitions(X, n=1):
    """
    Counts how often each product follows the product seen n steps earlier

    The count of every (prev_{n}_product_id, product_id) pair is divided by n,
    so transitions from further back carry less weight. The lag column is
    returned under the generic name prev_product_id.
    """
    lag_column = f"prev_{n}_product_id"
    pair_counts = (
        X
        .group_by([lag_column, "product_id"])
        .agg(pl.len().alias("transitions"))
    )
    weighted_counts = pair_counts.with_columns([pl.col("transitions") / n])
    return weighted_counts.rename({lag_column: "prev_product_id"})

def get_3rd_order_markov_chain(X):
    """
    Returns the transition probabilities for a 3rd order Markov chain

    Transition counts for lags 1, 2 and 3 (each weighted by 1 / lag) are
    summed per (previous product, next product) pair and then normalised so
    that the probabilities of every previous product add up to 1.

    Returns a frame with the columns
    - product_id: the previous product
    - next_product_id: the product that followed it
    - probability: weighted share of that transition

    Each (product_id, next_product_id) pair appears exactly once.
    """
    X_shifted = get_matrix_shifted_by_3(X)

    transition_counts = pl.concat(
        [get_previous_n_transitions(X_shifted, n=lag) for lag in (1, 2, 3)]
    )

    transition_probs = (
        transition_counts
        # The same pair can come out of each lag table. Without merging them a
        # next product could take several of a user's top-k slots and be
        # counted as more than one hit.
        .group_by(["prev_product_id", "product_id"])
        .agg(pl.col("transitions").sum())
        .with_columns(pl.col("transitions") / pl.col("transitions").sum().over("prev_product_id"))
        .rename({
            "transitions": "probability",
            "product_id": "next_product_id",
            "prev_product_id": "product_id",
            })
    )

    return transition_probs

def predict_based_on_markov_chain(y, transition_matrix, k=50):
    """
    Recommends up to k next items per user from the transition matrix

    Each row of y is matched (left join on product_id) with the transitions
    leaving its product; per user the k most probable candidates are kept.
    Returns one row per recommendation with the columns user_id, product_id
    (the recommended item) and probability.
    """
    candidates = y.drop("idx").join(transition_matrix, on="product_id", how="left")

    # Candidate (item, probability) pairs of a user, best first, cut to k
    top_k_per_user = (
        pl.struct(["next_product_id", "probability"])
        .sort_by("probability", descending=True)
        .slice(0, k)
        .alias("top_predictions")
    )
    ranked = pl.col("top_predictions")

    y_pred = (
        candidates
        .group_by("user_id")
        .agg([top_k_per_user])
        .explode("top_predictions")
        .select(
            "user_id",
            ranked.struct.field("next_product_id").alias("product_id"),
            ranked.struct.field("probability").alias("probability"),
        )
    )
    return y_pred

def markov_chain_experiment(k_vals = k_vals):
    """
    Evaluates the Markov chain recommender for every k in k_vals

    The transition matrix is fitted on train + validation interactions; the
    validation item of each user is the starting point for predicting the
    held-out test item.

    For each k:
    - hits_at_k is the share of test users whose held-out item is among
      their k recommendations
    - precision_at_k is hits_at_k divided by k

    Returns a DataFrame with the columns model, k, hits_at_k, precision_at_k
    """
    X_train, y_valid, y_test = get_clean_train_valid_eval()
    X_test = join_x_y(X_train, y_valid)

    transition_matrix = get_3rd_order_markov_chain(X_test)

    # Does not depend on k, so it is materialised once instead of once per iteration
    number_of_users = len(y_test.collect())

    hits_at_k = []
    precision_at_k = []

    for k in k_vals:
        y_pred = predict_based_on_markov_chain(y_valid, transition_matrix, k = k)
        hits = len(
            y_pred
            # A user can score at most one hit, even if the same item was
            # recommended to them more than once
            .unique(subset=["user_id", "product_id"])
            .join(y_test, on=["user_id", "product_id"])
            .collect()
        )

        hits_at_k.append(hits / number_of_users)
        precision_at_k.append(hits / k / number_of_users)

    results = build_results_df("Markov Chain", k_vals=k_vals, hits_at_k=hits_at_k, precision_at_k=precision_at_k)

    return results

In [29]:
# Run the Markov chain experiment with the default k values and save its metrics table as CSV.
# NOTE(review): assumes the results/ directory already exists - confirm before a fresh run.
markov_chain_experiment().write_csv("results/markov_chain.csv")

## Transformer Experiments

### Dataset

In [52]:
def get_data(num_items=num_items):
    """
    Builds the interaction histories used by the transformer experiments

    - train: train + validation interactions
    - test: train + validation + test interactions
    Both are cut to the last num_items interactions of every user and keep the
    column order of the formatted training split.
    """
    X_train, y_train, y_test = get_clean_train_valid_eval()
    column_ordering = X_train.collect_schema().names()

    train = join_x_y(X_train, y_train)
    test = join_x_y(train, y_test)

    assert len(train.collect()) + len(y_test.collect()) == total_len
    assert len(test.collect()) == total_len

    def keep_latest(frame):
        # Most recent num_items rows per user, columns back in their original order
        return frame.group_by("user_id").tail(num_items).select(column_ordering)

    return keep_latest(train), keep_latest(test)

class SequentialRecommenderDataset(Dataset):
    """
    Map-style dataset with one sample per user for next-item prediction

    Each sample is built from the user's last num_items product ids. Indexing
    returns the pair (x, y), where y is x shifted one step into the future.

    Sequences are not padded: a user with fewer than num_items interactions
    yields shorter tensors.
    NOTE(review): batching samples of different length presumably needs padding
    or a custom collate_fn in the DataLoader - confirm against the training loop.
    """

    def __init__(self, data, num_items=num_items):
        """
        Args:
            data: LazyFrame with the columns user_id and product_id, rows of
                each user in chronological order.
            num_items: maximum number of most recent interactions kept per user.
        """
        self.num_items = num_items

        # maintain_order keeps users in order of first appearance, so a given
        # index refers to the same user on every run
        data_lists = (
            data
            .group_by("user_id", maintain_order=True)
            .agg(
                pl.col("product_id")
                .tail(num_items)
            )
            .collect()
        )
        # One aggregated row per user
        self.num_users = data_lists.height

        # One 1-tuple per user holding that user's item sequence
        self.samples = [
            (
                torch.tensor(product_list, dtype=torch.long),
            )
            for product_list in data_lists["product_id"].to_list()
        ]

        assert len(self.samples) == self.num_users


    def __len__(self):
        """Number of users (one sample per user)."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Returns the sequence of items and sequence of items shifted by 1 to predict

        x holds all items except the last, y all items except the first.
        """
        # self.samples is a plain list of 1-tuples, so it cannot be indexed
        # with [idx, :-1]; take the tensor out first and slice that.
        (sequence,) = self.samples[idx]
        x = sequence[:-1]
        y = sequence[1:]
        return x, y

def get_transformer_datasets():
    """
    Returns the (train, test) SequentialRecommenderDataset pair built from get_data()
    """
    train_split, test_split = get_data()
    return (
        SequentialRecommenderDataset(train_split),
        SequentialRecommenderDataset(test_split),
    )

### Experiment 3: Decoder