In [43]:
import pandas as pd
import numpy as np
import polars as pl

In [2]:
def create_popularity_baseline_polars(train_df, test_df, seed=42, noise_scale=0.001):
    """
    Create a popularity-based baseline model for MIND dataset using Polars.

    Every candidate article in the test set is scored by the number of times
    it was clicked in the training set. Candidates with no recorded training
    click fall back to the global click-through rate of the training set.
    A small uniform random noise term is added to every score so that ties
    are broken.

    Args:
        train_df: Polars DataFrame with training data. Must contain an
            "impressions" column of space-separated "<news_id>-<label>"
            tokens, where a label of "1" marks a click.
        test_df: Polars DataFrame with test data. Must contain
            "impression_id" and "impressions" columns; only the part of each
            token before the first "-" is used.
        seed: Seed for the tie-breaking noise. The default reproduces the
            stream of ``np.random.seed(42); np.random.random(n)``. Passing
            ``None`` makes NumPy pick a non-deterministic seed.
        noise_scale: Upper bound of the uniform noise added to each score.

    Returns:
        Polars DataFrame with columns "impression_id", "news_id" and "score",
        one row per candidate article of the test set.
    """
    # One row per impression token.
    train_tokens = (
        train_df.select(pl.col("impressions").str.split(" "))
        .explode("impressions")
    )

    # Parse each "<news_id>-<label>" token once; the same expression is reused
    # for the test set below so both sides extract news ids identically.
    token_parts = pl.col("impressions").str.split("-")
    train_clicked = train_tokens.select(
        token_parts.list.get(0).alias("news_id"),
        token_parts.list.get(1).alias("click"),
    ).filter(pl.col("click") == "1")

    # Number of training clicks per article.
    article_clicks = (
        train_clicked.group_by("news_id").len().rename({"len": "clicks"})
    )

    # Global CTR = clicked tokens / all tokens. Both counts come from the same
    # parsed frame, so "click" has a single definition in this function.
    impressions_count = train_tokens.height
    clicks_count = train_clicked.height
    global_ctr = clicks_count / impressions_count if impressions_count > 0 else 0.1

    # One row per (impression, candidate article) in the test set.
    test_exploded = (
        test_df.select(
            pl.col("impression_id"),
            pl.col("impressions").str.split(" "),
        )
        .explode("impressions")
        .with_columns(token_parts.list.get(0).alias("news_id"))
    )

    # A local legacy generator yields the same values as seeding the global
    # NumPy RNG would, without clobbering global random state for the caller.
    rng = np.random.RandomState(seed)
    noise = pl.Series(
        "noise", rng.random_sample(test_exploded.height) * noise_scale
    )
    test_exploded = test_exploded.with_columns(noise)

    # NOTE: known articles are scored by raw click count while unknown ones
    # get the global CTR (a ratio); the scales differ, so only the resulting
    # ranking is meaningful, not the absolute score.
    predictions = (
        test_exploded.join(article_clicks, on="news_id", how="left")
        .with_columns(
            pl.when(pl.col("clicks").is_null())
            .then(global_ctr + pl.col("noise"))
            .otherwise(pl.col("clicks") + pl.col("noise"))
            .alias("score")
        )
        .select("impression_id", "news_id", "score")
    )

    return predictions

In [3]:
def normalize_scores(predictions_df):
    """Normalize scores to [0,1] range within each impression using Polars.

    Min-max normalization is applied per "impression_id" group. When all
    scores of an impression are equal (including impressions with a single
    candidate) the range is zero, so every score in that group is set to 0.5.

    Args:
        predictions_df: Polars DataFrame with at least the columns
            "impression_id" and "score".

    Returns:
        Polars DataFrame with the same columns and row order as the input,
        where "score" has been replaced by its normalized value.
    """
    score = pl.col("score")

    # Window expressions broadcast the per-impression min/max back onto every
    # row, so no self-join or temporary columns are needed and the input row
    # order is kept.
    group_min = score.min().over("impression_id")
    group_max = score.max().over("impression_id")

    return predictions_df.with_columns(
        pl.when(group_max > group_min)
        .then((score - group_min) / (group_max - group_min))
        .otherwise(pl.lit(0.5))
        .alias("score")
    )

In [15]:
# The behaviors files are read with has_header=False, so the column names are
# declared once here and applied to both splits at read time.
BEHAVIOR_COLUMNS = ['impression_id', 'user_id', 'timestamp', 'history', 'impressions']

behavior_polars_train = pl.read_csv(
    "data/MINDlarge_train/behaviors.tsv",
    separator='\t',
    has_header=False,
    new_columns=BEHAVIOR_COLUMNS,
)
behavior_polars_dev = pl.read_csv(
    "data/MINDlarge_dev/behaviors.tsv",
    separator='\t',
    has_header=False,
    new_columns=BEHAVIOR_COLUMNS,
)

In [16]:
# Score every dev-set candidate with the popularity baseline built from the training split.
baseline_predictions = create_popularity_baseline_polars(
    train_df=behavior_polars_train,
    test_df=behavior_polars_dev,
)


In [None]:
# Rescale the baseline scores to [0, 1] within each impression.
normalized_predictions = normalize_scores(predictions_df=baseline_predictions)

In [31]:
import importlib

# NOTE(review): this alias shadows the builtin `eval` for the rest of the
# notebook; later cells reference the module through this name.
import lib.eval as eval

# Re-import the module so edits to lib/eval.py are picked up without a kernel restart.
importlib.reload(eval)

<module 'lib.eval' from '/Users/mathiasraa/Desktop/ntnu/recommender-systems/lib/eval.py'>

In [32]:
# Evaluate the normalized baseline predictions against the dev-split behaviors.
results = eval.evaluate_mind_predictions(
    normalized_predictions,
    behavior_polars_dev,
)

In [36]:
# Render the evaluation metrics as a one-row table (row label 0).
pd.DataFrame(data=results, index=[0])

Unnamed: 0,auc,mrr,ndcg@5,ndcg@10
0,0.552853,0.261856,0.269713,0.332056


In [42]:
# Parse the dev-split timestamps with pandas and show them in ascending order
# (ascending is the default for sort_values).
(
    pd.to_datetime(behavior_polars_dev["timestamp"])
    .sort_values()
)

DatetimeIndex(['2019-11-15 00:00:00', '2019-11-15 00:00:01',
               '2019-11-15 00:00:02', '2019-11-15 00:00:04',
               '2019-11-15 00:00:05', '2019-11-15 00:00:06',
               '2019-11-15 00:00:06', '2019-11-15 00:00:07',
               '2019-11-15 00:00:07', '2019-11-15 00:00:09',
               ...
               '2019-11-15 23:59:17', '2019-11-15 23:59:20',
               '2019-11-15 23:59:22', '2019-11-15 23:59:23',
               '2019-11-15 23:59:25', '2019-11-15 23:59:39',
               '2019-11-15 23:59:39', '2019-11-15 23:59:40',
               '2019-11-15 23:59:41', '2019-11-15 23:59:43'],
              dtype='datetime64[ns]', length=376471, freq=None)

In [41]:
# Parse the train-split timestamps with pandas and show them in ascending order
# (ascending is the default for sort_values).
(
    pd.to_datetime(behavior_polars_train["timestamp"])
    .sort_values()
)

DatetimeIndex(['2019-11-09 00:00:00', '2019-11-09 00:00:02',
               '2019-11-09 00:00:03', '2019-11-09 00:00:07',
               '2019-11-09 00:00:13', '2019-11-09 00:00:14',
               '2019-11-09 00:00:17', '2019-11-09 00:00:19',
               '2019-11-09 00:00:24', '2019-11-09 00:00:25',
               ...
               '2019-11-14 23:59:49', '2019-11-14 23:59:50',
               '2019-11-14 23:59:53', '2019-11-14 23:59:53',
               '2019-11-14 23:59:53', '2019-11-14 23:59:54',
               '2019-11-14 23:59:54', '2019-11-14 23:59:54',
               '2019-11-14 23:59:59', '2019-11-14 23:59:59'],
              dtype='datetime64[ns]', length=2232748, freq=None)