# NRMS: Hyperparameter Optimization for the NRMS Model

This notebook focuses on optimizing the hyperparameters of the NRMS model to achieve better performance. For this approach we use a grid search over the following search space:

```python
param_grid = {
    'history_size': [10, 50, 100],
    'n_users':  [20000, 50000, 70000],
    'title_size': [10, 50, 100],
    'learning_rate': [0.0001, 0.001, 0.01],
    'dropout':  [0.1, 0.3, 0.5]
}
```

The NRMS model is a state-of-the-art neural news recommendation approach with multi-head self-attention. NRMS learns news representations from news titles, using a multi head self-attention network. Briefly speaking, the news encoder first maps each word in the news title to the corresponding vector, and then uses the self-attention network to learn word-level representations. Finally, a query vector is used to locate the important words in the news title, and an attention-based pooling method is used to aggregate the word-level representations into the learned title representations. To learn user representations from their browsed news, NRMS again uses the multi-head self-attention network on top of the learned news representations. The probability of a user clicking a candidate news is given by the dot product between the user representations and the news representations.

In [2]:
import os
import itertools
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import tensorflow as tf
import polars as pl
from tabulate import tabulate

# Import Constants and Utilities
from ebrec.utils._constants import (
   DEFAULT_HISTORY_ARTICLE_ID_COL,
   DEFAULT_CLICKED_ARTICLES_COL,
   DEFAULT_INVIEW_ARTICLES_COL,
   DEFAULT_IMPRESSION_ID_COL,
   DEFAULT_SUBTITLE_COL,
   DEFAULT_LABELS_COL,
   DEFAULT_TITLE_COL,
   DEFAULT_USER_COL,
)

#Import Utility Functions
#
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings
from ebrec.utils._python import write_submission_file, rank_predictions_by_score
from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore  
from ebrec.models.newsrec.model_config import hparams_lstur

#Import NRMS Components
#
from ebrec.models.newsrec.dataloader import NRMSDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import NRMSModel

2024-07-08 17:26:18.700438: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-08 17:26:18.794582: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-08 17:26:18.794692: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-08 17:26:18.798042: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-08 17:26:18.817805: I tensorflow/core/platform/cpu_feature_guar

## Load and Process Behavior and History Parquet Files
The functions below are necessary for transforming the Ebnerd Dataset tables (`history.parquet` and `behaviors.parquet`) into a format suitable for training and testing. This transformation is achieved by joining the histories and behaviors based on the `user ID`. Additionally, preprocessing steps are performed, such as truncating the user history to keep only the specified `history_size`.

In [4]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Build the behaviors table of one dataset split with user histories attached.

    Args:
        path: Directory holding ``history.parquet`` and ``behaviors.parquet``.
        history_size: Length handed to ``truncate_history`` for each user's
            article-id history (padding value 0).

    Returns:
        The behaviors frame left-joined, on the user column, with the
        truncated histories.
    """
    # Histories stay lazy until the join so only the two needed columns are read.
    history_lazy = pl.scan_parquet(path.joinpath("history.parquet")).select(
        DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL
    )
    history_lazy = truncate_history(
        history_lazy,
        column=DEFAULT_HISTORY_ARTICLE_ID_COL,
        history_size=history_size,
        padding_value=0,
        enable_warning=False,
    )

    behaviors = pl.scan_parquet(path.joinpath("behaviors.parquet")).collect()
    return slice_join_dataframes(
        behaviors,
        df2=history_lazy.collect(),
        on=DEFAULT_USER_COL,
        how="left",
    )

## Setup Path and Data Configuration
Here we set up the `DATA_PATH` to the EB-NeRD dataset we are using for training. `COLUMNS` defines the columns we use for model training, and `TEXT_COLUMNS_TO_USE` lists the columns that should be considered in the embedding process.

In [None]:
# Name of the model studied in this notebook.
MODEL_NAME = "NRMS"

# Root directory of the dataset. pathlib never expands "~" by itself, so it is
# resolved here once; otherwise only consumers that expand it on their own
# would be able to open files below this path.
DATA_PATH = Path("~/shared/194.035-2024S/groups/Gruppe_33/Group_33/data").expanduser()
# Sub-directory of DATA_PATH that holds the split to use.
DATASPLIT = "demo"

# Behavior columns kept for training and validation.
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
]

# Length each user history is truncated/padded to when loading the data.
HISTORY_SIZE = 50
# Maximum number of tokens kept per article text when encoding.
MAX_TITLE_LENGTH = 50

# Article text columns that are concatenated before tokenization.
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
# Fraction of rows kept when sampling the behavior frames (1 = all rows).
FRACTION = 1

In [None]:
# Training frame: load the split, keep the model columns, sample negatives and
# build the binary label column.
# NOTE(review): npratio=4 presumably means four negatives per clicked article -
# confirm against the ebrec documentation.
df_train = (
    ebnerd_from_path(DATA_PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    # Seeded so that Restart & Run All yields the same rows in the same order.
    .sample(fraction=FRACTION, seed=123)
)

# Validation frame: no negative sampling, every in-view article is scored.
df_validation = (
    ebnerd_from_path(DATA_PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    # Seeded for the same reproducibility reason as the training frame.
    .sample(fraction=FRACTION, seed=123)
)
df_train.head(2)

## Load the Articles
In this cell we are loading the articles into memory which will later be used for the embedding.

In [None]:
# Read the article table of the selected split into memory.
df_articles = pl.read_parquet(DATA_PATH / DATASPLIT / "articles.parquet")
df_articles.head(2)

## Initialize and Configure Transformer Model

This cell loads a pre-trained transformer model and tokenizer from Hugging Face, specifically FacebookAI/xlm-roberta-base, establishing the NLP backbone for the notebook. The transformer model is critical for transforming raw text data into structured embeddings that can be effectively utilized within the recommendation system. The following steps are executed:
- **Load Transformer Model and Tokenizer**: The AutoModel and AutoTokenizer from Hugging Face are used to load the pre-trained xlm-roberta-base model.
- **Initialize Word Embeddings**: Word embeddings are initialized using the transformer's word embeddings to enhance the text representation.
- **Concatenate Text Columns**: Text columns from the articles dataframe are concatenated to create a comprehensive text field.
- **Convert Text to Encodings**: The concatenated text is tokenized and converted to numerical encodings using the transformer tokenizer, with a specified maximum length.
- **Create Article Mapping**: A mapping from article IDs to their corresponding tokenized values is created, facilitating efficient lookup and processing in the recommendation pipeline.

In [8]:
# Hugging Face checkpoint used for both the tokenizer and the word embeddings.
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
# Article text columns that are concatenated before tokenization.
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]

# Load the pre-trained transformer and its matching tokenizer.
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# Take the word embeddings from the transformer; they are handed to the NRMS
# model as `word2vec_embedding` when the model is built.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
# Concatenate the text columns; the helper returns the updated frame and the
# name of the new combined column. NOTE: `df_articles` is rebound here, so
# re-running this cell operates on the already-extended frame.
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
# Tokenize the combined text with a cap of MAX_TITLE_LENGTH; returns the frame
# and the name of the column holding the token encodings.
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# Lookup from article id to its token encoding, consumed by the data loaders.
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

### Data Loading for Model Input and Model Configuration
This cell creates data loaders for both training and validation. The `NRMSDataLoader` is initialized for both the training and validation datasets, handling batching, shuffling, and input feature construction.

In [12]:
# Arguments that are identical for the training and the validation loader.
_shared_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
)

# Training loader: not in eval mode, batches of 64.
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    eval_mode=False,
    batch_size=64,
    **_shared_loader_kwargs,
)
# Validation loader: eval mode, batches of 32.
val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    eval_mode=True,
    batch_size=32,
    **_shared_loader_kwargs,
)

### Look into NRMS hyperparameter attributes
This cell is necessary to check which hyperparameters we are interested in optimizing.

In [3]:
# Collect every attribute name exposed by the NRMS hyperparameter config
attributes = dir(hparams_nrms)

# Keep (name, value) pairs, leaving out the dunder attributes
data = []
for attr in attributes:
    if attr.startswith('__'):
        continue
    data.append((attr, getattr(hparams_nrms, attr)))

# Render the pairs as a two-column text table
print(tabulate(data, headers=["Attribute", "Value"]))

Attribute             Value
--------------------  ------------------
attention_hidden_dim  200
dropout               0.2
head_dim              20
head_num              20
history_size          50
learning_rate         0.0001
loss                  cross_entropy_loss
optimizer             adam
title_size            30


# Hyperparameter Optimization for NRMS Model
In this section, we perform hyperparameter optimization to find the best combination of parameters for the NRMS model. The optimization focuses on input dimensions such as `history_size`, `n_users`, and `title_size`, as well as model-specific parameters like `learning_rate` and `dropout`. The process involves evaluating different combinations of these parameters to identify the configuration that yields the best performance. Note that the cell below currently sweeps only `learning_rate` and `dropout`; `history_size` and `title_size` are held fixed at `HISTORY_SIZE` and `MAX_TITLE_LENGTH`.

### Setting Up Hyperparameter Optimization
The `objective` function is defined to train the NRMS model with a given set of hyperparameters and evaluate its performance. The function creates directories for logs and model weights, sets the hyperparameters, trains the model, and then evaluates it. The evaluation results are saved, and predictions are written to a submission file.

In [None]:
# Performing Hyperopt
def objective(learning_rate, dropout, df_validation, df_train, epochs=1):
    """
    Train one NRMS configuration and evaluate it on the validation data.

    Besides its arguments, the function relies on notebook-level state:
    ``train_dataloader``, ``val_dataloader``, ``word2vec_embedding``,
    ``HISTORY_SIZE`` and ``MAX_TITLE_LENGTH``. It also mutates the shared
    ``hparams_nrms`` config, so the values set here stay in place afterwards.

    Args:
        learning_rate: Learning rate written to ``hparams_nrms.learning_rate``.
        dropout: Dropout rate written to ``hparams_nrms.dropout``.
        df_validation: Validation behaviors; must be the frame that
            ``val_dataloader`` was built from so predictions align with rows.
        df_train: Training behaviors; only its user column is read, to flag
            known users in the validation frame.
        epochs: Number of training epochs (default 1, the previous fixed value).

    Returns:
        The evaluations produced by ``MetricEvaluator`` (AUC, MRR, nDCG@5,
        nDCG@10) for this configuration.

    Side effects:
        Writes TensorBoard logs, model weights, ``evaluation_results.txt`` and
        ``predictions.txt`` under ``downloads/``.
    """
    # Per-configuration output locations.
    run_name = f"NRMS_l{learning_rate}_d{dropout}"
    log_dir = f"downloads/runs/{run_name}"
    weights_path = f"downloads/data/state_dict/{run_name}/weights"
    results_dir = f"downloads/evaluations/{run_name}"

    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
    # With the default single epoch, early stopping can never trigger; it only
    # matters when `epochs` is raised.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
    modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=weights_path, save_best_only=True, save_weights_only=True, verbose=1
    )

    # Configure the NRMS hyperparameters for this run.
    hparams_nrms.history_size = HISTORY_SIZE
    hparams_nrms.title_size = MAX_TITLE_LENGTH
    hparams_nrms.learning_rate = learning_rate
    hparams_nrms.dropout = dropout

    # Build the model from the NRMS config that was just updated. Passing the
    # LSTUR config here (as before) silently ignored the swept values.
    model = NRMSModel(
        hparams=hparams_nrms,
        word2vec_embedding=word2vec_embedding,
        seed=42,
    )
    model.model.fit(
        train_dataloader,
        validation_data=val_dataloader,
        epochs=epochs,
        callbacks=[tensorboard_callback, early_stopping, modelcheckpoint],
    )
    # Restore the best checkpoint before scoring.
    model.model.load_weights(filepath=weights_path)

    pred_validation = model.scorer.predict(val_dataloader)
    df_validation = add_prediction_scores(df_validation, pred_validation.tolist()).pipe(
        add_known_user_column, known_users=df_train[DEFAULT_USER_COL]
    )

    metrics = MetricEvaluator(
        labels=df_validation["labels"].to_list(),
        predictions=df_validation["scores"].to_list(),
        metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
    )
    evaluation_results = metrics.evaluate().evaluations

    # Save the evaluation results
    os.makedirs(results_dir, exist_ok=True)
    with open(os.path.join(results_dir, 'evaluation_results.txt'), 'w') as f:
        for key, value in evaluation_results.items():
            f.write(f"{key}: {value}\n")

    # Rank predictions and write submission file
    df_validation = df_validation.with_columns(
        pl.col("scores")
        .map_elements(lambda x: list(rank_predictions_by_score(x)))
        .alias("ranked_scores")
    )
    write_submission_file(
        impression_ids=df_validation[DEFAULT_IMPRESSION_ID_COL],
        prediction_scores=df_validation["ranked_scores"],
        path=os.path.join(results_dir, "predictions.txt"),
    )

    return evaluation_results

# Full search space. Only 'learning_rate' and 'dropout' are consumed below.
param_grid = {
    'history_size': [10, 50, 100],
    'n_users':  [20000, 50000, 70000],
    'title_size': [10, 50, 100],
    'learning_rate': [0.0001, 0.001, 0.01],
    'dropout':  [0.2],
    'batch_size': 64,
}

# history_size and title_size are baked into the data frames and the article
# mapping built in earlier cells, and objective() takes neither them nor
# n_users, so the sweep is restricted to the two parameters it accepts.
combinations = list(
    itertools.product(param_grid['learning_rate'], param_grid['dropout'])
)

all_results = []
for learning_rate, dropout in combinations:
    # Report the values that are actually being swept (the previous message
    # referenced undefined names and raised NameError).
    print(f"Evaluating combination: learning_rate={learning_rate}, dropout={dropout}")
    result = objective(learning_rate, dropout, df_validation, df_train)
    all_results.append({
        'learning_rate': learning_rate,
        'dropout': dropout,
        'evaluation_results': result
    })

# Save all results to a file; make sure the target directory exists even if
# no configuration was evaluated.
os.makedirs("downloads/evaluations", exist_ok=True)
with open("downloads/evaluations/all_results.txt", 'w') as f:
    for result in all_results:
        f.write(f"{result}\n")

print("All combinations evaluated.")