# Getting started

## Setting Up ebrec Environment: Cloning, Installing Dependencies, and Libraries
### Here we will be  managing dependencies using Poetry and installing necessary packages. 

In [1]:
#Cloning our ebrec
!git clone https://github.com/ebanalyse/ebnerd-benchmark.git

#Poetry
#!curl -sSL https://install.python-poetry.org | python3
#!python3 -m poetry shell
#!python3 -m poetry add ebrec

import os
import sys

#Pip
#!pip install ebrec
!pip install polars
!pip install transformers tensorflow polars

fatal: destination path 'ebnerd-benchmark' already exists and is not an empty directory.
Collecting polars
  Using cached polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Using cached polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.8 MB)
Installing collected packages: polars
Successfully installed polars-0.20.31


In [2]:
%cd ebnerd-benchmark
!pip install .

/home/e12242664/Group_33/notebooks/ebnerd-benchmark
Processing /home/e12242664/Group_33/notebooks/ebnerd-benchmark
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting transformers<4.37.3,>=4.30.0 (from ebrec==0.0.1)
  Using cached transformers-4.37.2-py3-none-any.whl.metadata (129 kB)
Collecting scikit-learn==1.4.0 (from ebrec==0.0.1)
  Using cached scikit_learn-1.4.0-1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy<1.26.1,>=1.24.0 (from ebrec==0.0.1)
  Using cached numpy-1.26.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
Collecting polars==0.20.8 (from ebrec==0.0.1)
  Using cached polars-0.20.8-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached polars-0.20.8-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.8 MB)
Using cached scikit_l

## Setting Up Data Processing and Model Initialization

In [3]:
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import tensorflow as tf
import polars as pl

from ebrec.utils._constants import (
   DEFAULT_HISTORY_ARTICLE_ID_COL,
   DEFAULT_CLICKED_ARTICLES_COL,
   DEFAULT_INVIEW_ARTICLES_COL,
   DEFAULT_IMPRESSION_ID_COL,
   DEFAULT_SUBTITLE_COL,
   DEFAULT_LABELS_COL,
   DEFAULT_TITLE_COL,
   DEFAULT_USER_COL,
)


#
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    add_known_user_column,
    add_prediction_scores,
    truncate_history,
)
from ebrec.utils._articles import convert_text2encoding_with_transformers
from ebrec.utils._polars import concat_str_columns, slice_join_dataframes
from ebrec.utils._articles import create_article_id_to_value_mapping
from ebrec.utils._nlp import get_transformers_word_embeddings

#
from ebrec.models.newsrec.dataloader import LSTURDataLoader
from ebrec.models.newsrec.model_config import hparams_nrms
from ebrec.models.newsrec import LSTURModel

2024-06-21 07:37:32.018543: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-21 07:37:32.018623: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-21 07:37:32.022539: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-21 07:37:32.041635: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Loading and Preprocessing Ebnerd Data

In [4]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load ebnerd - function
    """
    df_history = (
        pl.scan_parquet(path.joinpath("history.parquet"))
        .select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL)
        .pipe(
            truncate_history,
            column=DEFAULT_HISTORY_ARTICLE_ID_COL,
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        )
    )
    df_behaviors = (
        pl.scan_parquet(path.joinpath("behaviors.parquet"))
        .collect()
        .pipe(
            slice_join_dataframes,
            df2=df_history.collect(),
            on=DEFAULT_USER_COL,
            how="left",
        )
    )
    return df_behaviors

# Data Loading and Preprocessing for Training, Validation, and Testing
### This code snippet loads and preprocesses data from the Ebnerd dataset for three purposes: training, validation, and testing.

In [5]:
PATH = Path("/home/e12242664/shared/194.035-2024S/groups/Gruppe_33/Group_33/downloads")
DATASPLIT = "demo"
COLUMNS = [
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
]
HISTORY_SIZE = 10
FRACTION = 0.01

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
# =>
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)

df_test = (ebnerd_from_path(PATH.joinpath("test"), history_size=HISTORY_SIZE)
    .select([
    DEFAULT_USER_COL,
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_IMPRESSION_ID_COL,
        
])
            .sample(fraction=FRACTION)
          )
    
df_train.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels
u32,list[i32],list[i64],list[i64],u32,list[i8]
1148586,"[9769994, 9769478, … 9769765]","[9774542, 9774542, … 9774554]",[9774554],308255978,"[0, 0, … 1]"
850523,"[9738263, 9738105, … 9755181]","[9759955, 9778661, … 9778444]",[9759955],237203945,"[1, 0, … 0]"


## Look at the difference between Training/Validation and Testset
### Note, the testset doesn't include labels, and we have remove some of the other columns.

In [6]:
print("df_train:")
print(df_train.head(2))

print("\ndf_validation:")
print(df_validation.head(2))

print("\ndf_test:")
print(df_test.head(2))

df_train:
shape: (2, 6)
┌─────────┬───────────────────┬───────────────────┬──────────────────┬───────────────┬─────────────┐
│ user_id ┆ article_id_fixed  ┆ article_ids_invie ┆ article_ids_clic ┆ impression_id ┆ labels      │
│ ---     ┆ ---               ┆ w                 ┆ ked              ┆ ---           ┆ ---         │
│ u32     ┆ list[i32]         ┆ ---               ┆ ---              ┆ u32           ┆ list[i8]    │
│         ┆                   ┆ list[i64]         ┆ list[i64]        ┆               ┆             │
╞═════════╪═══════════════════╪═══════════════════╪══════════════════╪═══════════════╪═════════════╡
│ 1148586 ┆ [9769994,         ┆ [9774542,         ┆ [9774554]        ┆ 308255978     ┆ [0, 0, … 1] │
│         ┆ 9769478, …        ┆ 9774542, …        ┆                  ┆               ┆             │
│         ┆ 9769765]          ┆ 9774554]          ┆                  ┆               ┆             │
│ 850523  ┆ [9738263,         ┆ [9759955,         ┆ [9759955]      

## Displaying the First 5 Rows of df_articles

In [7]:
 df_articles = pl.read_parquet(PATH.joinpath(DATASPLIT, "articles.parquet"))
df_articles.head(5)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3037230,"""Ishockey-spill…","""ISHOCKEY: Isho…",2023-06-29 06:20:57,False,"""Ambitionerne o…",2003-08-28 08:55:00,,"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Kendt"", … ""Mindre ulykke""]",142,"[327, 334]","""sport""",,,,0.9752,"""Negative"""
3044020,"""Prins Harry tv…","""Hoffet tvang P…",2023-06-29 06:21:16,False,"""Den britiske t…",2005-06-29 08:47:00,"[3097307, 3097197, 3104927]","""article_defaul…","""https://ekstra…","[""Harry"", ""James Hewitt""]","[""PER"", ""PER""]","[""Kriminalitet"", ""Kendt"", … ""Personfarlig kriminalitet""]",414,[432],"""underholdning""",,,,0.7084,"""Negative"""
3057622,"""Rådden kørsel …","""Kan ikke straf…",2023-06-29 06:21:24,False,"""Slingrende spr…",2005-10-10 07:20:00,[3047102],"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Transportmiddel"", ""Bil""]",118,[133],"""nyheder""",,,,0.9236,"""Negative"""
3073151,"""Mærsk-arvinger…","""FANGET I FLODB…",2023-06-29 06:21:38,False,"""To oldebørn af…",2005-01-04 06:59:00,"[3067474, 3067478, 3153705]","""article_defaul…","""https://ekstra…",[],[],"[""Erhverv"", ""Privat virksomhed"", … ""Rejse""]",118,[133],"""nyheder""",,,,0.9945,"""Negative"""
3193383,"""Skød svigersøn…","""44-årig kvinde…",2023-06-29 06:22:57,False,"""En 44-årig mor…",2003-09-15 15:30:00,,"""article_defaul…","""https://ekstra…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9966,"""Negative"""


## Transforming Text Data with XLM-RoBERTa for Article Embeddings


In [8]:
 TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = [DEFAULT_SUBTITLE_COL, DEFAULT_TITLE_COL]
MAX_TITLE_LENGTH = 50

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# =>
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

# Creating LSTUR DataLoaders for Training and Validation
In the implementations we have disconnected the models and data. Hence, you should built a dataloader that fits your needs.

In [12]:
user_id_mapping = {user_id: i for i, user_id in enumerate(df_train[DEFAULT_USER_COL].unique())}

train_dataloader = LSTURDataLoader(
    user_id_mapping=user_id_mapping,
    behaviors=df_train,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=False,
    batch_size=64,
)
val_dataloader = LSTURDataLoader(
    user_id_mapping=user_id_mapping,
    behaviors=df_validation,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    eval_mode=True,
    batch_size=4,
)

## Train the model


In [14]:
import itertools
from ebrec.models.newsrec.model_config import hparams_lstur
from tabulate import tabulate

# Get attributes of hparams_lstur
attributes = dir(hparams_lstur)

# Prepare data for tabulate
data = [(attr, getattr(hparams_lstur, attr)) for attr in attributes if not attr.startswith('__')]

# Print as a table
print(tabulate(data, headers=["Attribute", "Value"]))

Attribute             Value
--------------------  ------------------
attention_hidden_dim  200
cnn_activation        relu
dropout               0.2
filter_num            400
gru_unit              400
history_size          50
learning_rate         0.0001
loss                  cross_entropy_loss
n_users               50000
optimizer             adam
title_size            30
type                  ini
window_size           3


## Hyperparameter Optimization for LSTUR Model Training
#### This script performs hyperparameter optimization for training an LSTUR model using TensorFlow. It evaluates different combinations of history_size, n_users, title_size, learning_rate, and dropout to find the optimal settings that maximize evaluation metrics like AUC, MRR, and NDCG at k=5 and k=10 on a validation dataset. Results are saved for each combination in downloads/evaluations/all_results.txt.

In [20]:
# Performing Hyperopt

def objective(history_size, n_users, title_size, learning_rate, dropout, df_validation,df_train):

    # Create directories for logs and model weights
    MODEL_NAME = f"LSTUR_l{learning_rate}_d{dropout}"
    LOG_DIR = f"downloads/runs/{MODEL_NAME}"
    MODEL_WEIGHTS = f"downloads/data/state_dict/{MODEL_NAME}/weights"
    RESULTS_DIR = f"downloads/evaluations/{MODEL_NAME}"
    
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
    modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
        filepath=MODEL_WEIGHTS, save_best_only=True, save_weights_only=True, verbose=1
    )

    # Set the parameters
    hparams_lstur.history_size = history_size
    hparams_lstur.n_users = n_users
    hparams_lstur.title_size = title_size
    hparams_lstur.learning_rate = learning_rate
    hparams_lstur.dropout = dropout
    

    model = LSTURModel(
        hparams=hparams_lstur,
        word2vec_embedding=word2vec_embedding,
        seed=42,
    )
    hist = model.model.fit(
        train_dataloader,
        validation_data=val_dataloader,
        epochs=1,
        callbacks=[tensorboard_callback, early_stopping, modelcheckpoint],
    )
    _ = model.model.load_weights(filepath=MODEL_WEIGHTS)


    
    pred_validation = model.scorer.predict(val_dataloader)
    df_validation = add_prediction_scores(df_validation, pred_validation.tolist()).pipe(
        add_known_user_column, known_users=df_train[DEFAULT_USER_COL]
    )

    from ebrec.evaluation import MetricEvaluator, AucScore, NdcgScore, MrrScore
    
    metrics = MetricEvaluator(
        labels=df_validation["labels"].to_list(),
        predictions=df_validation["scores"].to_list(),
        metric_functions=[AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)],
    )
    evaluation_results = metrics.evaluate().evaluations

    # Save the evaluation results
    os.makedirs(RESULTS_DIR, exist_ok=True)
    with open(os.path.join(RESULTS_DIR, 'evaluation_results.txt'), 'w') as f:
        for key, value in evaluation_results.items():
            f.write(f"{key}: {value}\n")

    # Rank predictions and write submission file
    df_validation = df_validation.with_columns(
        pl.col("scores")
        .map_elements(lambda x: list(rank_predictions_by_score(x)))
        .alias("ranked_scores")
    )
    write_submission_file(
        impression_ids=df_validation[DEFAULT_IMPRESSION_ID_COL],
        prediction_scores=df_validation["ranked_scores"],
        path=os.path.join(RESULTS_DIR, "predictions.txt"),
    )

    return evaluation_results

param_grid = {
    'history_size': [25],
    'n_users':  [50000],
    'title_size': [27],
    'learning_rate': [0.0001, 0.001, 0.01],
    'dropout':  [0.2],
    'batch_size': 64,
}

combinations = list(
    itertools.product(
        param_grid['history_size'], param_grid['n_users'], param_grid['title_size'], param_grid['learning_rate'], param_grid['dropout']
    )
)

all_results = []
for history_size, n_users, title_size, learning_rate, dropout in combinations:
    print(f"Evaluating combination: history_size={history_size}, n_users={n_users}, title_size={title_size}")
    result = objective(history_size, n_users, title_size, learning_rate, dropout, df_validation, df_train)
    all_results.append({
        'history_size': history_size,
        'n_users': n_users,
        'title_size': title_size,
        'evaluation_results': result
    })

# Save all results to a file
with open("downloads/evaluations/all_results.txt", 'w') as f:
    for result in all_results:
        f.write(f"{result}\n")
        
import tensorflow as tf

print("All combinations evaluated.")

Evaluating combination: history_size=25, n_users=50000, title_size=27
KerasTensor(type_spec=TensorSpec(shape=(None, 400), dtype=tf.float32, name=None), name='att_layer2_2/Sum_1:0', description="created by layer 'att_layer2_2'")

2024-06-21 07:55:59.549311: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 46080368640 exceeds 10% of free system memory.



Epoch 1: val_loss improved from inf to 0.00000, saving model to downloads/data/state_dict/LSTUR_l0.0001_d0.2/weights


InvalidArgumentError: Graph execution error:

Detected at node model_5/reshape_4/Reshape defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/opt/conda/lib/python3.11/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/opt/conda/lib/python3.11/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/opt/conda/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/opt/conda/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/opt/conda/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/opt/conda/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 359, in execute_request

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 446, in do_execute

  File "/opt/conda/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/opt/conda/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner

  File "/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/opt/conda/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/tmp/ipykernel_270/677300949.py", line 92, in <module>

  File "/tmp/ipykernel_270/677300949.py", line 40, in objective

  File "/opt/conda/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/training.py", line 2655, in predict

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/training.py", line 2440, in predict_function

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/training.py", line 2425, in step_function

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/training.py", line 2413, in run_step

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/training.py", line 2381, in predict_step

  File "/opt/conda/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/training.py", line 590, in __call__

  File "/opt/conda/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/opt/conda/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/functional.py", line 515, in call

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/functional.py", line 672, in _run_internal_graph

  File "/opt/conda/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 65, in error_handler

  File "/opt/conda/lib/python3.11/site-packages/keras/src/engine/base_layer.py", line 1149, in __call__

  File "/opt/conda/lib/python3.11/site-packages/keras/src/utils/traceback_utils.py", line 96, in error_handler

  File "/opt/conda/lib/python3.11/site-packages/keras/src/layers/reshaping/reshape.py", line 137, in call

Input to reshape is a tensor with 2800 values, but the requested shape has 1512
	 [[{{node model_5/reshape_4/Reshape}}]] [Op:__inference_predict_function_52363]