In [None]:
import tensorflow as tf
import pandas as pd

from models.v2.two_towers.two_towers_match_model import TwoTowersMatchModel
from models.v2.two_towers.two_towers_model import TwoTowersModel
from models.v1.utils import identity_loss
from models.v2.utils import load_triplets_dataset
from utils import ENV_SHORT_NAME


# Training settings shared by the cells below.
LOSS_CUTOFF = 0.005  # min_delta handed to the val_loss callbacks
N_EPOCHS = 1000  # upper bound on epochs; the EarlyStopping callback may end training sooner
VERBOSE = int(ENV_SHORT_NAME != "prod")  # 0 in prod, 1 in every other environment

In [None]:
# Per-feature configuration of the user tower: feature name -> input type and
# latent dimension. Declared as (name, type, latent dim) rows for compactness.
user_layer_infos = {
    feature: {"type": kind, "feature_latent_dim": latent_dim}
    for feature, kind, latent_dim in (
        ("user_id", "string", 128),
        ("user_age", "int", 16),
        ("user_postal_code", "string", 8),
        ("user_activity", "string", 8),
        ("user_booking_cnt", "int", 16),
        ("user_theoretical_amount_spent", "int", 16),
        ("user_theoretical_remaining_credit", "int", 16),
        ("user_distinct_type_booking_cnt", "int", 16),
    )
}

# Same structure for the item tower.
item_layer_infos = {
    feature: {"type": kind, "feature_latent_dim": latent_dim}
    for feature, kind, latent_dim in (
        ("item_id", "string", 128),
        ("offer_categoryId", "string", 16),
        ("offer_subcategoryid", "string", 16),
        ("item_names", "text", 16),
        ("item_rayons", "text", 16),
        ("item_author", "text", 16),
        ("item_performer", "text", 16),
        ("item_mean_stock_price", "int", 8),
        ("item_booking_cnt", "int", 8),
        ("item_favourite_cnt", "int", 8),
    )
}

# Datasets

In [None]:
# Storage prefix holding the CSV files read below.
STORAGE_PATH = "gs://mlflow-bucket-ehp/algo_training_stg/algo_training_v2_two_tower_20230111T094741"
batch_size = 4096

# Read the train / eval interaction tables, casting every column to string.
train_data = pd.read_csv(f"{STORAGE_PATH}/positive_data_train.csv").astype(str)
validation_data = pd.read_csv(f"{STORAGE_PATH}/positive_data_eval.csv").astype(str)

# The feature columns are exactly the keys declared in the layer configs.
user_columns = [*user_layer_infos]
item_columns = [*item_layer_infos]

# One feature row per distinct user / item found in the training data.
user_data = train_data.loc[:, user_columns].drop_duplicates(subset="user_id")
item_data = train_data.loc[:, item_columns].drop_duplicates(subset="item_id")

# Both splits go through the same loader with identical settings.
dataset_options = dict(
    user_columns=user_columns, item_columns=item_columns, batch_size=batch_size
)
train_dataset = load_triplets_dataset(train_data, **dataset_options)
validation_dataset = load_triplets_dataset(validation_data, **dataset_options)

# Training

In [None]:
embedding_size = 64

# Build the two-tower model from the de-duplicated user / item feature tables.
two_tower_model = TwoTowersModel(
    user_data=user_data,
    user_layer_infos=user_layer_infos,
    item_data=item_data,
    item_layer_infos=item_layer_infos,
    embedding_size=embedding_size,
)
two_tower_model.compile(loss=identity_loss, optimizer="adam")

# Both callbacks watch val_loss and ignore changes smaller than LOSS_CUTOFF:
# the learning rate is cut tenfold after 2 stalled epochs, training stops after 3.
reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor="val_loss", factor=0.1, patience=2, min_delta=LOSS_CUTOFF
)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=3, min_delta=LOSS_CUTOFF
)

two_tower_model.fit(
    train_dataset,
    epochs=N_EPOCHS,
    validation_data=validation_dataset,
    verbose=VERBOSE,
    callbacks=[reduce_lr_on_plateau, early_stopping],
)

# Embeddings

In [None]:
# Feed every known user / item feature row through its trained tower.
user_features = user_data.values
item_features = item_data.values
user_embeddings = two_tower_model.user_model([user_features])
item_embeddings = two_tower_model.item_model([item_features])

# Match model built from the towers' output embeddings.
match_model_last_layer = TwoTowersMatchModel(
    user_ids=user_data.user_id.unique(),
    user_embeddings=user_embeddings,
    item_ids=item_data.item_id.unique(),
    item_embeddings=item_embeddings,
    embedding_size=embedding_size,
)

In [None]:
# First layer of each tower; its sub-layer 0 exposes a vocabulary and its
# sub-layer 1 holds a weight matrix.
user_first_layer = two_tower_model.user_model.layers[0]
item_first_layer = two_tower_model.item_model.layers[0]

# Match model built directly from those first-layer weights. Entry 0 of both the
# vocabulary and the weights is skipped (presumably a reserved / out-of-vocabulary
# slot — confirm against the tower implementation).
match_model_first_layer = TwoTowersMatchModel(
    user_ids=user_first_layer.layers[0].get_vocabulary()[1:],
    user_embeddings=user_first_layer.layers[1].get_weights()[0][1:],
    item_ids=item_first_layer.layers[0].get_vocabulary()[1:],
    item_embeddings=item_first_layer.layers[1].get_weights()[0][1:],
    embedding_size=embedding_size,
)

# PCA Plot

In [None]:
from sklearn.decomposition import PCA
import matplotlib as mpl
import matplotlib.pyplot as plt


# Scatter plot of the first-layer item embeddings projected onto their first two
# principal components, one colour per offer category.
colormap = mpl.colormaps["tab20"].colors
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

# Ids and weights read from the match model's item layer; entry 0 of each is skipped.
item_ids = match_model_first_layer.item_layer.layers[0].get_vocabulary()[1:]
embeddings = match_model_first_layer.item_layer.layers[1].get_weights()[0][1:]

pca_out = PCA(n_components=2).fit_transform(embeddings)
categories = item_data["offer_categoryId"].unique().tolist()

# Attach item features to the 2-D coordinates. `item_data` already holds one row
# per item_id, so it is reused instead of de-duplicating train_data a second time.
item_representation = pd.DataFrame(
    {
        "item_id": item_ids,
        "x": pca_out[:, 0],
        "y": pca_out[:, 1],
    }
).merge(item_data, on=["item_id"], how="inner")

for idx, category in enumerate(categories):
    category_points = item_representation.loc[
        lambda df: df["offer_categoryId"] == category
    ]
    ax.scatter(
        category_points["x"].values,
        category_points["y"].values,
        s=10,
        # Cycle through the palette so more categories than colours cannot
        # raise an IndexError.
        color=colormap[idx % len(colormap)],
        label=category,
        alpha=0.7,
    )

ax.set_title("Item embeddings (first layer) - PCA projection")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.legend()
ax.grid(True)

# Evaluation

In [None]:
from utils import (
    RECOMMENDATION_NUMBER,
    NUMBER_OF_PRESELECTED_OFFERS,
    EVALUATION_USER_NUMBER,
)
from metrics import compute_metrics, get_actual_and_predicted

# Load the test interactions with ids read as strings, and add empty
# genres / type / rayon columns (the same columns are added to the training
# frame passed to the metrics helpers below).
positive_data_test = pd.read_csv(
    f"{STORAGE_PATH}/positive_data_test.csv",
    dtype={
        "user_id": str,
        "item_id": str,
    },
).assign(genres="", type="", rayon="")

# Restrict the evaluation to at most EVALUATION_USER_NUMBER distinct users.
users_to_test = positive_data_test["user_id"].unique()[
    : min(EVALUATION_USER_NUMBER, positive_data_test["user_id"].nunique())
]
positive_data_test = positive_data_test.loc[
    lambda df: df["user_id"].isin(users_to_test)
]

# Model under evaluation. `match_model` has to be bound here: no earlier cell
# defines it, so referencing it directly fails on a fresh kernel. Point it at
# match_model_first_layer instead to evaluate the first-layer embeddings.
match_model = match_model_last_layer

data_model_dict = {
    "data": {
        "raw": train_data.assign(genres="", type="", rayon=""),
        "training_item_ids": train_data.item_id.unique(),
        "test": positive_data_test,
    },
    "model": match_model,
}
data_model_dict_w_actual_and_predicted = get_actual_and_predicted(data_model_dict)

metrics = {}
k_list = [RECOMMENDATION_NUMBER, NUMBER_OF_PRESELECTED_OFFERS]
for k in k_list:
    data_model_dict_w_metrics_at_k = compute_metrics(
        data_model_dict_w_actual_and_predicted, k
    )
    metrics_at_k = data_model_dict_w_metrics_at_k["metrics"]

    metrics[f"recall_at_{k}"] = metrics_at_k["mark"]
    metrics[f"precision_at_{k}"] = metrics_at_k["mapk"]

    # Metrics related to the pcreco output are only tracked at k=RECOMMENDATION_NUMBER.
    if k == RECOMMENDATION_NUMBER:
        metrics[f"recall_at_{k}_panachage"] = metrics_at_k["mark_panachage"]
        metrics[f"precision_at_{k}_panachage"] = metrics_at_k["mapk_panachage"]

        # The average diversification score is only calculated at
        # k=RECOMMENDATION_NUMBER to match the pcreco output.
        metrics[f"avg_diversification_score_at_{k}"] = metrics_at_k["avg_div_score"]
        metrics[f"avg_diversification_score_at_{k}_panachage"] = metrics_at_k[
            "avg_div_score_panachage"
        ]
        metrics[f"personalization_at_{k}_panachage"] = metrics_at_k[
            "personalization_at_k_panachage"
        ]

    metrics[f"coverage_at_{k}"] = metrics_at_k["coverage"]
    metrics[f"personalization_at_{k}"] = metrics_at_k["personalization_at_k"]

metrics