In [None]:
import time 

# Wall-clock timestamp (seconds since the epoch) taken when the notebook starts executing.
# NOTE(review): presumably used to report the total runtime at the end — the consumer is not
# visible in this section.
notebook_start_time = time.time()

# Set up environment

In [None]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether this code is executing inside a Google Colab kernel.

    Detection relies on the string form of the active IPython shell object
    containing ``"google.colab"``.

    Returns:
        bool: ``True`` when the shell's string form contains ``"google.colab"``;
        ``False`` otherwise, including when ``get_ipython`` is not defined
        (for example when the notebook was exported and run as a plain script).
    """
    try:
        shell = get_ipython()
    except NameError:
        # ``get_ipython`` is injected by IPython; outside a kernel the name does not exist.
        return False
    return "google.colab" in str(shell)

def install_dependencies() -> None:
    """Install the project's dependencies into the running environment.

    Upgrades ``uv`` with pip, then runs ``uv pip install`` to install everything
    declared in ``pyproject.toml`` (all extras) into the system interpreter.

    Both steps are IPython shell escapes (``!``), so this function only works
    inside an IPython/Jupyter kernel, and ``pyproject.toml`` is resolved
    relative to the kernel's current working directory.
    """
    !pip install --upgrade uv
    !uv pip install --all-extras --system --requirement pyproject.toml

# Resolve the project root for the detected runtime and announce which one it is.
if not is_google_colab():
    # NOTE(review): the parent of the hard-coded '/working/' path is the filesystem root on
    # POSIX systems — confirm that this is the intended project root for local runs.
    root_dir = str(Path('/working/').absolute().parent)
    print("⛳️ Local environment")
else:
    # On Colab the dependencies must be installed first; the working directory is the root.
    install_dependencies()
    root_dir = str(Path().absolute())
    print("⛳️ Google Colab environment")

# Put the project root on the import path, but only once.
already_importable = root_dir in sys.path
if not already_importable:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

# Feature pipeline

In [None]:
%load_ext autoreload
%autoreload 2

import warnings
from pprint import pprint

import polars as pl
import torch
from loguru import logger
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")

from recsys import hopsworks_integration
from recsys.config import settings
from recsys.features.articles import (
    compute_features_articles,
    generate_embeddings_for_dataframe,
)
from recsys.features.customers import DatasetSampler, compute_features_customers
from recsys.features.interaction import generate_interaction_data
# from recsys.features.ranking import compute_ranking_dataset
from recsys.features.transactions import compute_features_transactions
from recsys.hopsworks_integration import feature_store
from recsys.raw_data_sources import h_and_m as h_and_m_raw_data

# Articles Data:

- Article Id - similar to a SKU id: every individual product variant has its own unique id.
- Product Code - unique at the product (style) level. For example, a shirt has a single product code but comes in multiple sizes (small, medium, large, XL); all of these sizes share the same product code but have different article ids.

In [None]:
# Articles are read from a local Parquet file resolved against the kernel's working directory.
# NOTE(review): customers and transactions are loaded through `h_and_m_raw_data`; confirm that
# reading the articles from a relative path instead is intentional.
articles_df = pl.read_parquet('articles.parquet')
articles_df.shape

## Articles feature engineering

In [None]:
articles_df = compute_features_articles(articles_df)
articles_df.shape

In [None]:
# `articles_df` already holds the engineered features computed in the previous cell.
# Applying `compute_features_articles` to its own output a second time is redundant, and would
# be harmful if the transformation is not idempotent, so this cell only previews the result.
articles_df.head(3)

#### Create embeddings from the article descriptions

In [None]:
# Log the first three article descriptions, numbered from 1, as a quick sanity check.
sample_descriptions = articles_df["article_description"].head(n=3)
for position, description in enumerate(sample_descriptions, start=1):
    logger.info(f"Item {position}:\n{description}")

In [None]:
# Pick the compute device in order of preference: CUDA, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

embedding_model_id = settings.FEATURES_EMBEDDING_MODEL_ID
logger.info(f"Loading '{embedding_model_id}' embedding model to {device=}")

# Instantiate the SentenceTransformer model directly on the selected device.
model = SentenceTransformer(embedding_model_id, device=device)

In [None]:
# Embed only the first article (`head(1)`) and keep the resulting "embeddings" column,
# which is displayed as the cell output.
embedding = generate_embeddings_for_dataframe(
    articles_df.head(1), "article_description", model, batch_size=128
)['embeddings']
embedding

In [None]:
# Embed every article description and attach the vectors row by row.
# Broadcasting the first article's vector with `pl.lit(...)` would give every article the same
# embedding, so the embeddings are computed for the whole frame instead.
# NOTE(review): assumes `generate_embeddings_for_dataframe` returns one "embeddings" entry per
# input row, in input order — confirm against its implementation. Embedding the full frame is
# far more expensive than embedding a single row.
articles_embeddings = generate_embeddings_for_dataframe(
    articles_df, "article_description", model, batch_size=128
)["embeddings"]
articles_df = articles_df.with_columns(articles_embeddings.alias("embeddings"))
articles_df.head(3)

In [None]:
articles_df[["article_description", "embeddings"]].head(3)

In [None]:
articles_df["image_url"][3]

### Images 

In [None]:
from IPython.display import HTML, display

# Show the images of the last 12 articles in a 6-column CSS grid.
image_urls = articles_df["image_url"].tail(12).to_list()

grid_open = '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'
image_tags = "".join(
    f'<img src="{url}" style="width: 100%; height: auto;">' for url in image_urls
)
grid_html = grid_open + image_tags + "</div>"

display(HTML(grid_html))

# Customers Data

In [None]:
customers_df = h_and_m_raw_data.extract_customers_df()
customers_df.shape

In [None]:
customers_df.head(3)

In [None]:
customers_df.null_count() # checking nulls

In [None]:
customers_df = compute_features_customers(customers_df, drop_null_age=True)
customers_df.shape

## Transactions Data

In [None]:
transactions_df = h_and_m_raw_data.extract_transactions_df()
transactions_df.shape

In [None]:
# Parse the string `t_dat` column (format "%Y-%m-%d") into a Datetime column.
# NOTE(review): the `.str` accessor presumably requires `t_dat` to still be a string, so
# re-running this cell after the conversion would likely fail — re-run from the cell that
# loads `transactions_df` instead.
transactions_df = transactions_df.with_columns(
    pl.col("t_dat").str.strptime(pl.Datetime, "%Y-%m-%d"))

transactions_df.head()

In [None]:
transactions_df = compute_features_transactions(transactions_df)
transactions_df.shape

#### Sampling from the dataset 

In [None]:
# Down-sample customers and transactions together, using the size configured in
# `settings.CUSTOMER_DATA_SIZE`.
# NOTE(review): the `transations_df` keyword (sic) has to match the parameter name declared by
# `DatasetSampler.sample`, which is not visible here — do not correct the spelling at this call
# site alone.
sampler = DatasetSampler(size=settings.CUSTOMER_DATA_SIZE)
dataset_subset = sampler.sample(
    customers_df=customers_df, transations_df=transactions_df
)
# Both names are rebound to the sampled frames, so re-running this cell samples from data that
# has already been sampled.
customers_df = dataset_subset["customers"]
transactions_df = dataset_subset["transactions"]

## Interaction Data

To train our models, we need more than just the transactions DataFrame.  We need positive samples that signal whether a customer clicked or bought an item, but we also need negative samples that signal no interactions between a customer and an item.

In [None]:
interaction_df = generate_interaction_data(transactions_df)
interaction_df.shape

In [None]:
interaction_df.group_by("interaction_score").agg(
    pl.count("interaction_score").alias("total_interactions")
)

In [None]:
transactions_df.shape

In [None]:
# Copies of the prepared DataFrames under feature-group style names. The customers,
# transactions and articles copies are the inputs of `compute_ranking_dataset_new` below.
# NOTE(review): these are plain in-memory clones, not objects read back from a feature store.
customers_fg = customers_df.clone()
trans_fg = transactions_df.clone()
articles_fg = articles_df.clone()
interactions_fg  = interaction_df.clone()

## Compute ranking dataset

The last step is to compute the ranking dataset used to train the scoring/ranking model from the feature groups we've just created:

In [None]:
articles_fg.head(1)

In [None]:
def compute_ranking_dataset_new(
    trans_fg: pl.DataFrame,
    articles_fg: pl.DataFrame,
    customers_fg: pl.DataFrame,
    *,
    neg_ratio: int = 10,
) -> pl.DataFrame:
    """Build the labelled (customer, article) dataset used to train the ranking model.

    Every observed transaction becomes a positive pair (``label == 1``). For each
    positive pair, ``neg_ratio`` negative pairs (``label == 0``) are generated by
    pairing a randomly drawn customer with a randomly drawn article. The article
    attributes are then attached to every pair.

    Args:
        trans_fg: Polars DataFrame with at least ``article_id`` and ``customer_id``.
        articles_fg: Polars DataFrame with ``article_id`` and the article attribute
            columns listed in ``item_feature_columns`` below.
        customers_fg: Polars DataFrame with at least ``customer_id`` and ``age``.
        neg_ratio: Number of negative pairs generated per positive pair. Defaults
            to 10.

    Returns:
        pl.DataFrame: One row per pair with ``customer_id``, ``age``,
        ``article_id``, ``label`` and the article attribute columns.

    Raises:
        ValueError: If ``neg_ratio`` is negative.

    Notes:
        Negative pairs are drawn at random and are not filtered against the
        observed transactions, so some of them may coincide with real purchases.
    """
    if neg_ratio < 0:
        raise ValueError(f"neg_ratio must be non-negative, got {neg_ratio}")

    # Article ids are cast to strings so they share a dtype with the item features joined at
    # the end of the function.
    trans_df = trans_fg.select(["article_id", "customer_id"]).with_columns(
        pl.col("article_id").cast(pl.Utf8)
    )
    customers_df = customers_fg.select(["customer_id", "age"])

    # Positive pairs: every observed transaction, enriched with the customer's age.
    # The article attributes are not joined here: only the query features are kept at this
    # stage, and the attributes are attached to positives and negatives by the final join.
    query_features = ["customer_id", "age", "article_id"]
    positive_pairs = trans_df.join(customers_df, on="customer_id", how="left").select(
        query_features
    )

    n_neg = len(positive_pairs) * neg_ratio

    if n_neg == 0:
        # Nothing to sample from (or no negatives requested): keep an empty, same-schema frame.
        negative_pairs = positive_pairs.clear()
    else:
        # maintain_order=True makes the pool of distinct articles deterministic, so the seeded
        # sample below is reproducible from run to run.
        article_ids = (
            positive_pairs.select("article_id")
            .unique(maintain_order=True)
            .sample(n=n_neg, with_replacement=True, seed=2)
            .get_column("article_id")
        )

        # customer_id and age are drawn from the same row. Sampling them independently would
        # attach another customer's age to the sampled customer.
        negative_pairs = (
            positive_pairs.select(["customer_id", "age"])
            .sample(n=n_neg, with_replacement=True, seed=3)
            .with_columns(article_ids.alias("article_id"))
            .select(query_features)
        )

    # Label and stack both kinds of pairs.
    ranking_df = pl.concat([
        positive_pairs.with_columns(pl.lit(1).alias("label")),
        negative_pairs.with_columns(pl.lit(0).alias("label")),
    ])

    # Item features: one row per article id, restricted to the categorical attributes.
    item_feature_columns = [
        "article_id",
        "product_type_name",
        "product_group_name",
        "graphical_appearance_name",
        "colour_group_name",
        "perceived_colour_value_name",
        "perceived_colour_master_name",
        "department_name",
        "index_name",
        "index_group_name",
        "section_name",
        "garment_group_name",
    ]
    item_df = (
        articles_fg.with_columns(pl.col("article_id").cast(pl.Utf8))
        .unique(subset=["article_id"])
        .select(item_feature_columns)
    )

    # Attach the item features to every (customer, article) pair.
    return ranking_df.join(item_df, on="article_id", how="left")

In [None]:
# changed the function here 
ranking_df = compute_ranking_dataset_new(
    trans_fg,
    articles_fg,
    customers_fg,
)
ranking_df.shape

In [None]:
ranking_df.head(3)

In [None]:
ranking_df.get_column("label").value_counts()