## <span style="color:#ff5f27">👩🏻‍🔬 Feature Engineering </span>


In [None]:
import time

# Record the wall-clock start of the whole notebook; the last code cell subtracts this
# value to report the total run time.
notebook_start_time = time.time()

## <span style="color:#ff5f27">📝 Imports </span>

In [None]:
!pip install -r requirements.txt --quiet

In [None]:
import random
import polars as pl
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

from features.articles import (
    compute_features_articles,
    generate_embeddings_for_dataframe,
)
from features.customers import CustomerDatasetSize, DatasetSampler, compute_features_customers
from features.transactions import compute_features_transactions, month_cos, month_sin
from features.interaction import generate_interaction_data
from features.ranking import compute_ranking_dataset  

In [None]:
# Data size configuration: this value is passed to DatasetSampler in the data sampling
# step to choose how much of the customers/transactions data is kept.
CUSTOMER_DATA_SIZE = CustomerDatasetSize.SMALL 

## <span style="color:#ff5f27">🔮 Connect to Hopsworks Feature Store </span>

In [None]:
import hopsworks

# Log in to Hopsworks and keep the project handle.
project = hopsworks.login()

# Feature store handle; used further down to create every feature group.
fs = project.get_feature_store()

## <span style="color:#ff5f27">🗄️ Read Articles Data</span>

The **article_id** and **product_code** serve different purposes in the context of H&M's product database:

- **Article ID**: This is a unique identifier assigned to each individual article within the database. It is typically used for internal tracking and management purposes. Each distinct item or variant of a product (e.g., different sizes or colors) would have its own unique article_id.

- **Product Code**: This is also a unique identifier, but it is associated with a specific product or style rather than individual articles. It represents a broader category or type of product within H&M's inventory. Multiple articles may share the same product code if they belong to the same product line or style.

While both are unique identifiers, the article_id is specific to individual items, whereas the product_code represents a broader category or style of product.

Here is an example:

**Product: Basic T-Shirt**

- **Product Code:** TS001

- **Article IDs:**
    - Article ID: 1001 (Size: Small, Color: White)
    - Article ID: 1002 (Size: Medium, Color: White)
    - Article ID: 1003 (Size: Large, Color: White)
    - Article ID: 1004 (Size: Small, Color: Black)
    - Article ID: 1005 (Size: Medium, Color: Black)

In this example, "TS001" is the product code for the basic t-shirt style. Each variant of this t-shirt (e.g., different sizes and colors) has its own unique article_id.



In [None]:
# Start the timer; the elapsed time for this load is printed by the next cell.
start_time = time.time()


# Load articles data from the remote CSV, letting polars try to parse date-like columns.
articles_df = pl.read_csv('https://repo.hops.works/dev/jdowling/h-and-m/articles.csv', try_parse_dates=True)
print(articles_df.shape)
articles_df.head(3)

In [None]:
# End the timer started in the articles loading cell.
end_time = time.time()

# Elapsed wall-clock seconds for the articles load, printed with two decimals.
execution_time = end_time - start_time
print(f"‚åõÔ∏è Execution time: {execution_time:.2f} seconds")

In [None]:
# Count null (missing) values per column. Note: null_count() reports nulls, which in
# polars are distinct from floating-point NaN values.
articles_df.null_count()

## <span style="color:#ff5f27">👨🏻‍🏭 Articles Feature Engineering</span>


In [None]:
# Start the timer; the elapsed time for this step is printed by the next cell.
start_time = time.time()


# Replace the raw articles frame with the engineered one returned by the project helper.
articles_df = compute_features_articles(articles_df)
articles_df.head(3)

In [None]:
# End the timer started in the articles feature engineering cell.
end_time = time.time()

# Elapsed wall-clock seconds for the articles feature engineering step.
execution_time = end_time - start_time
print(f"‚åõÔ∏è Execution time: {execution_time:.2f} seconds")

In [None]:
# Print the first article description as plain text (print avoids the truncated repr).
print(articles_df['article_description'][0])

## <span style="color:#ff5f27">🧬 Embeddings Creation</span>

In [None]:
# Choose the torch device in order of preference: CUDA, then the MPS backend, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Load the embedding model on the selected device. Its embedding dimension is read later
# when the embedding index for the articles feature group is defined.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [None]:
# Run the project helper over the "article_description" column with the loaded model.
# Presumably it returns the frame with an added `embeddings` column (the next cell
# selects that column) — confirm against features.articles.
articles_df = generate_embeddings_for_dataframe(
    articles_df, "article_description", model, batch_size=128
)  # Reduce batch size if getting OOM errors.

In [None]:
# Preview the description text next to its embedding for the first three articles.
articles_df[['article_description', 'embeddings']].head(3)

## <span style="color:#ff5f27">🔗 Image Links</span>

In [None]:
# Show the image URL of the first article.
articles_df["image_url"][0]

In [None]:
from html import escape

from IPython.display import HTML, display

# Preview the images of the last 12 articles in a 6-column grid.
image_urls = articles_df["image_url"].tail(12).to_list()

# The URLs come from the dataset, so escape them before placing them inside an HTML
# attribute; str() keeps a missing value rendering as text instead of raising.
image_tags = "".join(
    f'<img src="{escape(str(url), quote=True)}" style="width: 100%; height: auto;">'
    for url in image_urls
)

# Assemble the markup once instead of growing a string inside a loop.
grid_html = (
    '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'
    + image_tags
    + "</div>"
)

display(HTML(grid_html))

---
## <span style="color:#ff5f27">🗄️ Read Customers Data</span>

In [None]:
# Start the timer; the elapsed time for this load is printed by the next cell.
start_time = time.time()


# Load customers data from the remote CSV, letting polars try to parse date-like columns.
customers_df = pl.read_csv('https://repo.hops.works/dev/jdowling/h-and-m/customers.csv', try_parse_dates=True)
print(customers_df.shape)
customers_df.head(3)

In [None]:
# End the timer started in the customers loading cell.
end_time = time.time()

# Elapsed wall-clock seconds for the customers load, printed with two decimals.
execution_time = end_time - start_time
print(f"‚åõÔ∏è Execution time: {execution_time:.2f} seconds")

## <span style="color:#ff5f27">👨🏻‍🏭 Customers Feature Engineering</span>


In [None]:
# Start the timer; the elapsed time for this step is printed by the next cell.
start_time = time.time()


# Replace the raw customers frame with the engineered one returned by the project helper.
# NOTE(review): drop_null_age=True presumably removes customers without an age — confirm
# against features.customers.
customers_df = compute_features_customers(customers_df, drop_null_age=True)
customers_df.head(3)

In [None]:
# End the timer started in the customers feature engineering cell.
end_time = time.time()

# Elapsed wall-clock seconds for the customers feature engineering step.
execution_time = end_time - start_time
print(f"‚åõÔ∏è Execution time: {execution_time:.2f} seconds")

---
## <span style="color:#ff5f27">🗄️ Read Transactions Data</span>

In [None]:
# Start the timer; the elapsed time for this load is printed by the next cell.
start_time = time.time()


# Load transactions data from the remote CSV, letting polars try to parse date-like columns.
transactions_df = pl.read_csv('https://repo.hops.works/dev/jdowling/h-and-m/transactions_train.csv', try_parse_dates=True)
print(transactions_df.shape)
transactions_df.head(3)

In [None]:
# End the timer started in the transactions loading cell.
end_time = time.time()

# Elapsed wall-clock seconds for the transactions load, printed with two decimals.
execution_time = end_time - start_time
print(f"‚åõÔ∏è Execution time: {execution_time:.2f} seconds")

## <span style="color:#ff5f27">👨🏻‍🏭 Transactions Feature Engineering</span>

The time of the year a purchase was made should be a strong predictor, as seasonality plays a big role in fashion purchases. Here, you will use the month of the purchase as a feature. Since this is a cyclical feature (January is as close to December as it is to February), you'll map each month to the unit circle using sine and cosine.

In [None]:
# Start the timer; the elapsed time for this step is printed by the next cell.
start_time = time.time()


# Replace the raw transactions frame with the engineered one returned by the project helper.
transactions_df = compute_features_transactions(transactions_df)
transactions_df.head(3)

In [None]:
# End the timer started in the transactions feature engineering cell.
end_time = time.time()

# Elapsed wall-clock seconds for the transactions feature engineering step.
execution_time = end_time - start_time
print(f"‚åõÔ∏è Execution time: {execution_time:.2f} seconds")

## <span style="color:#ff5f27">✂️ Data Sampling</span>


In [None]:
# Sample customers and transactions down to the size chosen in CUSTOMER_DATA_SIZE.
sampler = DatasetSampler(size=CUSTOMER_DATA_SIZE)

# NOTE(review): the keyword really is spelled `transations_df` here; presumably it mirrors
# the parameter name in DatasetSampler.sample — confirm before "fixing" the spelling.
dataset_subset = sampler.sample(customers_df=customers_df, transations_df=transactions_df)

# From here on both frames refer to the sampled subsets.
customers_df, transactions_df = (
    dataset_subset["customers"],
    dataset_subset["transactions"],
)

---

## <span style="color:#ff5f27">🤳🏻 Interaction Data</span>


In [None]:
# Generate the interaction data from the (sampled) transactions with the project helper.
interaction_df = generate_interaction_data(transactions_df)

# Report the size of the result, then preview its first rows.
print(interaction_df.shape)
interaction_df.head()

In [None]:
# Number of interactions recorded for each interaction score.
score_total = pl.col('interaction_score').count().alias('total_interactions')
interaction_df.group_by('interaction_score').agg(score_total)

Here is what each score means:

- `0` : No interaction between a customer and an item
- `1` : A customer clicked an item
- `2` : A customer bought an item

---

## <span style="color:#ff5f27">🪄 Feature Group Creation </span>

A [feature group](https://docs.hopsworks.ai/feature-store-api/latest/generated/feature_group/) can be seen as a collection of conceptually related features.

Before you can create a feature group you need to connect to your feature store.

To create a feature group you need to give it a name and specify a primary key. It is also good to provide a description of the contents of the feature group.

### <span style="color:#ff5f27">⛳️ Customers </span>


In [None]:
# Register (or fetch, if it already exists) the customers feature group metadata.
# Nothing is written here; the data is inserted in a later cell.
customers_fg = fs.get_or_create_feature_group(
    name="customers",
    version=1,
    primary_key=["customer_id"],
    online_enabled=True,
    description="Customers data including age and postal code",
)

Here you have also set `online_enabled=True`, which enables low latency access to the data. A full list of arguments can be found in the [documentation](https://docs.hopsworks.ai/feature-store-api/latest/generated/api/feature_store_api/#create_feature_group).

At this point, you have only specified some metadata for the feature group. It does not store any data or even have a schema defined for the data. To make the feature group persistent you populate it with its associated data using the `insert` method.

In [None]:
# Insert the customers data and block until the ingestion job has finished, using the
# same write option as the articles insert. `customers_fg` is passed to
# compute_ranking_dataset later in this notebook, so its data should be fully written
# before that step runs.
customers_fg.insert(
    customers_df,
    write_options={"wait_for_job": True},
)
print('‚úÖ Done!')

In [None]:
# Human-readable description for each customers feature, built from (name, text) pairs.
feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in (
        ("customer_id", "Unique identifier for each customer."),
        ("club_member_status", "Membership status of the customer in the club."),
        ("age", "Age of the customer."),
        ("postal_code", "Postal code associated with the customer's address."),
        ("age_group", "Categorized age group of the customer."),
    )
]

# Attach every description to the customers feature group.
for entry in feature_descriptions:
    customers_fg.update_feature_description(entry["name"], entry["description"])

Let's do the same thing for the rest of the data frames.

### <span style="color:#ff5f27">⛳️ Transactions </span>


In [None]:
# Register (or fetch) the transactions feature group. The month_sin / month_cos
# transformation functions are attached to the group and `t_dat` is its event time.
trans_fg = fs.get_or_create_feature_group(
    name="transactions",
    version=1,
    description="Transactions data including customer, item, price, sales channel and transaction date",
    primary_key=["customer_id", "article_id"],
    online_enabled=True,
    transformation_functions=[month_sin, month_cos],
    event_time="t_dat",
)

# Insert the data and block until the ingestion job has finished, using the same write
# option as the articles insert. `trans_fg` is passed to compute_ranking_dataset later in
# this notebook, so its data should be fully written before that step runs.
trans_fg.insert(
    transactions_df,
    write_options={"wait_for_job": True},
)
print('‚úÖ Done!')

In [None]:
# Human-readable description for each transactions feature.
feature_descriptions = [
    dict(name="t_dat", description="Timestamp of the data record."),
    dict(name="customer_id", description="Unique identifier for each customer."),
    dict(name="article_id", description="Identifier for the purchased article."),
    dict(name="price", description="Price of the purchased article."),
    dict(name="sales_channel_id", description="Identifier for the sales channel."),
    dict(name="year", description="Year of the transaction."),
    dict(name="month", description="Month of the transaction."),
    dict(name="day", description="Day of the transaction."),
    dict(name="day_of_week", description="Day of the week of the transaction."),
    dict(name="month_sin", description="Sine of the month used for seasonal patterns."),
    dict(name="month_cos", description="Cosine of the month used for seasonal patterns."),
]

# Attach every description to the transactions feature group.
for entry in feature_descriptions:
    trans_fg.update_feature_description(entry["name"], entry["description"])

### <span style="color:#ff5f27">⛳️ Interactions </span>


In [None]:
# Register (or fetch) the interactions feature group, keyed by customer and article,
# with `t_dat` as its event time.
interactions_fg = fs.get_or_create_feature_group(
    name="interactions",
    version=1,
    primary_key=["customer_id", "article_id"],
    event_time="t_dat",
    online_enabled=True,
    description="Customer interactions with articles including purchases, clicks, and ignores. Used for building recommendation systems and analyzing user behavior.",
)

# Populate the feature group with the generated interaction data.
interactions_fg.insert(interaction_df)
print('‚úÖ Done!')

In [None]:
# Descriptions for the interactions features, expanded from a name -> text mapping
# (dict insertion order keeps the original ordering).
feature_descriptions = [
    {"name": feature_name, "description": feature_text}
    for feature_name, feature_text in {
        "t_dat": "Timestamp of the interaction.",
        "customer_id": "Unique identifier for each customer.",
        "article_id": "Identifier for the article that was interacted with.",
        "interaction_score": "Type of interaction: 0 = ignore, 1 = click, 2 = purchase.",
        "prev_article_id": "Previous article that the customer interacted with, useful for sequential recommendation patterns.",
    }.items()
]

# Attach every description to the interactions feature group.
for entry in feature_descriptions:
    interactions_fg.update_feature_description(entry["name"], entry["description"])

### <span style="color:#ff5f27">⛳️ Articles </span>


In [None]:
from hsfs.feature import Feature

# Explicit schema for the articles feature group. Each row is
# (name, type, description, extra Feature keyword arguments); only article_description
# needs an extra argument, an explicit online type.
features = [
    Feature(name=column, type=dtype, description=text, **extra)
    for column, dtype, text, extra in [
        ("article_id", "string", "Identifier for the article.", {}),
        ("product_code", "bigint", "Code associated with the product.", {}),
        ("prod_name", "string", "Name of the product.", {}),
        ("product_type_no", "bigint", "Number associated with the product type.", {}),
        ("product_type_name", "string", "Name of the product type.", {}),
        ("product_group_name", "string", "Name of the product group.", {}),
        ("graphical_appearance_no", "bigint", "Number associated with graphical appearance.", {}),
        ("graphical_appearance_name", "string", "Name of the graphical appearance.", {}),
        ("colour_group_code", "bigint", "Code associated with the colour group.", {}),
        ("colour_group_name", "string", "Name of the colour group.", {}),
        ("perceived_colour_value_id", "bigint", "ID associated with perceived colour value.", {}),
        ("perceived_colour_value_name", "string", "Name of the perceived colour value.", {}),
        ("perceived_colour_master_id", "bigint", "ID associated with perceived colour master.", {}),
        ("perceived_colour_master_name", "string", "Name of the perceived colour master.", {}),
        ("department_no", "bigint", "Number associated with the department.", {}),
        ("department_name", "string", "Name of the department.", {}),
        ("index_code", "string", "Code associated with the index.", {}),
        ("index_name", "string", "Name of the index.", {}),
        ("index_group_no", "bigint", "Number associated with the index group.", {}),
        ("index_group_name", "string", "Name of the index group.", {}),
        ("section_no", "bigint", "Number associated with the section.", {}),
        ("section_name", "string", "Name of the section.", {}),
        ("garment_group_no", "bigint", "Number associated with the garment group.", {}),
        ("garment_group_name", "string", "Name of the garment group.", {}),
        ("prod_name_length", "bigint", "Length of the product name.", {}),
        ("article_description", "string", "Description of the article.", {"online_type": "VARCHAR(5800)"}),
        ("embeddings", "array<double>", "Vector embeddings of the article description.", {}),
        ("image_url", "string", "URL of the product image.", {}),
    ]
]

In [None]:
from hsfs import embedding

# Size of the vectors produced by the sentence-transformer model loaded earlier.
embedding_dim = model.get_sentence_embedding_dimension()

# Create the embedding index and register the "embeddings" column with that dimension.
emb = embedding.EmbeddingIndex()
emb.add_embedding("embeddings", embedding_dim)

In [None]:
# Register (or fetch) the articles feature group with the explicit schema and the
# embedding index defined above.
articles_fg = fs.get_or_create_feature_group(
    name="articles",
    version=1,
    primary_key=["article_id"],
    features=features,
    embedding_index=emb,
    online_enabled=True,
    description="Fashion items data including type of item, visual description and category",
)

# Insert the articles and block until the ingestion job has finished.
articles_fg.insert(articles_df, write_options={"wait_for_job": True})
print('‚úÖ Done!')

## <span style="color:#ff5f27">📊 Ranking Dataset </span>


In [None]:
# Start the timer for the ranking dataset computation in the next cell.
start_time = time.time()

In [None]:
# Build the ranking dataset with the project helper, which receives the transactions,
# articles and customers feature groups (not the in-memory data frames).
ranking_df = compute_ranking_dataset(
    trans_fg,
    articles_fg,
    customers_fg,
)
ranking_df.head(3)

In [None]:
# End the timer started before the ranking dataset computation.
end_time = time.time()

# Elapsed wall-clock seconds for building the ranking dataset.
execution_time = end_time - start_time
print(f"‚åõÔ∏è Execution time: {execution_time:.2f} seconds")

In [None]:
# Count how many rows carry each value of the `label` column.
ranking_df.get_column("label").value_counts()

In [None]:
# Register (or fetch) the ranking feature group. `parents` lists the feature groups it is
# derived from; unlike the other groups, online_enabled is not set here.
rank_fg = fs.get_or_create_feature_group(
    name="ranking",
    version=1,
    primary_key=["customer_id", "article_id"],
    parents=[articles_fg, customers_fg, trans_fg],
    description="Derived feature group for ranking",
)

# Populate the feature group with the computed ranking dataset.
rank_fg.insert(ranking_df)
print('‚úÖ Done!')

In [None]:
# Human-readable description for each ranking feature.
ranking_feature_descriptions = [
    {"name": "customer_id", "description": "Unique identifier for each customer."},
    {"name": "article_id", "description": "Identifier for the purchased article."},
    {"name": "age", "description": "Age of the customer."},
    {"name": "product_type_name", "description": "Name of the product type."},
    {"name": "product_group_name", "description": "Name of the product group."},
    {"name": "graphical_appearance_name", "description": "Name of the graphical appearance."},
    {"name": "colour_group_name", "description": "Name of the colour group."},
    {"name": "perceived_colour_value_name", "description": "Name of the perceived colour value."},
    {"name": "perceived_colour_master_name", "description": "Name of the perceived colour master."},
    {"name": "department_name", "description": "Name of the department."},
    {"name": "index_name", "description": "Name of the index."},
    {"name": "index_group_name", "description": "Name of the index group."},
    {"name": "section_name", "description": "Name of the section."},
    {"name": "garment_group_name", "description": "Name of the garment group."},
    {"name": "label", "description": "Label indicating whether the article was purchased (1) or not (0)."},
]

# Attach every description to the ranking feature group, as is done for the customers,
# transactions and interactions groups; without this loop the list above is never used.
# NOTE(review): assumes every listed feature is a column of ranking_df — confirm.
for desc in ranking_feature_descriptions:
    rank_fg.update_feature_description(desc["name"], desc["description"])

You should now be able to inspect the feature groups in the Hopsworks UI.

---

In [None]:
# End the notebook-wide timer started in the first code cell.
notebook_end_time = time.time()

# Total wall-clock seconds for the whole notebook, printed with two decimals.
notebook_execution_time = notebook_end_time - notebook_start_time
print(f"‚åõÔ∏è Notebook Execution time: {notebook_execution_time:.2f} seconds")

---
## <span style="color:#ff5f27">⏩️ Next Steps </span>
In the next notebook you'll train a retrieval model.