In [1]:
# Optional one-time environment setup, kept commented out so that "Run All"
# does not reinstall dependencies. Uncomment both lines to install the
# requirements listed in pyproject.toml with uv.
# NOTE(review): inside a notebook, `%pip` targets the running kernel's
# environment, whereas `!pip` may resolve to a different interpreter.
# !pip install --upgrade uv
# !uv pip install --all-extras --system --requirement pyproject.toml

In [2]:
import sys
import warnings  # not used in this cell; the import cell calls warnings.filterwarnings
from pathlib import Path

# The notebook runs from a sub-folder of the repository, so the parent of the
# current working directory is put on sys.path to make the local `recsys`
# package importable.
root_dir = str(Path.cwd().parent)
already_on_path = root_dir in sys.path
if not already_on_path:
    print(f"Adding the following directory to the PYTHONPATH: {root_dir}")
    sys.path.append(root_dir)

Adding the following directory to the PYTHONPATH: /Users/1zuu/Library/Mobile Documents/com~apple~CloudDocs/ML STU/AI in Production/00-decoding-ml-tiktok-rec-system


In [3]:
from pprint import pprint

import polars as pl
import torch
from IPython.display import HTML, display
from loguru import logger
from sentence_transformers import SentenceTransformer

from recsys.config import settings
from recsys.features.articles import (
                                    compute_features_articles,
                                    generate_embeddings_for_dataframe,
                                    )
from recsys.features.customers import DatasetSampler, compute_features_customers
from recsys.features.interaction import generate_interaction_data
from recsys.features.ranking import compute_ranking_dataset
from recsys.features.transactions import compute_features_transactions
from recsys.hopsworks_integration import feature_store
from recsys.raw_data_sources import h_and_m as h_and_m_raw_data
# Silence every Python warning for the rest of the session. The filter is
# installed after the imports in this cell have run, so warnings raised while
# importing are still shown.
# NOTE(review): a blanket "ignore" also hides deprecation warnings — consider
# narrowing it to specific categories or modules.
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


### 0-Parameters

In [4]:
# Pretty-print the active project settings as a plain dict.
# NOTE(review): the recorded output masks the secret fields but does include an
# absolute local path (RECSYS_DIR) — clear this output before sharing.
pprint(dict(settings))

{'CUSTOMER_DATA_SIZE': <CustomerDatasetSize.SMALL: 'SMALL'>,
 'CUSTOM_HOPSWORKS_INFERENCE_ENV': 'custom_env_name',
 'FEATURES_EMBEDDING_MODEL_ID': 'all-MiniLM-L6-v2',
 'GITHUB_TOKEN': SecretStr('**********'),
 'HOPSWORKS_API_KEY': SecretStr('**********'),
 'OPENAI_API_KEY': SecretStr('**********'),
 'OPENAI_MODEL_ID': 'gpt-4o-mini',
 'RANKING_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'RANKING_EARLY_STOPPING_ROUNDS': 5,
 'RANKING_ITERATIONS': 100,
 'RANKING_LEARNING_RATE': 0.2,
 'RANKING_MODEL_TYPE': 'ranking',
 'RANKING_SCALE_POS_WEIGHT': 10,
 'RECSYS_DIR': PosixPath('/Users/1zuu/Library/Mobile Documents/com~apple~CloudDocs/ML STU/AI in Production/00-decoding-ml-tiktok-rec-system/recsys'),
 'TWO_TOWER_DATASET_TEST_SPLIT_SIZE': 0.1,
 'TWO_TOWER_DATASET_VALIDATON_SPLIT_SIZE': 0.1,
 'TWO_TOWER_LEARNING_RATE': 0.01,
 'TWO_TOWER_MODEL_BATCH_SIZE': 2048,
 'TWO_TOWER_MODEL_EMBEDDING_SIZE': 16,
 'TWO_TOWER_NUM_EPOCHS': 10,
 'TWO_TOWER_WEIGHT_DECAY': 0.001}


In [5]:
# List the dataset sizes the sampler supports; the recorded output maps each
# size label to an integer (presumably a number of customers — TODO confirm).
DatasetSampler.get_supported_sizes()

{<CustomerDatasetSize.LARGE: 'LARGE'>: 50000,
 <CustomerDatasetSize.MEDIUM: 'MEDIUM'>: 5000,
 <CustomerDatasetSize.SMALL: 'SMALL'>: 1000}

### 1-Connecting to Hopsworks

In [6]:
# Log in to Hopsworks and unpack the returned pair: the project handle and its
# feature store. The recorded log shows the login uses the HOPSWORKS_API_KEY
# setting and contacts the hosted Hopsworks service, so network access is needed.
project, fs = feature_store.get_feature_store()

[32m2025-01-12 09:38:05.258[0m | [1mINFO    [0m | [36mrecsys.hopsworks_integration.feature_store[0m:[36mget_feature_store[0m:[36m12[0m - [1mLoging to Hopsworks using HOPSWORKS_API_KEY env var.[0m


2025-01-12 09:38:05,261 INFO: Initializing external client
2025-01-12 09:38:05,261 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-12 09:38:12,398 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1209499


In [7]:
# Display the Hopsworks project handle.
# NOTE(review): its repr includes the account e-mail address — clear this
# output before committing or sharing the notebook.
project

Project('mlopsrecsys', 'isurualagiyawanna97@gmail.com', 'Default project')

In [8]:
# Display the feature store handle returned by get_feature_store().
fs

<hsfs.feature_store.FeatureStore at 0x32ef74550>

### 2-Analyze H&M Data

#### Article Data

In [9]:
# Load the raw H&M articles table and show its (rows, columns) shape.
articles_df = h_and_m_raw_data.extract_articles_df()
articles_df.shape

(105542, 25)

In [10]:
# Preview the first three raw article rows.
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
i64,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,str
108775015,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775044,108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"
108775051,108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""","""Jersey top with narrow shoulde…"


In [11]:
# Count missing values per column; in the recorded run only `detail_desc` has any.
articles_df.null_count()

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,416


#### Customer Data

In [18]:
# Load the raw H&M customers table and show its (rows, columns) shape.
customers_df = h_and_m_raw_data.extract_customers_df()
customers_df.shape

(1371980, 7)

In [19]:
# Preview the first three raw customer rows.
customers_df.head(3)

customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
str,f64,f64,str,str,i64,str
"""00000dbacae5abe5e23885899a1fa4…",,,"""ACTIVE""","""NONE""",49,"""52043ee2162cf5aa7ee79974281641…"
"""0000423b00ade91418cceaf3b26c6a…",,,"""ACTIVE""","""NONE""",25,"""2973abc54daa8a5f8ccfe9362140c6…"
"""000058a12d5b43e67d225668fa1f8d…",,,"""ACTIVE""","""NONE""",24,"""64f17e6a330a85798e4998f62d0930…"


In [20]:
# Count missing values per column; in the recorded run `FN` and `Active` are
# null for most customers, and `age` is missing for some.
customers_df.null_count()

customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
u32,u32,u32,u32,u32,u32,u32
0,895050,907576,6062,16009,15861,0


#### Transactions Data

In [21]:
# Load the raw H&M transactions table and show its (rows, columns) shape.
# The recorded run reports roughly 31.8 million rows.
transactions_df = h_and_m_raw_data.extract_transactions_df()
transactions_df.shape

(31788324, 5)

In [None]:
# Preview the first three raw transaction rows.
transactions_df.head(3)

### 3-Feature Engineering

In [12]:
# Derive the article features. In the recorded run this adds `prod_name_length`,
# `article_description` and `image_url`, drops `detail_desc`, and turns
# `article_id` into a string column.
# The result is stored back into `articles_df`, so the call is guarded on a
# column the transform adds: without the guard, re-running this cell would feed
# already-processed data into the transform a second time. Re-run the cell that
# loads the raw articles first to force a recomputation.
if "article_description" not in articles_df.columns:
    articles_df = compute_features_articles(articles_df)
articles_df.shape

(105542, 27)

In [13]:
# Preview the first three article rows after feature engineering.
articles_df.head(3)

article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,prod_name_length,article_description,image_url
str,i64,str,i64,str,str,i64,str,i64,str,i64,str,i64,str,i64,str,str,str,i64,str,i64,str,i64,str,u32,str,str
"""108775015""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",9,"""Black""",4,"""Dark""",5,"""Black""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""https://repo.hops.works/dev/jd…"
"""108775044""",108775,"""Strap top""",253,"""Vest top""","""Garment Upper body""",1010016,"""Solid""",10,"""White""",3,"""Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",9,"""Strap top - Vest top in Garmen…","""https://repo.hops.works/dev/jd…"
"""108775051""",108775,"""Strap top (1)""",253,"""Vest top""","""Garment Upper body""",1010017,"""Stripe""",11,"""Off White""",1,"""Dusty Light""",9,"""White""",1676,"""Jersey Basic""","""A""","""Ladieswear""",1,"""Ladieswear""",16,"""Womens Everyday Basics""",1002,"""Jersey Basic""",13,"""Strap top (1) - Vest top in Ga…","""https://repo.hops.works/dev/jd…"


In [14]:
# Log the first three generated article descriptions in full, numbered from 1;
# the DataFrame preview truncates them.
sample_descriptions = articles_df["article_description"].head(n=3)
for i, desc in enumerate(sample_descriptions):
    logger.info(f"Item {i+1}:\n{desc}")

[32m2025-01-12 09:39:55.126[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 1:
Strap top - Vest top in Garment Upper body
Appearance: Solid
Color: Dark Black (Black)
Category: Ladieswear - Womens Everyday Basics - Jersey Basic
Details: Jersey top with narrow shoulder straps.[0m
[32m2025-01-12 09:39:55.127[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 2:
Strap top - Vest top in Garment Upper body
Appearance: Solid
Color: Light White (White)
Category: Ladieswear - Womens Everyday Basics - Jersey Basic
Details: Jersey top with narrow shoulder straps.[0m
[32m2025-01-12 09:39:55.127[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m2[0m - [1mItem 3:
Strap top (1) - Vest top in Garment Upper body
Appearance: Stripe
Color: Dusty Light White (Off White)
Category: Ladieswear - Womens Everyday Basics - Jersey Basic
Details: Jersey top with narrow shoulder straps.[0m


In [15]:
# Choose where the embedding model runs: CUDA when available, otherwise Apple
# MPS when available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

logger.info(f"Loading '{settings.FEATURES_EMBEDDING_MODEL_ID}' embedding model to {device=}")

# Instantiate the SentenceTransformer named in the settings on the chosen device.
embed_model = SentenceTransformer(settings.FEATURES_EMBEDDING_MODEL_ID, device=device)

[32m2025-01-12 09:39:55.961[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m8[0m - [1mLoading 'all-MiniLM-L6-v2' embedding model to device='mps'[0m


2025-01-12 09:39:55,962 INFO: Load pretrained SentenceTransformer: all-MiniLM-L6-v2


In [16]:
# Embed every article description with the loaded model; the result carries the
# vectors in an `embeddings` column. The recorded run took about five minutes
# for ~105k articles, and the result overwrites `articles_df`, so the work is
# skipped when the column already exists (for example when this cell is re-run).
if "embeddings" not in articles_df.columns:
    articles_df = generate_embeddings_for_dataframe(
        articles_df,
        "article_description",
        embed_model,
        batch_size=128,  # Reduce batch size if getting OOM errors.
    )
articles_df[["article_description", "embeddings"]].head(3)

Generating embeddings: 100%|██████████| 105542/105542 [05:17<00:00, 332.74it/s]


article_description,embeddings
str,list[f64]
"""Strap top - Vest top in Garmen…","[-0.026782, 0.082344, … 0.022782]"
"""Strap top - Vest top in Garmen…","[-0.010396, 0.089874, … 0.022564]"
"""Strap top (1) - Vest top in Ga…","[-0.032753, 0.091124, … 0.022804]"


In [17]:
# Inspect the image URL generated for the first article.
articles_df["image_url"][0]

'https://repo.hops.works/dev/jdowling/h-and-m/images/010/0108775015.jpg'

In [22]:
# Render the images of the last 12 articles as a 6-column HTML grid.
# `HTML` and `display` are imported in the notebook's import cell rather than
# here, so every dependency is declared in one place.
image_urls = articles_df["image_url"].tail(12).to_list()

# Build all <img> tags with a single join instead of repeated concatenation.
image_tags = "".join(
    f'<img src="{url}" style="width: 100%; height: auto;">' for url in image_urls
)
grid_html = (
    '<div style="display: grid; grid-template-columns: repeat(6, 1fr); gap: 10px; max-width: 900px;">'
    + image_tags
    + "</div>"
)

display(HTML(grid_html))

In [None]:
# Derive the customer features and show the resulting (rows, columns) shape.
# `drop_null_age=True` presumably removes customers with a missing age — confirm
# against compute_features_customers.
# NOTE(review): the result overwrites `customers_df`, so re-running this cell
# passes already-processed data back into the transform — verify that is safe.
customers_df = compute_features_customers(customers_df, drop_null_age=True)
customers_df.shape