Updated: Jul18th, 2022

In [1]:
from hmcollab import datasets
from hmcollab import articles
from hmcollab import transactions
from hmcollab import models

In [2]:
import math
import pandas as pd
from sklearn.cluster import KMeans
from IPython.display import Image, display
from datetime import datetime, timedelta

In [3]:
# note that display_articles is having issues finding some images
def display_articles(dataset, articles):
    # articles is a list of article_ids
    for article_id in articles:
        filename = dataset.tree.image(article_id)
        display(Image(filename, width = 300, height = 100))


## Toy dataset

In [4]:
toy = datasets.HMDataset(toy=True)
print('Transactions shape: ', toy.transactions.shape)
print('Unique customers: ',toy.transactions.customer_id.unique().shape)
print('train_x shape: ', toy.train_x.shape)
print('train_y shape: ', toy.train_y.shape)
print('test_x shape: ', toy.test_x.shape)
print('test_y shape: ', toy.test_y.shape)
toy_target = datasets.Target(toy.transactions)
print('transactions_x: ', toy_target.transactions_x.shape)
print('transactions_y: ', toy_target.transactions_y.shape)
print('Relevant set: ', toy_target.relevant_set.shape)

Transactions shape:  (252406, 5)
Unique customers:  (10899,)
train_x shape:  (197978, 5)
train_y shape:  (1603, 5)
test_x shape:  (52393, 5)
test_y shape:  (432, 5)
transactions_x:  (250371, 5)
transactions_y:  (2035, 5)
Relevant set:  (578, 2)


In [5]:
toy_dummies = articles.ArticleFeaturesSimpleFeatures(toy.articles, use_article_id=True).x
print('articles: ', toy_dummies.shape)  # (105542, 652)

articles:  (105542, 652)


In [6]:
toy_knn = models.KnnRecommender(toy, toy_dummies)  # already using only train set

In [7]:
toy_knn.t.df.shape

(197978, 5)

In [8]:
toy_train_customer_ids = toy.train_x.customer_id.unique()
print('Unique customers at test transactions file', toy_train_customer_ids.shape)   # (8691,)

Unique customers at test transactions file (8691,)


In [9]:
toy_test_customer_ids = toy.test_x.customer_id.unique()
print('Unique customers at test transactions file', toy_test_customer_ids.shape)   # (2172,)

Unique customers at test transactions file (2172,)


In [10]:
toy.test_x.loc[toy.test_x.customer_id.isin(['041b31a7f12c6ccde9c1f2d9e8fd5cb960cdddffe289b443bb4326a47e506a51']),:].article_id.nunique()

52

In [10]:
# Recommendations for the first customer
toy_recommendations_1st = toy_knn.recommend(toy_train_customer_ids[0])
toy_recommendations_1st

customer_dummies 205


['0399087010',
 '0399136034',
 '0188183001',
 '0377289002',
 '0509843003',
 '0518329007',
 '0712517005',
 '0805571001',
 '0546141001',
 '0608910001',
 '0505071001',
 '0508691009']

In [11]:
# Recommendations for the first customer from test
toy_test_recommendations_1st = toy_knn.recommend(toy_test_customer_ids[0])
toy_test_recommendations_1st

customer_dummies 0
Customer with few obs:  041b31a7f12c6ccde9c1f2d9e8fd5cb960cdddffe289b443bb4326a47e506a51


ValueError: Found array with 0 sample(s) (shape=(0, 651)) while a minimum of 1 is required.

In [None]:
display_articles(toy, toy_recommendations_1st)

In [None]:
# ValueError: n_samples=5 should be >= n_clusters=6  # with groups=6 => Implementend min_k
# Several warnings like below
# /Users/gina/Desktop/Gina/MachineLearning/Proyectos/HM/HM-clothing-project/hmcollab/transactions.py:52: 
# ConvergenceWarning: Number of distinct clusters (4) found smaller than n_clusters (5). 
# Possibly due to duplicate points in X. return kmeans.fit(customer_dummies)
begin = datetime.now()
toy_recommendations = toy_knn.recommend_all(toy_test_customer_ids)
print('Seconds: ', (datetime.now() - begin).total_seconds())
toy_recommendations.head()