In [1]:
!python --version

Python 3.7.11


Datasets
- H&M
 - Goal: predict what article each customer will purchase after training date ends.
 - Consists of articles.csv, customers.csv, transactions_train.csv

References:
- 


- Datasets
    - https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations
    - https://grouplens.org/datasets/movielens/

In [1]:
import numpy as np
import os
import pandas as pd

In [3]:
def convert_csv_to_parquet(folder_path, file):
    """
    Reads csv then convert to parquet file then remove original csv file
    
    parameters
    ---------
    folder_path : str, full path to folder that contain the file
    file : str, file_name with extension
    
    returns
    -------
    nothing
    
    """
    f_name = file.split(".")[0]
    file_path = os.path.join(folder_path, f_name)
    csv_file_path = file_path + ".csv"
    df = pd.read_csv(csv_file_path)
    df.to_parquet(file_path + ".parquet")
    os.remove(csv_file_path)

In [4]:
folder_path = '../datasets/h&m'
data_files = os.listdir(folder_path)
# for f in data_files:
#     convert_csv_to_parquet(folder_path, f)

In [5]:
article_df = pd.read_parquet(os.path.join(folder_path, "articles.parquet"))
customer_df = pd.read_parquet(os.path.join(folder_path, "customers.parquet"))
transaction_df = pd.read_csv(os.path.join(folder_path, "transactions_train.csv"))

In [6]:
print(article_df.shape)
print(customer_df.shape)
print(transaction_df.shape)

(105542, 25)
(1371980, 7)
(31788324, 5)


In [7]:
transaction_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [11]:
transaction_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
article_id,31788324.0,696227200.0,133448000.0,108775000.0,632803000.0,714582000.0,786524000.0,956217000.0
price,31788324.0,0.02782927,0.01918113,1.694915e-05,0.01581356,0.02540678,0.03388136,0.5915254
sales_channel_id,31788324.0,1.704028,0.4564786,1.0,1.0,2.0,2.0,2.0


## Matrix Factorization


1. Create rating matrix

In [8]:
"""
First we must convert interaction data into rating matrix form.
where columns refer to users and rows to items.
each cell is interaction between (user, item) pair

"""

transaction_df_sampled = transaction_df.sample(100)

In [39]:
def get_rating_matrix(df, item_col, user_col, int_col, _type):
    """
    For given interaction data(df) convert it to rating matrix.
     - columns refer to each items
     - rows refer to each users
    
    parameters
    ---------
    user_col : str
    item_col : str
    int_col : str
    _type : str, binary or continuous
            if binary values in int_col become 0 if Nan else 1
    """
    rating_matrix = df.pivot(index=item_col, columns=user_col, values=int_col).values
    if _type == 'binary':
        rating_matrix = np.where(rating_mx>0, 1, rating_mx)
    
    return rating_matrix

In [58]:
rating_mx = get_rating_matrix(transaction_df_sampled, "article_id", "customer_id", "price", _type='binary')

In [57]:
n_pos = np.sum(rating_mx == 1)
n_neg = rating_mx.size - n_pos

print(f"there exists {round(n_pos/(rating_mx.size), 2)}% positive samples")

there exists 0.01% positive samples


In [52]:
rating_mx.shape

(100, 100)

In [53]:
rating_mx.size

10000

In [47]:
pd.DataFrame(rating_mx)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,,,,,,1.0,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,1.0,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,,,,,,,,,,,...,,,,,,,,,,
96,,,,,,,,,,,...,,,,,,,,,,
97,,,,,,,,,,,...,1.0,,,,,,,,,
98,,,,,,,,,,,...,,,,,,,,,,


In [12]:
transaction_df_sampled.pivot(index='article_id', columns='customer_id', values='price')

customer_id,002156b708c7c6dd8afe31a743131d13b1e5dcbf2ce8c4a8f9b9ac59a5d9b76f,0054196d5f61e3927c8db8c1435e3a22e6bbdbb0344d310fd2ad6cdf1628b63f,014fc30519ba84643598f805bb48df3c261268deb9c7a84eda7143267374235a,07b5c17f898706e7c17d5ed700bf9512517a110b38f3403ece906934f15230cf,08c091c4b077b780a4dabca25b371c9d8199b5b3e597dc21fe6abaef9c768116,08cce4f6d1c9a8d2c3022e4056c066adaf410c4bb55db423ef9835b6f1c0d475,0948c6b82d12f1376c713c39c1299bbee94206dc9f1ed069a9f1cef6f550520a,095369884d2044e2343ec2a6e5bffbdb6480f74a0fce48fb24904765d617a4fd,0a3917d97ec42a561e0626db2ba1e5641fa6b35ce8355116fe45ed72781fbb8a,0c5ea4fa2353ddbadcf5c22203053a30402973b02300a19fb25a9a4bcfcbcee4,...,e9bf12f7bf9f39b83bb9408c53808b732bf40416773884290101c43436f90350,e9db5f91ebd8ec0de55ba8f2511d12fdd1649ff9304df2280fbea2d01b715d5b,ea1322688905e6ebe467aaa02a0cdb1f63548834a81234a9b2b073cc8425ffa2,ee3d9d5766d4791af87a4ee14c87a865013816925e75dbc0f20c28383d9e526a,f0cbe305b480af7d2ebc2a768f7bb2e56b832c18ac977235fbfac007acc1a1e1,f1878fd12192ac3c431aabb31df27840f3de0c342f0fddbfe0afe6d81a8ca181,f89c015a6a4168a6e867eebe3667248bcb0be600559c5e001498dbfc82c536a8,fa219b0bd5d0f724ad905c7f2e5dabcbe12ca3de037b5cc6e3506d0c98778161,fd74db396b2581b3fbb5054f8464d5846ecde4e36bac01aac5822ccb9aee25c6,fd9953564bf3471c62157ae4db999697bbaf081930d0c99ea6f50d5c73e4da13
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
266875001,,,,,,0.016932,,,,,...,,,,,,,,,,
377277036,,,,,,,,,,,...,,,,,,,,,,
399201005,,,,,,,,,,,...,,,,,0.084729,,,,,
399223035,,,,,,,,,,,...,,,,,,,,,,
448515001,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
876658001,,,,,,,,,,,...,,,,,,,,,,
879781002,,,,,,,,,,,...,,,,,,,,,,
881759003,,,,,,,,,,,...,0.022017,,,,,,,,,
887770002,,,,,,,,,,,...,,,,,,,,,,
