In [1]:
# Importing libraries
import pickle
import random
from os import cpu_count
from os import listdir
from os import makedirs
from typing import List

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from gensim.models import Word2Vec
from toolz.functoolz import pipe

In [2]:
def import_data(data_dir: str) -> List[pd.DataFrame]:
    """
    Read the CSV files of a directory into pandas dataframes.

    Every entry of `data_dir` is reduced to the text before its first '.'.
    Entries for which that text is empty (e.g. dotfiles such as ".DS_Store")
    are skipped; for each remaining entry `<data_dir>/<stem>.csv` is read.
    The result follows the order in which `listdir` returns the entries.

    Parameters:
    ----------------
    data_dir: str
      The path where the data is stored

    Returns:
    ----------------
    dataframes_ls: List[pd.DataFrame]
      A list of pandas dataframes, one per retained directory entry
    """
    stems = (entry.split('.')[0] for entry in listdir(data_dir))

    # Read the files directly instead of building and eval()-ing a source
    # string: this always yields a real list (also for a single file) and
    # honours the `data_dir` argument.
    dataframes_ls = [pd.read_csv(f'{data_dir}/{stem}.csv') for stem in stems if stem != ""]

    return dataframes_ls

In [3]:
# Load every CSV of the data directory (one DataFrame per file).
dataframes = import_data("../data")

# Map each file stem to its position in `dataframes`. The filter has to mirror
# the one applied inside import_data (entries whose stem is empty, e.g. the
# dotfile ".DS_Store", are skipped there); without it the positions drift and
# the wrong frame would be picked below.
files = [file.split('.')[0] for file in listdir("../data") if file.split('.')[0] != ""]
files_dict = dict(zip(files, range(len(files))))

# The product catalogue is the frame read from products.csv.
products = dataframes[files_dict['products']]

In [4]:
# Read the clustered data from parquet and convert the Arrow table to a pandas
# DataFrame.
# NOTE(review): later cells rely on `product_id`, `user_id` and `cluster`
# columns being available after the merge with `products` — confirm which of
# them this file provides.
cluster_data = pq.read_table('./dummy_k13.parquet').to_pandas()

In [5]:
# Attach the product catalogue columns to every clustered row; rows whose
# product_id is missing from either side are dropped (inner join).
cluster_data_named = cluster_data.merge(products, on='product_id', how='inner')

In [6]:
# Cast product ids to strings, which matches the string keys used for
# `product_lookup`; presumably also so the ids can serve as Word2Vec tokens
# later on — TODO confirm.
cluster_data_named['product_id'] = cluster_data_named['product_id'].astype(str)

In [7]:
def filter_data_by_cluster(data: pd.DataFrame, cluster_num: int):
    """
    Select the rows of `data` that belong to one cluster.

    Parameters:
    ----------------
    data: pd.DataFrame
      A dataframe with a 'cluster' column
    cluster_num: int
      The cluster label to keep

    Returns:
    ----------------
    pd.DataFrame
      The rows whose 'cluster' value equals `cluster_num`, all columns kept
    """
    belongs_to_cluster = data['cluster'].eq(cluster_num)
    return data.loc[belongs_to_cluster, :]

In [8]:
# One dataframe per label 0..k-1, where k is the number of distinct values in
# the 'cluster' column.
clusters_separated = [
    filter_data_by_cluster(cluster_data_named, label)
    for label in range(len(cluster_data_named['cluster'].unique()))
]

In [9]:
# Lookup table from product id (as a string) to product name.
product_lookup = {
    product_id: product_name
    for product_id, product_name in zip(
        products['product_id'].astype(str).tolist(),
        products['product_name'].tolist(),
    )
}

In [29]:
# Persist the id -> name lookup table next to the notebook.
with open('product_lookup.pkl', 'wb') as lookup_file:
    pickle.dump(product_lookup, lookup_file)

In [10]:
# For every cluster, the distinct user ids in order of first appearance.
unique_users_per_cluster = [
    cluster_frame['user_id'].unique().tolist()
    for cluster_frame in clusters_separated
]

In [11]:
def return_purchases_per_user(data: pd.DataFrame, user_id: int):
    """
    Collect the product ids purchased by a single user.

    Parameters:
    ----------------
    data: pd.DataFrame
      A dataframe with 'user_id' and 'product_id' columns
    user_id: int
      The user whose rows are selected

    Returns:
    ----------------
    list
      The 'product_id' values of that user's rows, in row order
      (empty if the user has no rows)
    """
    bought_by_user = data['user_id'].eq(user_id)
    return data.loc[bought_by_user, 'product_id'].tolist()

In [12]:
def generate_user_purchase_history_in_cluster(cluster_index: int):
    """
    Build the purchase history of every user of one cluster.

    Reads the notebook-level `clusters_separated` (one dataframe per cluster)
    and `unique_users_per_cluster` (one list of user ids per cluster).

    Parameters:
    ----------------
    cluster_index: int
      Position of the cluster in `clusters_separated`

    Returns:
    ----------------
    list of lists
      One list of 'product_id' values per user, ordered like
      `unique_users_per_cluster[cluster_index]`; a user without rows in the
      cluster gets an empty list
    """
    cluster_frame = clusters_separated[cluster_index]

    # Group the cluster once instead of re-scanning the whole frame with a
    # boolean mask for every user (which is quadratic in practice). Row order
    # inside each group is the original row order.
    purchases_by_user = {
        user_id: product_ids.tolist()
        for user_id, product_ids in cluster_frame.groupby('user_id', sort=False)['product_id']
    }

    return [
        purchases_by_user.get(user_id, [])
        for user_id in unique_users_per_cluster[cluster_index]
    ]

In [13]:
# Purchase histories for every cluster, indexed like `clusters_separated`.
purchase_history_in_cluster = list(
    map(generate_user_purchase_history_in_cluster, range(len(clusters_separated)))
)

In [17]:
def build_item2vec_model(purchases_data, *, window=12, negative=10, epochs=10,
                         workers=4, seed=28101997):
    """
    Train a gensim Word2Vec model on purchase histories and return its vectors.

    The model is created with sg=1 and hs=0 together with `negative` noise
    samples; the learning rate goes from 0.03 down to 0.0007. Every Word2Vec
    setting not listed here keeps its gensim default.

    Parameters:
    ----------------
    purchases_data:
      The corpus handed to `build_vocab` and `train`; in this notebook a list
      with one list of product-id strings per user
    window: int
      Passed to Word2Vec as `window`
    negative: int
      Passed to Word2Vec as `negative`
    epochs: int
      Number of training epochs passed to `train`
    workers: int
      Number of worker threads passed to Word2Vec
    seed: int
      Passed to Word2Vec as `seed`.
      NOTE(review): per the gensim documentation a fully reproducible run also
      needs workers=1 and a fixed PYTHONHASHSEED — confirm whether that matters
      here.

    Returns:
    ----------------
    The `wv` attribute (keyed vectors) of the trained model
    """
    model = Word2Vec(
        window=window,
        sg=1,
        hs=0,
        negative=negative,
        alpha=0.03,
        min_alpha=0.0007,
        seed=seed,
        workers=workers,
    )

    # The vocabulary has to be built before training can start.
    model.build_vocab(purchases_data, progress_per=200)

    model.train(
        purchases_data,
        total_examples=model.corpus_count,
        epochs=epochs,
        report_delay=1,
    )

    return model.wv

In [18]:
# Train one set of item vectors per cluster.
models = list(map(build_item2vec_model, purchase_history_in_cluster))

In [23]:
def save_word_vectors(item_vectors, id: int):
    """
    Save the item vectors of one cluster under ../item_vectors.

    Parameters:
    ----------------
    item_vectors:
      An object exposing `save(path)`; in this notebook the value returned by
      build_item2vec_model
    id: int
      Cluster identifier, used in the file name and in the returned message

    Returns:
    ----------------
    str
      A confirmation message naming the cluster
    """
    output_dir = '../item_vectors'

    # Create the target directory on first use; without it the save fails
    # when the notebook runs in a fresh checkout.
    makedirs(output_dir, exist_ok=True)

    item_vectors.save(f'{output_dir}/item_vectors_cluster_{id}.kv')
    return f"Item vectors for cluster {id} successfully saved."

In [24]:
# Save every cluster's vectors; the cell displays the confirmation messages.
[save_word_vectors(vectors, cluster_id) for cluster_id, vectors in enumerate(models)]

['Item vectors for cluster 0 successfully saved.',
 'Item vectors for cluster 1 successfully saved.',
 'Item vectors for cluster 2 successfully saved.',
 'Item vectors for cluster 3 successfully saved.',
 'Item vectors for cluster 4 successfully saved.',
 'Item vectors for cluster 5 successfully saved.',
 'Item vectors for cluster 6 successfully saved.',
 'Item vectors for cluster 7 successfully saved.',
 'Item vectors for cluster 8 successfully saved.',
 'Item vectors for cluster 9 successfully saved.',
 'Item vectors for cluster 10 successfully saved.',
 'Item vectors for cluster 11 successfully saved.',
 'Item vectors for cluster 12 successfully saved.']