### Importing all required libraries

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm # Instantly make your loops show a smart progress meter - just wrap any iterable with tqdm(iterable)
from gensim.models import Word2Vec
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore') # not showing warnings while executing

### Importing required dataset.

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
df = pd.read_excel(url)
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [3]:
df.shape

(541909, 8)

In [4]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

Since we have sufficient data to work with, we will drop the rows with null values.

In [5]:
df.dropna(axis=0,inplace=True)

### Data Preparation

In order to recommend products to the cutomers we need to know their shopping history. Therefore, we will parse the shopping history of each customer.

In [10]:
customers = df['CustomerID'].unique().tolist()
len(customers)

4372

Since there are 4,372 distinct customers, we will have 4,372 sequences of purchases.

In [11]:
random.shuffle(customers) # shuffling customers in order to get unbiased data

Let's take 90% of data as train data and 10% as validation data

In [12]:
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

Now, splitting data into train an validation set

In [14]:
df['StockCode'] = df['StockCode'].astype(str)

In [15]:
train_df = df[df['CustomerID'].isin(customers_train)]
validation_df = df[~df['CustomerID'].isin(customers_train)]

Creating sequence of purchases made by the customers in the dataset

In [16]:
purchases_train = []
purchases_val = []

def seq_maker(my_list, my_df, temp_list):
  for i in tqdm(my_list):
    temp = my_df[my_df['CustomerID']==i]['StockCode'].tolist()
    temp_list.append(temp)
    

In [17]:
seq_maker(customers_train,train_df,purchases_train)

100%|██████████| 3935/3935 [00:06<00:00, 590.18it/s]


In [21]:
seq_maker(validation_df['CustomerID'].unique(),validation_df,purchases_val)

100%|██████████| 437/437 [00:00<00:00, 823.47it/s]


 ### Training Word2Vec Model

In [22]:
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(3688188, 3724490)

In [23]:
model.init_sims(replace=True)

In [24]:
print(model)

Word2Vec(vocab=3174, size=100, alpha=0.03)


In [25]:
# extract all vectors
X = model[model.wv.vocab]

X.shape

(3174, 100)

In [28]:
!pip install umap

Collecting umap
  Downloading https://files.pythonhosted.org/packages/4b/46/08ab68936625400fe690684428d4db4764f49b406782cc133df1d0299d06/umap-0.1.1.tar.gz
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25l[?25hdone
  Created wheel for umap: filename=umap-0.1.1-cp37-none-any.whl size=3568 sha256=05797d5197bfa14dc83a668cd87aa1569db4c5cc288346bb4f21561298fb31e8
  Stored in directory: /root/.cache/pip/wheels/7b/29/33/b4d917dc95f69c0a060e2ab012d95e15db9ed4cc0b94ccac26
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


### Visualize word2vec Embeddings

Reducing the dimensions of product embeddings from 100 to 2 by using the UMAP algorithm.

In [29]:
import umap

cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0,
                              n_components=2, random_state=42).fit_transform(X)

plt.figure(figsize=(10,9))
plt.scatter(cluster_embedding[:, 0], cluster_embedding[:, 1], s=3, cmap='Spectral')

AttributeError: ignored

Creating a productID and product-description dictionary to easily map a product's description to its ID and vice versa.

In [30]:
products = train_df[["StockCode", "Description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

In [31]:
# test the dictionary
products_dict['84029E']

['RED WOOLLY HOTTIE WHITE HEART.']

In [32]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms       

In [33]:
similar_products(model['90019A'])

[('SILVER M.O.P ORBIT DROP EARRINGS', 0.7636992931365967),
 ('BLACK VINTAGE  CRYSTAL EARRINGS', 0.7490034699440002),
 ('PINK HEART OF GLASS BRACELET', 0.7489266395568848),
 ('PINK BOUDICCA LARGE BRACELET', 0.741625964641571),
 ('AMBER DROP EARRINGS W LONG BEADS', 0.7349154353141785),
 ('DROP DIAMANTE EARRINGS PURPLE', 0.72665935754776)]

In [34]:

def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [35]:
len(purchases_val[0])

30

In [36]:
aggregate_vectors(purchases_val[0]).shape

(100,)

In [37]:
similar_products(aggregate_vectors(purchases_val[0]))

[('JUMBO BAG APPLES', 0.8005504608154297),
 ('JUMBO BAG ALPHABET', 0.7930920124053955),
 ('JUMBO BAG PEARS', 0.7872057557106018),
 ('JUMBO BAG VINTAGE LEAF', 0.7867273092269897),
 ('LUNCH BAG ALPHABET DESIGN', 0.7723840475082397),
 ('JUMBO BAG DOLLY GIRL DESIGN', 0.7589294910430908)]

In [38]:
similar_products(aggregate_vectors(purchases_val[0][-10:]))

[('JUMBO BAG VINTAGE DOILY ', 0.749992847442627),
 ('JUMBO BAG VINTAGE CHRISTMAS ', 0.7334388494491577),
 ('JUMBO BAG VINTAGE LEAF', 0.6987371444702148),
 ('VINTAGE DOILY JUMBO BAG RED ', 0.6882349252700806),
 ('JUMBO BAG PEARS', 0.6790704727172852),
 ('JINGLE BELL HEART DECORATION', 0.6738508939743042)]