# Embeddings for Recommendation systems

In [1]:
import sys
sys.path.append('D:/source/repos')
from utilities.std_imports import *
import random as rnd
from tqdm import tqdm
from gensim.models import Word2Vec 
import warnings
warnings.filterwarnings('ignore')

In [None]:
import umap

### Load data

In [26]:
df = pd.read_excel("D:/data/csv/supply_chain/online_retail.xlsx")
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


### Data cleansing

In [27]:
df.dropna(inplace=True)
df['StockCode']= df['StockCode'].astype(str)
customers = df["CustomerID"].unique().tolist()
len(customers)

4372

### Data preparation 
90% of customers

In [28]:
rnd.shuffle(customers)
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

train_ds = df[df['CustomerID'].isin(customers_train)]
valid_ds = df[~df['CustomerID'].isin(customers_train)]

### Capture purchase histories

In [29]:
purch_train = []
for i in tqdm(customers_train):
    purch_cust = train_ds[train_ds["CustomerID"] == i]["StockCode"].tolist()
    purch_train.append(purch_cust)
    
purch_valid = []
for i in tqdm(valid_ds['CustomerID'].unique()):
    purch_cust = valid_ds[valid_ds["CustomerID"] == i]["StockCode"].tolist()
    purchases_val.append(purch_cust)

100%|█████████████████████████████████████████████████████████████████████████████| 3935/3935 [00:04<00:00, 907.18it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 437/437 [00:00<00:00, 1331.73it/s]


### Embeddings for products

In [30]:
model = Word2Vec(window = 10, sg = 1, hs = 0, negative = 10,  alpha=0.03, min_alpha=0.0007, seed = 14)
model.build_vocab(purch_train, progress_per=200)
model.train(purch_train, total_examples = model.corpus_count, epochs=10, report_delay=1)

(3620161, 3656290)

In [19]:
model.init_sims(replace=True)
print(model)
X = model[model.wv.vocab]
X.shape

Word2Vec(vocab=3183, size=100, alpha=0.03)


(3183, 100)

### Visualisation

In [31]:
cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0, n_components=2, random_state=42).fit_transform(X)
plt.figure(figsize=(10,9))
plt.scatter(cluster_embedding[:, 0], cluster_embedding[:, 1], s=3, cmap='Spectral')

AttributeError: module 'umap' has no attribute 'UMAP'

### Generate Recommendations

In [None]:
products = train_df[["StockCode", "Description"]]
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

### Functions

In [None]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms        

def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [None]:
products_dict['84029E']

In [None]:
similar_products(model['90019A'])

In [None]:
similar_products(aggregate_vectors(purchases_val[0]))

### Credits & Links

https://www.analyticsvidhya.com/blog/2019/07/how-to-build-recommendation-system-word2vec-python/  
https://stackoverflow.com/questions/57242208/how-to-resolve-the-error-module-umap-has-no-attribute-umap-i-tried-installi