# Create document embeddings
We'll use the doc2vec module from gensim to create an embedding matrix that can be used by tensorflow

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from sklearn import utils
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models import Doc2Vec
from gensim.models.doc2vec import FAST_VERSION
from gensim.models.doc2vec import TaggedDocument

  from pandas import Panel


### Load in the data

In [2]:
# Load in the data
# data_list = [a, b, c, d, e, f, g, h, i ,f] = [None, None, None, None, None, None, None, None, None, None]
data_list = [a] = [None]
data_location = '../Datasets/AmazonCat-13K/processed/' 
for i in range(len(data_list)):
    data_list[i] = pd.read_csv(data_location + f'first_pass_no{i + 1}.csv', encoding='latin1')
    
# Concatenate all the data and reset the index
data = pd.concat(data_list, sort=False)
data = data.reset_index()

# Delete unused var (to save memory)
del data_list

In [3]:
# Convert the labels from string to array (return unique values only)
data['labels'] = data['labels'].apply(lambda labels: list(set(ast.literal_eval(labels))))

In [4]:
# Drop rows with missing values
data = data.dropna(axis=0)

In [5]:
# Create function to join title and description
def join_title_and_description(row):
    return f'{row["title"]} {row["description"]}'

In [6]:
# Create a new column that combines the title and description
data['title_and_description'] = data.apply(lambda row: join_title_and_description(row), axis=1)

In [7]:
# Drop title and description columns (to save memory)
data = data.drop(labels=['title', 'description'], axis=1)

In [8]:
# Have a look at the shape
data.shape

(149446, 4)

In [9]:
# Have a look at the first 3 rows
data.head(n=3)

Unnamed: 0,index,item_id,labels,title_and_description
0,0,ID:B0027DQHA0,"[Classical, Movies & TV, TV, Music]",Sao Paulo Samba (2008) Conducted by John Nesch...
1,1,ID:0756400120,"[United States, Anthologies & Literary Collect...",Past Imperfect (Daw Book Collectors) This fast...
2,2,ID:B00024YAOQ,"[Business & Investing, Motivation & Self-Impro...",Winning Every Time: How to Use the Skills of a...


### Tokenize the data

In [10]:
# Convert from df to list so it can be processed
text = data['title_and_description'].tolist()

In [11]:
# Tokenize the data
VOCAB_SIZE = 5000 
# VOCAB_SIZE = 200000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(text)

In [12]:
# Create a squence from the tokens
data['title_and_description'] = tokenizer.texts_to_sequences(text)

In [13]:
# Delete unused var (to save memory)
del text

In [14]:
# Have a look at the first sequence
print(data['title_and_description'].iloc[0])

[3280, 4417, 19, 239, 214, 1203, 1, 2950, 7, 2916, 19, 55, 3, 1999, 97, 127, 181, 1, 2950, 265, 304, 1, 3701, 9, 16, 1267, 3, 2, 1999, 97, 1474, 131, 498]


In [15]:
# Convert sequence of integers to sequence of tokens so they can be processed by Doc2vec
data['title_and_description'] = data['title_and_description'].apply(lambda seq: [tokenizer.index_word[index] for index in seq])

In [16]:
# Have a look at the first sequence
print(data['title_and_description'].iloc[0])

['2008', 'conducted', 'by', 'john', 'since', '1997', 'the', 'orchestra', 'is', 'defined', 'by', 'its', 'of', 'latin', 'american', 'music', 'here', 'the', 'orchestra', 'yet', 'again', 'the', 'listener', 'with', 'an', 'selection', 'of', 'and', 'latin', 'american', 'classics', 'including', 'w']


### Create the doc2vec vectors

In [17]:
# Check gensim version used
FAST_VERSION

1

In [18]:
# Create the tagged documents
tagged_documents = data.apply(lambda row: TaggedDocument(words=row['title_and_description'], tags=row['labels']), axis=1)

In [19]:
# Have a look at the first tagged document
tagged_documents.values[0]

TaggedDocument(words=['2008', 'conducted', 'by', 'john', 'since', '1997', 'the', 'orchestra', 'is', 'defined', 'by', 'its', 'of', 'latin', 'american', 'music', 'here', 'the', 'orchestra', 'yet', 'again', 'the', 'listener', 'with', 'an', 'selection', 'of', 'and', 'latin', 'american', 'classics', 'including', 'w'], tags=['Classical', 'Movies & TV', 'TV', 'Music'])

In [20]:
# Define the doc2vec model
doc2vec_dbow = Doc2Vec(dm=0,            # 0 for Distributed Bag of Words, 1 for Distributed Memory
                       vector_size=200,  # Dimensionality of the feature vectors
                       window=5,        # The maximum distance between the current and predicted word within a sentence
                       negative=5,      # Specifies how many 'noise words' should be drawn
                       hs=0,            # If 1, hierarchical softmax will be used for model training. If set to 0, and negative is non-zero, negative sampling will be used.
                       min_count=2,     # Ignores all words with total frequency lower than this
                       sample=0,        # The threshold for configuring which higher-frequency words are randomly downsampled
                       alpha=0.025,     # Set the starting learning rate
                       min_alpha=0.025, # Set the minimum learning rate (linear decay)
                       workers=12)      # Use these many worker threads to train the model

In [21]:
# Build the vocabulary
doc2vec_dbow.build_vocab(tagged_documents.values)

In [22]:
# Train the model
for epoch in range(10):
    print(f'Training epoch {epoch + 1}')
    doc2vec_dbow.train(tagged_documents,
                       total_examples=len(tagged_documents.values),
                       epochs=1)
    doc2vec_dbow.alpha -= 0.002 # decrease the learning rate
    doc2vec_dbow.min_alpha = doc2vec_dbow.alpha # fix the learning rate, no decay

Training epoch 1
Training epoch 2
Training epoch 3
Training epoch 4
Training epoch 5
Training epoch 6
Training epoch 7
Training epoch 8
Training epoch 9
Training epoch 10


In [32]:
doc2vec_dbow.docvecs.similarity('World War II', 'France')

0.4677675

In [33]:
inferred_vector = doc2vec_dbow.infer_vector(['book', 'about', 'space', 'pirates', 'and', 'action'])

In [25]:
doc2vec_dbow.docvecs.most_similar([inferred_vector], topn=len(doc2vec_dbow.docvecs))

[('Pickle & Olive Forks', 0.6003987193107605),
 ('Differential Covers', 0.5919567346572876),
 ('Egg Noodles', 0.5796162486076355),
 ('Gun Stock Accessories', 0.5733641386032104),
 ('Intake & Exhaust Manifold', 0.5717841386795044),
 ('Technology', 0.5666373372077942),
 ('Printer Tractors', 0.5610466003417969),
 ('Gay & Lesbian', 0.5606696605682373),
 ('Desk Caddies', 0.5597009658813477),
 ('Historical Study & Educational Resources', 0.5571966171264648),
 ('Art Knives & Blades', 0.5563797950744629),
 ('Buffers & Polishing Machine Backing Plates', 0.5561999082565308),
 ('England', 0.5561740398406982),
 ('Lighting Products', 0.5549966096878052),
 ('History & Theory', 0.5534413456916809),
 ('History', 0.5528998374938965),
 ('French Horn', 0.5528833866119385),
 ('Ideologies & Doctrines', 0.5519598722457886),
 ('Flowtron', 0.5517383217811584),
 ('Psychology', 0.550269603729248),
 ('Bait Traps', 0.5501764416694641),
 ('Handguards', 0.5489562749862671),
 ('Economics', 0.5477913618087769),
 ('Ma

In [35]:
string = 'Data is at the center of many challenges in system design today. Difficult issues need to be figured out, such as scalability, consistency, reliability, efficiency, and maintainability. In addition, we have an overwhelming variety of tools, including relational databases, NoSQL datastores, stream or batch processors, and message brokers. What are the right choices for your application? How do you make sense of all these buzzwords?'

In [61]:
test = ['2008', 'conducted', 'by', 'john', 'since', '1997', 'the', 'orchestra', 'is', 'defined', 'by', 'its', 'of', 'latin', 'american', 'music', 'here', 'the', 'orchestra', 'yet', 'again', 'the', 'listener', 'with', 'an', 'selection', 'of', 'and', 'latin', 'american', 'classics', 'including', 'w']

In [62]:
inferred_vector = doc2vec_dbow.infer_vector(test)

In [66]:
doc2vec_dbow.docvecs.most_similar([inferred_vector], topn=5)

[('Differential Covers', 0.629446268081665),
 ('French Horn', 0.6287521123886108),
 ('Egg Noodles', 0.6200672388076782),
 ('Pickle & Olive Forks', 0.6024436354637146),
 ('Technology', 0.5929907560348511)]

In [64]:
inferred_vector

array([-6.04487285e-02, -3.31343301e-02,  3.64064658e-03, -2.53428109e-02,
       -8.07410032e-02, -1.17610684e-02,  5.98636866e-02,  1.87528580e-02,
        2.81017385e-02,  1.29968971e-02, -4.64846008e-02,  8.35126489e-02,
       -6.01920532e-03, -2.71061249e-03,  1.74348727e-02, -2.33702902e-02,
        2.66083274e-02, -2.86172535e-02, -3.99321988e-02,  3.08322292e-02,
        2.42567249e-03,  3.48604769e-02, -1.95932984e-02, -8.44594557e-03,
        1.10307019e-02,  1.67536903e-02,  2.17582379e-02, -4.95163724e-03,
       -1.96464211e-02, -7.18877697e-03, -2.86695361e-02, -9.15339589e-03,
       -6.62243506e-03, -1.81548987e-02,  5.36562242e-02,  9.67057701e-03,
       -2.43626442e-02, -2.64515337e-02, -9.27808043e-03, -1.99757721e-02,
        5.44959586e-03,  1.22241266e-02,  3.26064117e-02,  2.35693785e-03,
       -1.42027205e-02, -2.56951880e-02, -1.02715138e-02, -5.80381695e-03,
        3.35869193e-02, -2.30997968e-02, -1.01286313e-02,  4.33885753e-02,
       -3.49828624e-03, -

In [65]:
doc2vec_dbow.docvecs[0]

array([-1.69314355e-01, -2.59943187e-01, -1.90322310e-01,  2.64214985e-02,
       -4.60443467e-01, -5.03906328e-03,  1.74827445e-02,  4.56340797e-02,
        3.52577455e-02, -2.63350815e-01, -3.13535333e-01,  1.98431641e-01,
        1.14055917e-01,  1.28192201e-01,  2.96366569e-02, -5.83376596e-03,
        3.49552155e-01, -1.62424564e-01, -1.13570139e-01, -1.63689151e-01,
        1.68875054e-01,  9.77466702e-02, -3.34290802e-01, -1.31658092e-01,
        5.51883459e-01,  4.61896598e-01, -3.62968668e-02, -1.32016793e-01,
       -1.07890435e-01,  5.73066533e-01,  3.29779267e-01,  1.60234466e-01,
       -8.24658945e-02, -2.14702040e-01, -1.19132034e-01, -8.40839520e-02,
       -2.99028140e-02, -2.02851087e-01,  1.42127939e-03,  1.01751886e-01,
       -7.03828186e-02,  2.79903561e-01, -1.00978449e-01,  1.21889077e-01,
       -2.69506633e-01, -3.55726928e-01, -8.84919539e-02,  2.81673640e-01,
       -3.29246446e-02,  6.74620196e-02,  1.60357088e-01,  1.21326394e-01,
       -2.57394295e-02, -