# Create embedding matrix
We'll use the word2vec module from gensim to create an embedding matrix that can be used by TensorFlow.

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import ast
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models.word2vec import FAST_VERSION
from gensim.models import Word2Vec

### Load in the data

In [2]:
# Load the pre-processed chunks and stack them into a single frame.
# NUM_INPUT_FILES is how many 'first_pass_no<k>.csv' chunks (k = 1..NUM_INPUT_FILES)
# are read; raise it to load more of the dataset.
NUM_INPUT_FILES = 1
data_location = '../Datasets/Amazon-Cat13K/processed/'
data_list = [
    pd.read_csv(data_location + f'first_pass_no{file_no}.csv', encoding='latin1')
    for file_no in range(1, NUM_INPUT_FILES + 1)
]

# Concatenate all the data and reset the index.
# reset_index() is called without drop=True, so the previous per-file row
# labels are kept as an extra 'index' column.
data = pd.concat(data_list, sort=False)
data = data.reset_index()

In [3]:
# Convert the labels from their string form to a list of unique labels
def parse_unique_labels(raw_labels):
    """Return the unique labels of one row as a list, in first-seen order.

    Strings are parsed with ``ast.literal_eval``. Values that are already a
    list/tuple (e.g. when this cell is re-run) are only de-duplicated. Any
    other value (such as a missing value) is returned unchanged so that the
    notebook's later ``dropna`` step can discard the row.

    ``dict.fromkeys`` is used instead of ``set`` so the resulting order is the
    same on every run rather than depending on string hashing.
    """
    if isinstance(raw_labels, str):
        raw_labels = ast.literal_eval(raw_labels)
    if not isinstance(raw_labels, (list, tuple)):
        return raw_labels
    return list(dict.fromkeys(raw_labels))


data['labels'] = data['labels'].apply(parse_unique_labels)

In [4]:
# Discard every row that contains at least one missing value
data = data.dropna(axis='index')

In [5]:
# Helper that builds the combined text of a single row
def join_title_and_description(row):
    """Return the row's title and description joined by a single space."""
    title = row["title"]
    description = row["description"]
    return '{} {}'.format(title, description)

In [6]:
# Build the combined title + description text for every row
data['title_and_description'] = data.apply(join_title_and_description, axis='columns')

In [7]:
# Inspect the frame's dimensions as (rows, columns)
data.shape

(149446, 6)

In [8]:
# Preview the first three rows
data.head(3)

Unnamed: 0,index,item_id,title,description,labels,title_and_description
0,0,ID:B0027DQHA0,Sao Paulo Samba (2008),"Conducted by John Neschling since 1997, the or...","[Music, Movies & TV, Classical, TV]",Sao Paulo Samba (2008) Conducted by John Nesch...
1,1,ID:0756400120,Past Imperfect (Daw Book Collectors),"This fast, lightweight anthology of 12 time-tr...","[Short Stories, Science Fiction & Fantasy, Ant...",Past Imperfect (Daw Book Collectors) This fast...
2,2,ID:B00024YAOQ,Winning Every Time: How to Use the Skills of a...,Whether you're hoping to obtain a raise from y...,"[Business Life, Business & Investing, Books, M...",Winning Every Time: How to Use the Skills of a...


### Tokenize the data

In [9]:
# Pull the combined text column out of the frame as a plain list so it can be processed
text = list(data['title_and_description'])

In [10]:
# Tokenize the data: fit a Keras Tokenizer on the combined title + description text.
# VOCAB_SIZE is passed as `num_words`; it is also reused later as the number of
# rows of the embedding matrix.
# NOTE(review): per the Keras docs only the `num_words - 1` most frequent words
# are kept when converting text to sequences — confirm for the installed version.
# VOCAB_SIZE = 200000 
VOCAB_SIZE = 20000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(text)

In [11]:
# Convert every text into its sequence of integer token indices
sequences = tokenizer.texts_to_sequences(text)

In [12]:
# Have a look at the first sequence (a list of integer token indices)
print(sequences[0])

[13001, 3280, 4417, 19, 239, 214, 1203, 1, 2950, 7, 2916, 19, 55, 6547, 3, 1999, 97, 127, 181, 1, 2950, 265, 304, 5592, 1, 3701, 9, 16, 12325, 1267, 3, 6024, 2, 1999, 97, 1474, 131, 498]


In [15]:
# Render every integer token index as a string so the sequences can be
# processed by Word2Vec
stringified_sequences = [
    [str(token_index) for token_index in token_indices]
    for token_indices in sequences
]

In [16]:
# Have a look at the first sequence of string tokens
print(stringified_sequences[0])

['13001', '3280', '4417', '19', '239', '214', '1203', '1', '2950', '7', '2916', '19', '55', '6547', '3', '1999', '97', '127', '181', '1', '2950', '265', '304', '5592', '1', '3701', '9', '16', '12325', '1267', '3', '6024', '2', '1999', '97', '1474', '131', '498']


### Create the word2vec word vectors
In the original CNN paper by Kim, the author used pre-trained word2vec embeddings published by Google. A link to them was provided, but it is now broken, so for now we'll train our own word2vec embeddings.

In [17]:
# Check which word2vec implementation gensim is using (this is not the gensim
# package version).
# NOTE(review): per the gensim docs, FAST_VERSION == -1 means the slow
# pure-Python fallback and values >= 0 mean the compiled routines are active —
# confirm for the installed version.
FAST_VERSION

1

In [18]:
# Train the word2vec word vectors (200 dimensions)
# NOTE(review): `size` and `iter` look like the gensim 3.x parameter names
# (gensim 4 calls them `vector_size` and `epochs`) — confirm against the
# installed version before upgrading.
EMBEDDING_DIMENSION = 200
word2vec_options = dict(
    sg=0,                      # 0 for continuous bag of words model, 1 for skip-gram model
    size=EMBEDDING_DIMENSION,  # Dimensionality of the word vectors
    window=2,                  # Maximum distance between the current and predicted word within a sentence
    workers=12,                # Use these many worker threads to train the model
    iter=2,                    # Run this many times through the dataset
)
word_vectors = Word2Vec(sentences=stringified_sequences, **word2vec_options)

In [19]:
# Count the tokens for which a vector has been trained
len(word_vectors.wv.vocab)

19999

In [20]:
# Check the dimensionality of each trained token vector
# (expected to equal EMBEDDING_DIMENSION)
word_vectors.wv.vector_size

200

### Convert the word2vec mapping to an embedding matrix
An embedding matrix is the structure that TensorFlow will accept

In [21]:
# Create the embedding matrix: one row per token index, one column per
# embedding dimension. Row 0 is never written and therefore stays all zeros
# (presumably reserved for padding — verify against the model that consumes it).
weight_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIMENSION))

# Fill the matrix with word vectors; row k holds the vector of token str(k).
# Word2Vec may not have a vector for every token index (e.g. tokens it
# discarded as too rare), and looking those up raises KeyError, so such rows
# are skipped and keep their zero vector.
for token_index in range(1, VOCAB_SIZE):
    token = str(token_index)
    if token in word_vectors.wv:
        weight_matrix[token_index] = word_vectors.wv[token]

In [22]:
# Check the embedding matrix shape: (VOCAB_SIZE, EMBEDDING_DIMENSION)
weight_matrix.shape

(20000, 200)

In [23]:
# Persist the embedding matrix as a comma-delimited text file for later use
save_path = '../Datasets/Amazon-Cat13K/processed/'
embedding_matrix_path = save_path + 'embedding_matrix.csv'
np.savetxt(embedding_matrix_path, weight_matrix, delimiter=',')

### Save a new dataset with the tokenized title and description

In [24]:
# Start from an empty frame that only declares the output columns
output_columns = ['item_id', 'tokenized_title_and_description', 'labels']
tokenized_data = pd.DataFrame(columns=output_columns)

In [25]:
# Add the data: item ids and labels are copied from `data`, the token
# sequences come from the `sequences` list.
# NOTE(review): the statement order looks load-bearing — `sequences` is a plain
# list and is presumably assigned by position, while the two Series are
# assigned by index label; confirm before reordering these lines.
tokenized_data['item_id'] = data['item_id'].copy()
tokenized_data['tokenized_title_and_description'] = sequences # This is the integer version
tokenized_data['labels'] = data['labels'].copy()

In [26]:
# Check the shape of the dataframe as (rows, columns)
tokenized_data.shape

(149446, 3)

In [27]:
# Preview the first three rows
tokenized_data.head(3)

Unnamed: 0,item_id,tokenized_title_and_description,labels
0,ID:B0027DQHA0,"[13001, 3280, 4417, 19, 239, 214, 1203, 1, 295...","[Music, Movies & TV, Classical, TV]"
1,ID:0756400120,"[386, 14728, 40, 5865, 10, 493, 1180, 2959, 3,...","[Short Stories, Science Fiction & Fantasy, Ant..."
2,ID:B00024YAOQ,"[634, 153, 56, 73, 5, 95, 1, 813, 3, 4, 3385, ...","[Business Life, Business & Investing, Books, M..."


In [28]:
# Record how many tokens each row's sequence contains
tokenized_data['token_count'] = tokenized_data['tokenized_title_and_description'].apply(len)

In [29]:
# Count the rows whose token sequence is empty (token_count == 0)
int((tokenized_data['token_count'] == 0).sum())

20

In [30]:
# Keep only the rows that have at least one token
has_tokens = tokenized_data['token_count'] != 0
tokenized_data = tokenized_data[has_tokens]

In [31]:
# The helper column has served its purpose; drop it again
tokenized_data = tokenized_data.drop(['token_count'], axis='columns')

In [32]:
# Check the shape of the dataframe after removing rows with empty token sequences
tokenized_data.shape

(149426, 3)

In [33]:
# Reset the index; drop=True discards the old row labels instead of keeping
# them as a column
tokenized_data = tokenized_data.reset_index(drop=True)

In [34]:
# Preview the first three rows
tokenized_data.head(3)

Unnamed: 0,item_id,tokenized_title_and_description,labels
0,ID:B0027DQHA0,"[13001, 3280, 4417, 19, 239, 214, 1203, 1, 295...","[Music, Movies & TV, Classical, TV]"
1,ID:0756400120,"[386, 14728, 40, 5865, 10, 493, 1180, 2959, 3,...","[Short Stories, Science Fiction & Fantasy, Ant..."
2,ID:B00024YAOQ,"[634, 153, 56, 73, 5, 95, 1, 813, 3, 4, 3385, ...","[Business Life, Business & Investing, Books, M..."


In [35]:
# Helper for writing a dataframe to disk
def save_as_csv(df, path):
    """Write `df` to `path` as a latin1-encoded CSV that includes a header row.

    The `index` option is passed as None, exactly as this notebook has always
    done, so the frame's row index is not written as a column.
    """
    csv_options = {'header': True, 'index': None, 'encoding': 'latin1'}
    df.to_csv(path, **csv_options)

In [36]:
# Save as csv, split into `num_files` consecutive files named
# 'tokenized_no1.csv' ... 'tokenized_no<num_files>.csv'.
# Every file holds `size` rows, except the last one, which also receives the
# remainder rows left over by the integer division.
num_files = 10
size = tokenized_data.shape[0] // num_files
for file_num in range(num_files):
    start = size * file_num
    # An open-ended slice for the last file picks up the remainder rows.
    stop = None if file_num == num_files - 1 else start + size
    save_as_csv(tokenized_data[start:stop], save_path + f'tokenized_no{file_num + 1}.csv')