# Create embedding matrix
We'll use the word2vec module from gensim to train word vectors and turn them into an embedding matrix that can be used by TensorFlow.

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import ast
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.models.word2vec import FAST_VERSION
from gensim.models import Word2Vec

### Load in the data

In [2]:
# Load in the data: the pre-processed shards first_pass_no1.csv ... first_pass_no10.csv
NUM_INPUT_FILES = 10
data_location = '../Datasets/AmazonCat-13K/processed/'
data_list = [
    pd.read_csv(data_location + f'first_pass_no{file_num}.csv', encoding='latin1')
    for file_num in range(1, NUM_INPUT_FILES + 1)
]

# Concatenate all the data and reset the index
# (reset_index() without drop=True keeps the old per-file row labels in an 'index' column)
data = pd.concat(data_list, sort=False)
data = data.reset_index()

# Delete unused var (to save memory)
del data_list

In [3]:
# Convert the labels from their string representation to a list of unique labels.
# dict.fromkeys() removes duplicates while keeping the first-seen order, so the result
# is the same on every run (a plain set() yields an arbitrary order for strings).
data['labels'] = data['labels'].apply(
    lambda labels: list(dict.fromkeys(ast.literal_eval(labels)))
)

In [4]:
# Discard every row that contains at least one missing value
data = data.dropna(axis='index', how='any')

In [5]:
# Helper that merges a row's title and description into a single string
def join_title_and_description(row):
    """Return the row's 'title' and 'description' values separated by one space."""
    title = row['title']
    description = row['description']
    return '{} {}'.format(title, description)

In [6]:
# Create a new column that combines the title and description as "<title> <description>".
# Vectorized string concatenation gives the same strings as formatting each row
# individually, but avoids a Python-level call per row on this large frame.
data['title_and_description'] = (
    data['title'].astype(str) + ' ' + data['description'].astype(str)
)

In [7]:
# The separate title/description columns are now redundant; remove them to free memory
data = data.drop(columns=['title', 'description'])

In [8]:
# Inspect the (rows, columns) shape of the combined, cleaned frame
data.shape

(1494407, 4)

In [9]:
# Preview the first 3 rows to sanity-check the columns that remain
data.head(n=3)

Unnamed: 0,index,item_id,labels,title_and_description
0,0,ID:B0027DQHA0,"[Music, TV, Movies & TV, Classical]",Sao Paulo Samba (2008) Conducted by John Nesch...
1,1,ID:0756400120,"[Books, General, Science Fiction, United State...",Past Imperfect (Daw Book Collectors) This fast...
2,2,ID:B00024YAOQ,"[Motivation & Self-Improvement, Business & Inv...",Winning Every Time: How to Use the Skills of a...


### Tokenize the data

In [10]:
# Pull the combined text column out as a plain Python list (one string per row)
# so it can be passed to the tokenizer
text = data['title_and_description'].tolist()

In [11]:
# Fit the tokenizer on the full corpus.
# With num_words=VOCAB_SIZE Keras keeps only the VOCAB_SIZE - 1 most frequent words
# when converting text to sequences (index 0 is reserved), i.e. indices 1..VOCAB_SIZE-1.
# (An earlier experiment used VOCAB_SIZE = 20000.)
VOCAB_SIZE = 200000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(text)

In [12]:
# Create a sequence of integer token indices for every text
# (words outside the tokenizer's num_words limit are left out of the sequences)
sequences = tokenizer.texts_to_sequences(text)

In [13]:
# The raw text is not used after tokenization; delete it to save memory
del text

In [14]:
# Show the integer token indices of the first document as a sanity check
print(sequences[0])

[29260, 21551, 12365, 3328, 4450, 19, 237, 211, 1219, 1, 2781, 7, 2982, 19, 55, 28920, 6087, 3, 1991, 100, 123, 181, 1, 2781, 264, 301, 5176, 1, 3727, 9, 16, 13217, 1255, 3, 6394, 2, 1991, 100, 1500, 131, 499]


In [18]:
# Word2Vec expects each sentence to be a list of string tokens, so convert every
# integer token index to its string form (e.g. 29260 -> '29260').
stringified_sequences = [
    [str(index) for index in sequence]
    for sequence in sequences
]

In [19]:
# Show the first document again, now as string tokens ready for Word2Vec
print(stringified_sequences[0])

['29260', '21551', '12365', '3328', '4450', '19', '237', '211', '1219', '1', '2781', '7', '2982', '19', '55', '28920', '6087', '3', '1991', '100', '123', '181', '1', '2781', '264', '301', '5176', '1', '3727', '9', '16', '13217', '1255', '3', '6394', '2', '1991', '100', '1500', '131', '499']


### Create the word2vec word vectors
In the original CNN paper by Kim, the author used pre-trained word2vec embeddings published by Google. The paper provides a link to them, but the link is broken, so we'll train our own word2vec embeddings for now.

In [20]:
# Check whether gensim's optimized (compiled) word2vec training routines are available.
# This is NOT the gensim package version: per the gensim docs a value of -1 means the
# slow pure-Python fallback is in use, while 0 or 1 means the fast compiled code is used.
FAST_VERSION

1

In [21]:
# Train the word2vec word vectors (200 dimensions)
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed
# them to `vector_size` and `epochs` - confirm the installed version before upgrading.
# NOTE(review): `min_count` is left at its default, so tokens rarer than that threshold
# would be excluded from the trained vocabulary - verify that every token index looked
# up later is actually present.
# NOTE(review): no seed is set and several worker threads are used, so the resulting
# vectors are presumably not reproducible run-to-run - confirm if that matters.
EMBEDDING_DIMENSION = 200
word_vectors = Word2Vec(sentences = stringified_sequences,
                        sg = 0, # 0 for continuous bag of words model, 1 for skip-gram model
                        size = EMBEDDING_DIMENSION, # Dimensionality of the word vectors
                        window = 10, # Maximum distance between the current and predicted word within a sentence
                        workers = 12, # Use these many worker threads to train the model 
                        iter = 10) # Run this many time through the dataset

In [22]:
# Count how many distinct tokens received a trained vector
len(word_vectors.wv.vocab)

199999

In [23]:
# Check the dimensionality of each word vector (should equal EMBEDDING_DIMENSION)
word_vectors.wv.vector_size

200

In [24]:
# Training is done, so the string-token corpus is no longer needed; delete it to save memory
del stringified_sequences

### Convert the word2vec mapping to an embedding matrix
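An embedding matrix is the structure that TensorFlow will accept: row `i` holds the word vector for token index `i`, and row 0 (which the tokenizer never assigns to a word) stays all zeros.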

In [25]:
# Create empty embedding matrix: one row per token index, row 0 stays all zeros
weight_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIMENSION))

# Fill the matrix with word vectors (row i <- vector of token str(i)).
# Tokens that Word2Vec did not keep (e.g. never seen, or dropped by its frequency
# threshold) are skipped and keep their zero row instead of raising a KeyError.
for token_index in range(1, VOCAB_SIZE):
    token = str(token_index)
    if token in word_vectors.wv:
        weight_matrix[token_index] = word_vectors.wv[token]

In [26]:
# Check the embedding matrix shape: expected (VOCAB_SIZE, EMBEDDING_DIMENSION)
weight_matrix.shape

(200000, 200)

In [27]:
# Write the embedding matrix to disk as comma-separated text so later notebooks can load it
save_path = '../Datasets/AmazonCat-13K/processed/'
embedding_matrix_file = save_path + 'embedding_matrix.csv'
np.savetxt(embedding_matrix_file, weight_matrix, delimiter=',')

In [28]:
# The matrix has been written to disk; delete the in-memory copy to save memory
del weight_matrix

### Save a new dataset with the tokenized title and description

In [29]:
# Create an empty dataframe with the three columns of the tokenized dataset
tokenized_data = pd.DataFrame(columns = ['item_id', 'tokenized_title_and_description', 'labels'])

In [30]:
# Add the data.
# `sequences` is a plain list built from data['title_and_description'] in row order, so it
# is assigned positionally and must have exactly one entry per row of `data`.
tokenized_data['item_id'] = data['item_id'].copy()
tokenized_data['tokenized_title_and_description'] = sequences # Integer token indices (not the string tokens)
tokenized_data['labels'] = data['labels'].copy()

In [31]:
# Check the shape of the tokenized dataframe (row count should match `data`)
tokenized_data.shape

(1494407, 3)

In [32]:
# Preview the first 3 rows of the tokenized dataframe
tokenized_data.head(n=3)

Unnamed: 0,item_id,tokenized_title_and_description,labels
0,ID:B0027DQHA0,"[29260, 21551, 12365, 3328, 4450, 19, 237, 211...","[Music, TV, Movies & TV, Classical]"
1,ID:0756400120,"[381, 15160, 38609, 41, 5949, 10, 477, 1179, 3...","[Books, General, Science Fiction, United State..."
2,ID:B00024YAOQ,"[646, 150, 56, 73, 5, 99, 1, 883, 3, 4, 3470, ...","[Motivation & Self-Improvement, Business & Inv..."


In [33]:
# Add a temporary column holding the number of tokens in each row's sequence
tokenized_data['token_count'] = tokenized_data['tokenized_title_and_description'].map(len)

In [34]:
# Count the rows whose text produced no tokens at all (empty sequences)
has_no_tokens = tokenized_data['token_count'] == 0
int(has_no_tokens.sum())

43

In [35]:
# Keep only the rows that have at least one token
tokenized_data = tokenized_data.loc[tokenized_data['token_count'] != 0]

In [36]:
# The helper column has served its purpose; remove it again
tokenized_data = tokenized_data.drop(columns='token_count')

In [37]:
# Check the shape again after removing the rows with empty token sequences
tokenized_data.shape

(1494364, 3)

In [38]:
# Renumber the rows 0..n-1 after filtering; drop=True discards the old index
# instead of keeping it as an extra column
tokenized_data = tokenized_data.reset_index(drop=True)

In [39]:
# Preview the first 3 rows of the final tokenized dataframe
tokenized_data.head(n=3)

Unnamed: 0,item_id,tokenized_title_and_description,labels
0,ID:B0027DQHA0,"[29260, 21551, 12365, 3328, 4450, 19, 237, 211...","[Music, TV, Movies & TV, Classical]"
1,ID:0756400120,"[381, 15160, 38609, 41, 5949, 10, 477, 1179, 3...","[Books, General, Science Fiction, United State..."
2,ID:B00024YAOQ,"[646, 150, 56, 73, 5, 99, 1, 883, 3, 4, 3470, ...","[Motivation & Self-Improvement, Business & Inv..."


In [40]:
# Create save_as_csv function
def save_as_csv(df, path):
    """Write `df` to `path` as a latin1-encoded CSV with a header row and no index column.

    Args:
        df: The pandas DataFrame to write.
        path: Destination file path (or any target accepted by `DataFrame.to_csv`).
    """
    df.to_csv(path,
              header=True,
              index=False,  # documented boolean flag: do not write the row index
              encoding='latin1')

In [41]:
# Save as csv, split into `num_files` consecutive files of (almost) equal size
num_files = 10
size = tokenized_data.shape[0] // num_files
for file_num in range(num_files):
    start = size * file_num
    # The last file runs to the end so the rows left over by the integer division are kept
    stop = None if file_num == num_files - 1 else start + size
    save_as_csv(tokenized_data[start:stop], save_path + f'tokenized_no{file_num + 1}.csv')