# Create word vectors
We'll use the word2vec module from gensim

In [1]:
# Import the libraries
import pandas as pd
import ast
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models.word2vec import FAST_VERSION
from gensim.models import Word2Vec

### Load in the data

In [2]:
# Load in the data
# The processed dataset is split across NUM_INPUT_FILES csv files named
# first_pass_no1.csv, first_pass_no2.csv, ... (set NUM_INPUT_FILES = 1 to load
# a single file for a quick experiment).
NUM_INPUT_FILES = 10
data_location = '../Datasets/Amazon-Cat13K/processed/'
data_list = [
    pd.read_csv(data_location + f'first_pass_no{file_num}.csv', encoding='latin1')
    for file_num in range(1, NUM_INPUT_FILES + 1)
]

# Concatenate all the data. ignore_index=True gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row labels.
data = pd.concat(data_list, sort=False, ignore_index=True)

In [3]:
# Convert the labels from their string form to a list of unique labels.
# dict.fromkeys de-duplicates while keeping first-seen order, so the result is
# identical on every run (the iteration order of a set of strings is not).
data['labels'] = data['labels'].apply(
    lambda labels: list(dict.fromkeys(ast.literal_eval(labels)))
)

In [4]:
# Discard every row that contains at least one missing value
data = data.dropna(axis='index', how='any')

In [5]:
# Helper that builds the combined text for a single row
def join_title_and_description(row):
    """Return the row's 'title' and 'description' joined by a single space."""
    title, description = row['title'], row['description']
    return ' '.join((title, description))

In [6]:
# Create a new column that combines the title and description.
# Element-wise concatenation of the two columns produces the same text as
# calling join_title_and_description on every row, without a Python-level
# function call per row.
data['title_and_description'] = data['title'] + ' ' + data['description']

In [7]:
# Have a look at the shape: (number of rows, number of columns)
data.shape

(1494407, 5)

In [8]:
# Preview the first five rows of the combined data
data.head(5)

Unnamed: 0,item_id,title,description,labels,title_and_description
0,ID:B0027DQHA0,Sao Paulo Samba (2008),"Conducted by John Neschling since 1997, the or...","[TV, Classical, Movies & TV, Music]",Sao Paulo Samba (2008) Conducted by John Nesch...
1,ID:0756400120,Past Imperfect (Daw Book Collectors),"This fast, lightweight anthology of 12 time-tr...","[Science Fiction, Anthologies & Literary Colle...",Past Imperfect (Daw Book Collectors) This fast...
2,ID:B00024YAOQ,Winning Every Time: How to Use the Skills of a...,Whether you're hoping to obtain a raise from y...,"[Books, Business & Investing, Business Life, M...",Winning Every Time: How to Use the Skills of a...
3,ID:B000BUGXAU,Nano Cube 24 Gallon Deluxe,Just add water!\tThe Nano Cube is a 24-gallon ...,"[Aquariums, Pet Supplies, Fish & Aquatic Pets]",Nano Cube 24 Gallon Deluxe Just add water!\tTh...
4,ID:B0007YMWC8,Asalto En Tijuana (2005),An honest citizen is forced to steal the world...,"[Movies, Movies & TV]",Asalto En Tijuana (2005) An honest citizen is ...


### Tokenize the data

In [9]:
# Pull the combined text column out of the dataframe as a plain Python list
text = data['title_and_description'].values.tolist()

In [10]:
# Tokenize the data
# VOCAB_SIZE = 203882 # This is the reported number of features 
VOCAB_SIZE = 200000
# num_words limits how many of the most frequent words the tokenizer keeps
# (NOTE(review): the Keras docs say only the num_words - 1 most common words
# are kept - confirm for the installed version).
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
# Build the word index from the full list of texts
tokenizer.fit_on_texts(text)

In [11]:
# Create a sequence of integer word indices for each text
sequences = tokenizer.texts_to_sequences(text)

In [12]:
# Have a look at the first sequence (a list of integer word indices)
print(sequences[0])

[29260, 21551, 12365, 3328, 4450, 19, 237, 211, 1219, 1, 2781, 7, 2982, 19, 55, 28920, 6087, 3, 1991, 100, 123, 181, 1, 2781, 264, 301, 5176, 1, 3727, 9, 16, 13217, 1255, 3, 6394, 2, 1991, 100, 1500, 131, 499]


In [13]:
# # Check that 0 is unused
# 0 in tokenizer.index_word.keys()

In [14]:
# # Add padding to the sequences
# MAX_SEQUENCE_LENGTH = 200
# padded_sequences = pad_sequences(sequences,
#                                  maxlen=MAX_SEQUENCE_LENGTH,
#                                  padding='post') # Add padding to the end if needs padding

In [15]:
# # Have a look at the first sequence
# print(padded_sequences[0])

In [16]:
# # Add padding to the list of tokens 
# tokenizer.index_word[0] = '<pad>'

In [17]:
# Map every integer index back to its word, giving one list of tokens per text
token_sequences = [
    [tokenizer.index_word[word_index] for word_index in index_sequence]
    for index_sequence in sequences
]

In [18]:
# Have a look at the first sequence of tokens (the words behind the first integer sequence)
print(token_sequences[0])

['sao', 'paulo', 'samba', '2008', 'conducted', 'by', 'john', 'since', '1997', 'the', 'orchestra', 'is', 'defined', 'by', 'its', 'emblematic', 'interpretations', 'of', 'latin', 'american', 'music', 'here', 'the', 'orchestra', 'yet', 'again', 'grips', 'the', 'listener', 'with', 'an', 'electrifying', 'selection', 'of', 'brazilian', 'and', 'latin', 'american', 'classics', 'including', 'w']


### Create the word2vec word vectors
In the original CNN-Kim paper, the author used pre-trained word2vec embeddings released by Google. The paper provides a link to them, but the link is broken, so we'll train our own word2vec embeddings for now.

In [19]:
# Check which word2vec training implementation gensim is using.
# NOTE(review): per the gensim docs FAST_VERSION is -1 when only the slow
# pure-Python fallback is available and >= 0 when the compiled routines are in
# use - it is not the gensim package version. Confirm against the installed release.
FAST_VERSION

1

In [20]:
# Train 200-dimensional word2vec vectors on the token sequences
word_vectors = Word2Vec(
    sentences=token_sequences,
    size=200,    # dimensionality of each word vector
    window=10,   # maximum distance between the current and predicted word within a sentence
    sg=0,        # training algorithm: 0 = continuous bag of words, 1 = skip-gram
    iter=10,     # number of passes over the dataset
    workers=12,  # number of worker threads used for training
)

In [21]:
# Count how many distinct tokens received a vector
len(word_vectors.wv.vocab)

199999

In [22]:
# Check the dimensionality of each word vector
word_vectors.wv.vector_size

200

In [23]:
# Have a look at the words whose vectors are most similar to that of 'action'
word_vectors.wv.most_similar('action')

[('shootouts', 0.6199280023574829),
 ('gameplay', 0.5675157308578491),
 ('storyline', 0.5605520606040955),
 ('adrenaline', 0.5595264434814453),
 ('cutscenes', 0.5381199717521667),
 ('gunplay', 0.5286637544631958),
 ('sequences', 0.5226418972015381),
 ('nonstop', 0.5218201279640198),
 ('thrills', 0.5078117847442627),
 ('showdowns', 0.5074906349182129)]

In [120]:
# Print the learned embedding vector for 'action'
print(word_vectors.wv['action'])

[-1.1571587   0.3755734  -2.413305    0.3791015   0.86517483  1.9800457
 -1.0056863  -4.9373865   1.4850005  -0.5015435   2.7069852  -2.2903554
  2.264299   -0.14577731 -2.9842777  -1.4556262  -0.88341886 -1.1524132
 -2.9301097   1.1619096  -2.22303     2.3586771   1.3999814  -3.0405831
  1.4076226   1.6056384   1.4500986   1.2323626   4.9932084   2.056675
 -0.6050284  -3.3375168   0.81510854 -1.9667872  -5.349025   -0.98436356
 -0.41073835 -0.2500507  -0.24185303  0.76881033  4.458229    0.3492192
 -1.3678042   2.1281562  -1.9773943  -0.9298466  -3.1515818   1.1537467
  2.635891    0.8529285   0.2760713  -1.2654804  -0.27836585 -1.4895313
 -3.5696862   4.4550405  -0.5029396   0.03418916  0.86786187 -1.3967632
 -1.7252482   0.43383378 -1.1031289  -2.651516    0.0401509  -0.78957796
 -0.1825394  -0.6357223  -1.0993905  -0.60318273 -2.6644683  -1.1847291
  1.3930279   3.6436045   1.8687613   1.8572172   2.2359045  -2.9857657
  4.6126146   1.1070437  -2.9841766   1.5190953  -2.7302043   1

In [50]:
# Save the word vectors (the keyed vectors held in word_vectors.wv) to disk
word_vectors.wv.save('../Datasets/Amazon-Cat13K/processed/word_vectors.kv')

### Save a new dataset with the tokenized words

In [129]:
# Create an empty dataframe with the three columns of the tokenized dataset
tokenized_data = pd.DataFrame(columns = ['item_id', 'tokenized_title_and_description', 'labels'])

In [130]:
# Add the data
# token_sequences is a plain list, so it can only be matched to the rows of
# `data` by position. Build all three columns positionally (after checking the
# lengths agree) rather than relying on index alignment with `data`, whose
# index is not guaranteed to be unique or gap-free at this point. Rebuilding
# the frame in one step also makes this cell safe to re-run.
assert len(token_sequences) == len(data), 'token_sequences and data are out of sync'
tokenized_data = pd.DataFrame({
    'item_id': data['item_id'].tolist(),
    'tokenized_title_and_description': token_sequences,
    'labels': data['labels'].tolist(),
})

In [131]:
# Check the shape of the dataframe: (number of rows, number of columns)
tokenized_data.shape

(1494407, 3)

In [132]:
# Preview the first five rows of the tokenized data
tokenized_data.head(5)

Unnamed: 0,item_id,tokenized_title_and_description,labels
0,ID:B0027DQHA0,"[sao, paulo, samba, 2008, conducted, by, john,...","[TV, Classical, Movies & TV, Music]"
1,ID:0756400120,"[past, imperfect, daw, book, collectors, this,...","[Science Fiction, Anthologies & Literary Colle..."
2,ID:B00024YAOQ,"[winning, every, time, how, to, use, the, skil...","[Books, Business & Investing, Business Life, M..."
3,ID:B000BUGXAU,"[nano, cube, 24, gallon, deluxe, just, add, wa...","[Aquariums, Pet Supplies, Fish & Aquatic Pets]"
4,ID:B0007YMWC8,"[en, tijuana, 2005, an, honest, citizen, is, f...","[Movies, Movies & TV]"


In [133]:
# Record how many tokens each row has
tokenized_data['token_count'] = tokenized_data['tokenized_title_and_description'].apply(len)

In [134]:
# Count the rows that ended up with no tokens at all
int((tokenized_data['token_count'] == 0).sum())

43

In [135]:
# Keep only the rows that have at least one token
has_tokens = tokenized_data['token_count'] != 0
tokenized_data = tokenized_data[has_tokens]

In [136]:
# The helper column of token counts is no longer needed
tokenized_data = tokenized_data.drop(columns=['token_count'])

In [137]:
# Check the shape of the dataframe again, now that the rows without tokens are gone
tokenized_data.shape

(1494364, 3)

In [138]:
# Reset the index so rows are numbered 0..n-1 again (drop=True discards the old index)
tokenized_data = tokenized_data.reset_index(drop=True)

In [139]:
# Preview the first five rows after filtering and re-indexing
tokenized_data.head(5)

Unnamed: 0,item_id,tokenized_title_and_description,labels
0,ID:B0027DQHA0,"[sao, paulo, samba, 2008, conducted, by, john,...","[TV, Classical, Movies & TV, Music]"
1,ID:0756400120,"[past, imperfect, daw, book, collectors, this,...","[Science Fiction, Anthologies & Literary Colle..."
2,ID:B00024YAOQ,"[winning, every, time, how, to, use, the, skil...","[Books, Business & Investing, Business Life, M..."
3,ID:B000BUGXAU,"[nano, cube, 24, gallon, deluxe, just, add, wa...","[Aquariums, Pet Supplies, Fish & Aquatic Pets]"
4,ID:B0007YMWC8,"[en, tijuana, 2005, an, honest, citizen, is, f...","[Movies, Movies & TV]"


In [140]:
# Create save_as_csv function
def save_as_csv(df, path):
    """Write df to path as a latin1-encoded csv with a header row and no index column."""
    csv_options = {'header': True, 'index': None, 'encoding': 'latin1'}
    df.to_csv(path, **csv_options)

In [141]:
# Save as csv (broken up into num_files files of roughly equal size)
num_files = 10
size = tokenized_data.shape[0] // num_files
save_path = '../Datasets/Amazon-Cat13K/processed/tokenized'
for file_num in range(num_files):
    start = size * file_num
    # The last file also takes the rows left over by the integer division
    stop = None if file_num == num_files - 1 else start + size
    save_as_csv(tokenized_data[start:stop], save_path + f'_no{file_num + 1}.csv')