# Create the word vectors
We'll use the word2vec module from gensim

In [1]:
# Import the libraries
import pandas as pd
import ast
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec

### Load in the data

In [2]:
# Load in the data
# data_list = [a, b, c, d, e, f, g, h, i ,f] = [None, None, None, None, None, None, None, None, None, None]
data_list = [a] = [None]
data_location = '../Datasets/Amazon-Cat13K/processed/' 
for i in range(len(data_list)):
    data_list[i] = pd.read_csv(data_location + f'first_pass_no{i + 1}.csv', encoding='latin1')
    
# Concatenate all the data
data = pd.concat(data_list, sort=False)

In [3]:
# Convert the labels from string to array (return unique values only)
data['labels'] = data['labels'].apply(lambda labels: list(set(ast.literal_eval(labels))))

In [4]:
# Drop rows with missing values
data = data.dropna(axis=0)

In [5]:
# Create function to join title and description
def join_title_and_description(row):
    return row['title'] + ' ' + row['description']

In [6]:
# Create a new column that combines the title and description
data['title_and_description'] = data.apply(lambda row: join_title_and_description(row), axis=1)

In [7]:
# Have a look at the shape
data.shape

(149446, 5)

In [8]:
# Have a look at the data
data.head(n=5)

Unnamed: 0,item_id,title,description,labels,title_and_description
0,ID:B0027DQHA0,Sao Paulo Samba (2008),"Conducted by John Neschling since 1997, the or...","[Movies & TV, TV, Music, Classical]",Sao Paulo Samba (2008) Conducted by John Nesch...
1,ID:0756400120,Past Imperfect (Daw Book Collectors),"This fast, lightweight anthology of 12 time-tr...","[Science Fiction, Short Stories, Literature & ...",Past Imperfect (Daw Book Collectors) This fast...
2,ID:B00024YAOQ,Winning Every Time: How to Use the Skills of a...,Whether you're hoping to obtain a raise from y...,"[Books, Business & Investing, Business Life, M...",Winning Every Time: How to Use the Skills of a...
3,ID:B000BUGXAU,Nano Cube 24 Gallon Deluxe,Just add water!\tThe Nano Cube is a 24-gallon ...,"[Pet Supplies, Fish & Aquatic Pets, Aquariums]",Nano Cube 24 Gallon Deluxe Just add water!\tTh...
4,ID:B0007YMWC8,Asalto En Tijuana (2005),An honest citizen is forced to steal the world...,"[Movies & TV, Movies]",Asalto En Tijuana (2005) An honest citizen is ...


### Tokenize the data

In [9]:
# Convert from df to list
text = data['title_and_description'].tolist()

In [10]:
# Tokenize the data
# VOCAB_SIZE = 203882 # This is the reported number of features 
VOCAB_SIZE = 10000
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(text)

In [11]:
# Create a squence from the tokens
sequences = tokenizer.texts_to_sequences(text)

In [12]:
# Have a look at the first sequence
print(sequences[0])

[3280, 4417, 19, 239, 214, 1203, 1, 2950, 7, 2916, 19, 55, 6547, 3, 1999, 97, 127, 181, 1, 2950, 265, 304, 5592, 1, 3701, 9, 16, 1267, 3, 6024, 2, 1999, 97, 1474, 131, 498]


In [13]:
# Check that 0 is unused
0 in tokenizer.index_word.keys()

False

In [14]:
# Add padding to the sequences
MAX_SEQUENCE_LENGTH = 300
padded_sequences = pad_sequences(sequences,
                                 maxlen=MAX_SEQUENCE_LENGTH,
                                 padding='post') # Add padding to the end if needs padding

In [15]:
# Have a look at the first sequence
print(padded_sequences[0])

[3280 4417   19  239  214 1203    1 2950    7 2916   19   55 6547    3
 1999   97  127  181    1 2950  265  304 5592    1 3701    9   16 1267
    3 6024    2 1999   97 1474  131  498    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [16]:
# Add padding to the list of tokens 
tokenizer.index_word[0] = '<pad>'

In [17]:
# Convert sequence of integers to sequence of tokens
token_sequences = []
for sequence in padded_sequences:
    token_sequence = []
    for index in sequence:
        token_sequence.append(tokenizer.index_word[index])
    token_sequences.append(token_sequence)

In [18]:
# Have a look at the first sequence of tokens
print(token_sequences[0])

['2008', 'conducted', 'by', 'john', 'since', '1997', 'the', 'orchestra', 'is', 'defined', 'by', 'its', 'interpretations', 'of', 'latin', 'american', 'music', 'here', 'the', 'orchestra', 'yet', 'again', 'grips', 'the', 'listener', 'with', 'an', 'selection', 'of', 'brazilian', 'and', 'latin', 'american', 'classics', 'including', 'w', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', 

### Create the word2vec word vectors
In his original CNN-Kim paper, the author used a pre-trained word2vec embedding developed by Google. They provided the link but it's broken. So, we'll create our own word2vec embeddings for now.

In [19]:
# Train the word2vec word embeddings (50 dimensions)
word2vec_word_embeddings = Word2Vec(sentences = token_sequences,
                           sg = 0, # 0 for continuous bag of words model, 1 for skip-gram model
                           size = 300, # Dimensionality of the word vectors
                           window = 5, # Maximum distance between the current and predicted word within a sentence
                           workers = 10, # Use these many worker threads to train the model
                           iter = 5) # Number of iterations (epochs) over the corpus

In [20]:
# Check the number of tokens that have been trained for
len(word2vec_word_embeddings.wv.vocab.keys())

9999

In [21]:
# Check the dimensions of each token
word2vec_word_embeddings.wv.vector_size

300

In [22]:
# Have a look at the words that are most similar to 'action'
word2vec_word_embeddings.wv.most_similar('action')

[('gameplay', 0.5068792104721069),
 ('batman', 0.48384609818458557),
 ('adventure', 0.46568042039871216),
 ('suspense', 0.46249258518218994),
 ('sequences', 0.4161718487739563),
 ('combat', 0.4145388603210449),
 ('animation', 0.4104458689689636),
 ('superman', 0.40703117847442627),
 ('mayhem', 0.40631985664367676),
 ('climax', 0.4053933024406433)]

In [23]:
# Have a look at the word embedding for 'action'
print(word2vec_word_embeddings['action']);

[ 4.19863790e-01 -1.51247203e+00 -2.39675546e+00  1.26724958e+00
  1.42668235e+00  1.17790198e+00 -3.58591318e-01  1.96082257e-02
  1.72476992e-01 -1.01584601e+00  5.88264287e-01  3.07916820e-01
 -4.09925699e-01  1.14585117e-01  6.50731381e-05  1.85358375e-01
  8.49683940e-01  1.04901217e-01 -3.70536074e-02  1.98649690e-01
 -2.24202537e-04 -3.78435940e-01 -5.53013645e-02  5.38876534e-01
  5.75046122e-01 -9.45357025e-01  7.90321231e-01 -1.74390852e-01
 -5.28430939e-01  9.73555326e-01  5.63493650e-03  8.87058258e-01
 -1.13490546e+00  2.61756450e-01 -5.24946094e-01  8.54004622e-01
  2.84199357e-01  7.81403780e-01  1.03920424e+00  8.72253656e-01
  1.63375735e+00 -9.58525062e-01  2.76313007e-01  3.03469419e-01
  4.07338619e-01 -1.47377133e-01  1.54876113e-01  2.22706628e+00
  4.91876006e-01  2.67607421e-01  6.86423779e-01 -1.32721737e-01
  2.29413897e-01  1.63715646e-01 -8.65637720e-01 -1.16545832e+00
 -7.34755635e-01  9.45155442e-01  1.64421409e-01  8.67005885e-01
  3.94191265e-01  1.28433

  


In [24]:
# Save the word vectors
word2vec_word_embeddings.wv.save('../Datasets/Amazon-Cat13K/processed/word_vectors.kv')

### Save a new dataset with the tokenized words

In [25]:
# Create empty dataframe
tokenized_data = pd.DataFrame(columns = ['item_id', 'tokenized_title_and_description', 'labels'])

In [26]:
# Add the data
tokenized_data['item_id'] = data['item_id'].copy()
tokenized_data['tokenized_title_and_description'] = token_sequences
tokenized_data['labels'] = data['labels'].copy()

In [27]:
# Have a look at the first 5 rows
tokenized_data.head(n=5)

Unnamed: 0,item_id,tokenized_title_and_description,labels
0,ID:B0027DQHA0,"[2008, conducted, by, john, since, 1997, the, ...","[Movies & TV, TV, Music, Classical]"
1,ID:0756400120,"[machine, but, a, future, version, of, himself...","[Science Fiction, Short Stories, Literature & ..."
2,ID:B00024YAOQ,"[winning, every, time, how, to, use, the, skil...","[Books, Business & Investing, Business Life, M..."
3,ID:B000BUGXAU,"[nano, cube, 24, gallon, deluxe, just, add, wa...","[Pet Supplies, Fish & Aquatic Pets, Aquariums]"
4,ID:B0007YMWC8,"[en, 2005, an, honest, citizen, is, forced, to...","[Movies & TV, Movies]"


In [29]:
# Create save_as_csv function
def save_as_csv(df, path):
    df.to_csv(path, 
              header=True, 
              index=None, 
              encoding='latin1')

In [32]:
# Save as csv (broken up into 5 files)
num_files = 10
size = tokenized_data.shape[0] // num_files
save_path = '../Datasets/Amazon-Cat13K/processed/tokenized'
for file_num in range(num_files):
    if file_num == 0:
        save_as_csv(tokenized_data[:size], save_path + f'_no{file_num + 1}.csv')
    elif file_num == (num_files - 1):
        save_as_csv(tokenized_data[size * file_num:], save_path + f'_no{file_num + 1}.csv')
    else:
        save_as_csv(tokenized_data[size * file_num: size * (file_num + 1)], save_path + f'_no{file_num + 1}.csv')

In [28]:
# from gensim.models import KeyedVectors
# test = KeyedVectors.load('../Datasets/Amazon-Cat13K/processed/word_vectors.kv', mmap='r')
# test['action']