# CNN-Kim Model
I can't get it exactly the same now because I'm using custom-trained word vectors instead of pre-trained ones, but I can still try to build a similar architecture.

Since there are different variations of his model, I'll start with a model with the following architecture:
- Using custom-trained word vectors (dimension = 200)
- Keeping those word vectors static during the training process (maybe - TBD while implementing)
- Single-channel
- Filter windows height of 3, 4, 5 with 100 feature maps each
- MaxPooling to extract the features from each feature map
- Dropout rate of 0.5
- ReLU activation function

In [1]:
# Import the libraries
import pandas as pd
import numpy as np
import ast
from gensim.models import KeyedVectors

In [2]:
# For multiprocessing
# from pandarallel import pandarallel
# pandarallel.initialize()

### Load in the data

In [3]:
# Load in the data
# Number of `tokenized_no<k>.csv` shards to read, counting from k = 1.
# NOTE(review): the earlier commented-out variant reserved ten slots, so
# presumably up to 10 shards exist - confirm before raising this.
NUM_DATA_FILES = 1
data_location = '../Datasets/Amazon-Cat13K/processed/'
data_list = [
    pd.read_csv(data_location + f'tokenized_no{file_no}.csv', encoding='latin1')
    for file_no in range(1, NUM_DATA_FILES + 1)
]

# Concatenate all the data and reset the index.
# reset_index() is called without drop=True, so the previous per-file row
# index is kept as an extra `index` column in the resulting frame.
data = pd.concat(data_list, sort=False)
data = data.reset_index()

In [4]:
# Convert stringified arrays (as stored in the CSV) back into Python lists
def _parse_stringified_list(value):
    """Parse a stringified Python literal, leaving already-parsed lists untouched.

    The pass-through for lists keeps this cell safe to re-run: without it, a
    second execution would hand lists to ast.literal_eval, which raises
    ValueError. Any other non-string value still goes to ast.literal_eval and
    fails exactly as before.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


data['tokenized_title_and_description'] = data['tokenized_title_and_description'].apply(_parse_stringified_list)
data['labels'] = data['labels'].apply(_parse_stringified_list)

In [5]:
# Preview the top of the frame (head() shows 5 rows by default)
data.head()

Unnamed: 0,index,item_id,tokenized_title_and_description,labels
0,0,ID:B0027DQHA0,"[sao, paulo, samba, 2008, conducted, by, john,...","[TV, Classical, Movies & TV, Music]"
1,1,ID:0756400120,"[past, imperfect, daw, book, collectors, this,...","[Science Fiction, Anthologies & Literary Colle..."
2,2,ID:B00024YAOQ,"[winning, every, time, how, to, use, the, skil...","[Books, Business & Investing, Business Life, M..."
3,3,ID:B000BUGXAU,"[nano, cube, 24, gallon, deluxe, just, add, wa...","[Aquariums, Pet Supplies, Fish & Aquatic Pets]"
4,4,ID:B0007YMWC8,"[en, tijuana, 2005, an, honest, citizen, is, f...","[Movies, Movies & TV]"


In [6]:
# Check the shape of the loaded frame: (number of rows, number of columns)
data.shape

(149436, 4)

In [7]:
# Load in the custom-trained word vectors.
# mmap='r' memory-maps the stored vector arrays read-only instead of reading
# them fully into RAM, which is why lookups come back as numpy memmap objects.
word_vectors = KeyedVectors.load('../Datasets/Amazon-Cat13K/processed/word_vectors.kv', mmap='r')

In [8]:
# Have a look at one of the word vectors
word_vectors['action']

memmap([-1.1571587 ,  0.3755734 , -2.413305  ,  0.3791015 ,  0.86517483,
         1.9800457 , -1.0056863 , -4.9373865 ,  1.4850005 , -0.5015435 ,
         2.7069852 , -2.2903554 ,  2.264299  , -0.14577731, -2.9842777 ,
        -1.4556262 , -0.88341886, -1.1524132 , -2.9301097 ,  1.1619096 ,
        -2.22303   ,  2.3586771 ,  1.3999814 , -3.0405831 ,  1.4076226 ,
         1.6056384 ,  1.4500986 ,  1.2323626 ,  4.9932084 ,  2.056675  ,
        -0.6050284 , -3.3375168 ,  0.81510854, -1.9667872 , -5.349025  ,
        -0.98436356, -0.41073835, -0.2500507 , -0.24185303,  0.76881033,
         4.458229  ,  0.3492192 , -1.3678042 ,  2.1281562 , -1.9773943 ,
        -0.9298466 , -3.1515818 ,  1.1537467 ,  2.635891  ,  0.8529285 ,
         0.2760713 , -1.2654804 , -0.27836585, -1.4895313 , -3.5696862 ,
         4.4550405 , -0.5029396 ,  0.03418916,  0.86786187, -1.3967632 ,
        -1.7252482 ,  0.43383378, -1.1031289 , -2.651516  ,  0.0401509 ,
        -0.78957796, -0.1825394 , -0.6357223 , -1.0

### Convert sequences to a format that can be used for training

In [9]:
# Pull the token lists out as a 1-D object array (one Python list per row).
# Series.to_numpy() hands back the column's object array without trying to
# stack the lists. np.array(list(...)) instead raises ValueError on
# NumPy >= 1.24 when the lists differ in length, and would silently build a
# 2-D string array if they all happened to share one length.
X_raw = data['tokenized_title_and_description'].to_numpy()

In [10]:
# Have a look at the shape (one entry per row of `data`)
X_raw.shape

(149436,)

In [11]:
# Only a small leading sample of the corpus is vectorized here.
NUM_SEQUENCES_TO_VECTORIZE = 100

# Replace every token by its word vector: each sequence becomes a list
# holding one vector per token.
vectorized_sequences = [
    [word_vectors[token] for token in token_sequence]
    for token_sequence in X_raw[0:NUM_SEQUENCES_TO_VECTORIZE]
]

# The sequences differ in length, so they cannot be stacked into a regular
# array. np.array(<ragged list>) raises ValueError on NumPy >= 1.24, so the
# 1-D object array is allocated up front and filled slot by slot; every
# element stays a plain Python list of vectors.
X = np.empty(len(vectorized_sequences), dtype=object)
for row, vectorized_sequence in enumerate(vectorized_sequences):
    X[row] = vectorized_sequence

In [12]:
import sys

# Size of the `X` array object itself, converted from bytes to megabytes.
# sys.getsizeof only accounts for the object it is given, not for objects it
# refers to, so the per-sequence lists and their vectors are not included.
container_size_bytes = sys.getsizeof(X)
container_size_bytes / 1_000_000

0.000896

In [13]:
# Shape of the vectorized sample: one entry per sequence
X.shape

(100,)

In [15]:
# Peek at the first vectorized sequence (a list with one vector per token)
X[0:1]

array([list([memmap([-0.43538094, -1.4072955 , -1.414434  ,  1.1366837 ,  0.03777919,
         1.3251375 , -0.03762056, -2.1646078 ,  0.02649578,  1.7500367 ,
         1.9820149 , -2.1572933 , -2.327317  ,  0.8916912 , -0.5998995 ,
        -0.73966926,  1.619279  , -0.86041504,  1.280118  , -0.8059234 ,
         0.31256247,  1.6408881 ,  0.25213426, -2.0899847 ,  0.17276576,
        -1.2696799 , -1.0556309 , -0.5114048 ,  0.33222872, -0.80512184,
         2.4693246 , -1.233377  , -0.7111182 ,  0.3425852 ,  0.6439264 ,
         1.5413823 ,  0.19246303,  1.5041468 ,  1.6322325 , -0.94651145,
         0.70242655,  1.1946902 , -2.6861186 ,  1.0783076 , -0.44077417,
        -1.5874454 , -1.915421  ,  0.44575557,  0.49640387, -1.3670912 ,
        -2.5550306 , -0.8470877 ,  2.436401  , -0.4274904 , -0.4350744 ,
         1.8518713 ,  2.0888762 , -0.9078512 , -0.5654248 ,  0.7393392 ,
        -1.1276573 , -0.6047865 ,  0.70109874,  0.5248692 ,  2.7745314 ,
        -0.3420635 ,  4.0432396 , -2.2