In [1]:
import os
import math
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

# Setup

In [2]:
# Load the labelled data, shuffle it once with a fixed seed, and split it into
# disjoint validation and training frames.
# - The seed makes Restart & Run All pick the same validation rows every time.
# - The validation rows are removed from train_df; otherwise the training
#   generator would also draw them and the validation score would be
#   measured on examples the model has already been trained on.
SEED = 42
N_VAL = 400

shuffled_df = pd.read_csv("./input/train.csv").sample(frac=1., random_state=SEED)
val_df = shuffled_df.iloc[:N_VAL]
train_df = shuffled_df.iloc[N_VAL:]
val_df.head()

Unnamed: 0,itemid,title,Category,image_path
605940,944297376,iphone 8 64gb garansi internasional,31,mobile_image/9bc1ddd0d2dffe99376ae6717a790323.jpg
293998,289400312,baju branded murah enfocus black floral layere...,19,fashion_image/c276bf3a21e72efb84c5487a9acadfd7
44550,1378388079,unik wardah everyday luminous liquid foundation,1,beauty_image/6bd7cebd2eb445b5cd42b3267e71a3f1.jpg
58063,548524882,peripera blur pang peach milk spf30 pa,9,beauty_image/954202b23d8aba3dad4262dda3b959e2.jpg
505984,1011955231,kaos polos lengan panjang ft0009,25,fashion_image/50298ad1451aea1f20cba930dcec404e


In [3]:
# Embedding setup: load the pre-trained vectors into a dictionary for easy
# lookup. Each line of the file is a token followed by its space-separated
# coefficients.
embeddings_index = {}
# `with` closes the file even if a line fails to parse; the explicit encoding
# keeps the result independent of the platform's default text encoding.
with open('./input/test-glove.txt', encoding='utf-8') as f:
    for line in tqdm(f):
        # Drop the line terminator so it does not ride along on the last value.
        values = line.rstrip("\n").split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

2196018it [02:20, 15585.44it/s]

Found 2196017 word vectors.





In [4]:
# Convert values to embeddings
def text_to_array(text, max_len=100, emb_dim=300):
    """Turn a title into a fixed-size matrix of word embeddings.

    The text is split on whitespace, truncated to ``max_len`` tokens, and each
    token is looked up in the module-level ``embeddings_index``. Tokens that
    are not in the index, and the padding positions after the last token, are
    represented by an all-zero vector of length ``emb_dim``.

    Parameters
    ----------
    text : str
        The raw title.
    max_len : int, default 100
        Number of token positions (rows) in the result.
    emb_dim : int, default 300
        Length of the zero vector used for unknown tokens and padding; it
        should match the length of the vectors stored in ``embeddings_index``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(max_len, emb_dim)``.
    """
    empty_emb = np.zeros(emb_dim)
    # Use the whole title: slicing off the last character (text[:-1]) mangles
    # the final word of every title so that it no longer matches the vocabulary.
    tokens = text.split()[:max_len]
    embeds = [embeddings_index.get(token, empty_emb) for token in tokens]
    # Right-pad with zero vectors so every title yields the same shape.
    embeds += [empty_emb] * (max_len - len(embeds))
    return np.array(embeds)

In [5]:
# Embed every validation title with text_to_array and stack the results.
val_titles = val_df["title"]
val_vects = np.array([text_to_array(title) for title in val_titles])

# One-hot encode the validation categories: one row per example, 58 columns,
# with a 1 in the column given by the example's Category value.
val_y_labels = np.array(val_df["Category"])
n_val = len(val_y_labels)
val_y = np.zeros((n_val, 58))
val_y[np.arange(n_val), val_y_labels] = 1

In [6]:
# Build a single batch by hand to see what a batch is made of
batch_size = 128
i = 0
batch_rows = slice(i * batch_size, (i + 1) * batch_size)

# Inputs: the column at position 1 for the batch rows, embedded per row
texts = train_df.iloc[batch_rows, 1]
text_arr = np.array([text_to_array(text) for text in texts])

# Targets: one row per example over 58 columns, 1 at the Category index
batch_labels = np.array(train_df["Category"][batch_rows])
batch_targets = np.zeros((batch_size, 58))
batch_targets[np.arange(batch_size), batch_labels] = 1

# Shapes first, then the arrays themselves
print(np.shape(text_arr), np.shape(batch_targets), sep="\n")
print(text_arr, batch_targets, sep="\n")

(128, 100, 300)
(128, 58)
[[[ 0.25687     0.38291001 -0.40415001 ...  0.24779999 -0.77301002
    0.67146999]
  [-0.48943999  0.55361998  0.44624999 ... -0.38089001  0.079228
   -0.001193  ]
  [ 0.83092999  0.19035     0.24209    ... -0.26086    -0.92707998
    0.47852999]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.50954002 -0.16789     0.32336    ...  0.41446     0.22989
    0.44521999]
  [-0.11186    -0.12540001  0.058243   ...  0.0011172   0.38797
    0.56098002]
  [-0.26808     0.20982     0.32896999 ...  0.25751999 -0.01517
   -0.0042005 ]
  ...
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]
  [ 0.          0.          0.         ...  0.          0.
    0.        ]]

 [[ 0.49079001  0.112

In [7]:
# Training generator: yields (inputs, one-hot targets) batches forever,
# reshuffling the data at the start of every pass.
batch_size = 128

def batch_gen(train_df, batch_size=batch_size, n_classes=58):
    """Endlessly yield shuffled training batches.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame with a ``"title"`` column (text) and a ``"Category"`` column
        (integer class index in ``[0, n_classes)``).
    batch_size : int, default 128
        Rows per batch. Bound when the function is defined, so a later
        reassignment of the global ``batch_size`` cannot change the size of
        batches produced by a generator that is already in use.
    n_classes : int, default 58
        Width of the one-hot target rows.

    Yields
    ------
    (numpy.ndarray, numpy.ndarray)
        ``text_arr``: one ``text_to_array`` result per row, stacked.
        ``batch_targets``: array of shape ``(batch_size, n_classes)`` with a
        single 1 per row.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive or ``train_df`` has fewer than
        ``batch_size`` rows. Without this check the ``while True`` loop would
        spin forever without ever yielding.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    # Only full batches are produced; a trailing partial batch is dropped.
    n_batches = len(train_df) // batch_size
    if n_batches == 0:
        raise ValueError(
            "train_df has %d rows, fewer than batch_size=%d; no batch can be produced"
            % (len(train_df), batch_size)
        )
    while True:
        shuffled_df = train_df.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            batch_df = shuffled_df.iloc[i * batch_size:(i + 1) * batch_size]
            # Select columns by name so a change in column order cannot
            # silently feed the wrong column to the model.
            text_arr = np.array([text_to_array(text) for text in batch_df["title"]])
            batch_labels = batch_df["Category"].to_numpy()
            batch_targets = np.zeros((batch_size, n_classes))
            batch_targets[np.arange(batch_size), batch_labels] = 1
            yield text_arr, batch_targets

# Training

In [8]:
from keras.models import Sequential
from keras.layers import CuDNNLSTM, Dense, Bidirectional, Activation

ModuleNotFoundError: No module named 'keras'

In [None]:
# Two stacked bidirectional CuDNN LSTMs (64 units each) over a (100, 300)
# input, followed by a 58-unit dense layer with a softmax activation.
model = Sequential([
    Bidirectional(CuDNNLSTM(64, return_sequences=True),
                  input_shape=(100, 300)),
    Bidirectional(CuDNNLSTM(64)),
    Dense(58),
    Activation('softmax'),
])

# Categorical cross-entropy matches the one-hot targets built for training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Feed the model from the training generator: 20 epochs of 1000 generator
# steps each, with the validation tensors passed as validation data.
mg = batch_gen(train_df)
model.fit_generator(
    mg,
    steps_per_epoch=1000,
    epochs=20,
    verbose=True,
    validation_data=(val_vects, val_y),
)

# Inference

In [None]:
# Make the prediction from the model
def test_batch_gen(test_df, batch_size=256):
    """Yield the embedded titles of ``test_df`` in order, one batch at a time.

    This generator has its own name and its own ``batch_size`` parameter so
    that it does not overwrite the training ``batch_gen`` or the global
    ``batch_size``; re-running the training cell after this one keeps working.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Frame with a ``"title"`` column.
    batch_size : int, default 256
        Maximum rows per batch; the last batch may be smaller.

    Yields
    ------
    numpy.ndarray
        One ``text_to_array`` result per row of the batch, stacked.
    """
    n_batches = math.ceil(len(test_df) / batch_size)
    for i in range(n_batches):
        # Select the text column by name rather than by position.
        texts = test_df["title"].iloc[i * batch_size:(i + 1) * batch_size]
        yield np.array([text_to_array(text) for text in texts])

# NOTE(review): this path differs from the "./input/" prefix used for the
# training data; confirm it matches the environment the notebook runs in.
test_df = pd.read_csv("../input/ndsc-beginner/test.csv")

TEST_BATCH_SIZE = 256
all_preds = []
# Passing `total` lets tqdm show a real progress bar for the generator.
for x in tqdm(test_batch_gen(test_df, batch_size=TEST_BATCH_SIZE),
              total=math.ceil(len(test_df) / TEST_BATCH_SIZE)):
    all_preds.extend(model.predict(x))

In [None]:
# Show the shape of the collected predictions, then take the index of the
# largest value in each prediction row as the predicted category.
print(np.shape(all_preds))
y_te = list(map(np.argmax, all_preds))

# Pair each test item id with its predicted category and write the result
# to submission.csv without the frame's index.
submission_columns = {"itemid": test_df["itemid"], "Category": y_te}
submit_df = pd.DataFrame(submission_columns)
submit_df.to_csv("submission.csv", index=False)

In [None]:
# Display the submission frame (itemid and predicted Category) for a visual check.
submit_df