In [2]:
from gensim.models import Word2Vec
import multiprocessing
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from numpy import asarray, zeros
from tensorflow.keras.layers import Embedding, Dense, Activation, Dropout, CuDNNLSTM
from tensorflow.keras import Model, Input, optimizers, callbacks
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.metrics
from sklearn.metrics import classification_report
import numpy as np
from tensorflow.keras.regularizers import l1_l2
import os

In [3]:
train_set = pd.read_csv('train_clean_set.csv')
train_set = train_set[train_set['word_count'] <= 150]

In [4]:
train_set.head()

Unnamed: 0,search_text,label,word_count,length,avg_word
0,áo khoác adidas nu,1,4,18,3.75
1,mỗi thùng bia đều có thẻ cào hả chị,0,9,35,3.0
2,hàng mới về váy đũi nhún eo AMOUNT màu y hình ...,1,12,52,3.416667
3,polo adidas bayer AMOUNT cotton ạ size s âu dà...,1,17,79,3.705882
4,adidas uk AMOUNT cân xanh size AMOUNT hồng siz...,1,10,56,4.7


In [5]:
train_set.describe()

Unnamed: 0,label,word_count,length,avg_word
count,34085.0,34085.0,34085.0,34085.0
mean,0.477101,57.119349,262.530703,3.635064
std,0.499483,36.262552,168.013332,0.465427
min,0.0,0.0,0.0,0.0
25%,0.0,28.0,128.0,3.347826
50%,0.0,53.0,242.0,3.583333
75%,1.0,82.0,373.0,3.857143
max,1.0,150.0,897.0,9.0


In [6]:
sentences = train_set['search_text'].apply(lambda x: str(x).split()).values
print(sentences[:5])

[list(['áo', 'khoác', 'adidas', 'nu'])
 list(['mỗi', 'thùng', 'bia', 'đều', 'có', 'thẻ', 'cào', 'hả', 'chị'])
 list(['hàng', 'mới', 'về', 'váy', 'đũi', 'nhún', 'eo', 'AMOUNT', 'màu', 'y', 'hình', 'AMOUNT'])
 list(['polo', 'adidas', 'bayer', 'AMOUNT', 'cotton', 'ạ', 'size', 's', 'âu', 'dành', 'cho', 'ae', 'AMOUNT', 'ạ', 'giá', 'AMOUNT', 'sẵn'])
 list(['adidas', 'uk', 'AMOUNT', 'cân', 'xanh', 'size', 'AMOUNT', 'hồng', 'size', 'AMOUNT3y'])]


In [25]:
EMB_DIM = 300

w2v = Word2Vec(sentences, size=EMB_DIM, sg=1, window=1,min_count=5,iter=20,workers=multiprocessing.cpu_count(),sorted_vocab=1)

In [26]:
word_vectors = w2v.wv
result = word_vectors.similar_by_word('dell')
print("Most similar:\n", result[:10])
words = list(w2v.wv.vocab)
print('Vocabulary size: %d'%len(words))
filename = 'embedding_word2vec.txt'
word_vectors.save_word2vec_format(filename, binary=False)

Most similar:
 [('acer', 0.6284576058387756), ('mf839', 0.6206846237182617), ('mqd32', 0.583571195602417), ('️laptop', 0.5593997240066528), ('asus', 0.5572055578231812), ('lx', 0.5551203489303589), ('i5', 0.5485771298408508), ('ip7ip8', 0.5358346700668335), ('mtb', 0.5327499508857727), ('a6plusj8', 0.5314415693283081)]
Vocabulary size: 8333


In [27]:
def load_embedding(filename):
    file = open(filename, 'r')
    lines = file.readlines()[1:]
    file.close()
    embedding = dict()
    for line in lines:
        parts = line.split()
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

In [28]:
def get_weight_matrix(embedding, vocab, MAX_WORDS):
    weight_matrix = np.zeros((MAX_WORDS, 300))
    num_loss = 0
    for word, i in vocab.items():
        if i < MAX_WORDS:
            try:
                weight_matrix[i] = embedding[word]
            except:
                num_loss += 1
    print("Number of loss weight: {}".format(num_loss))
    return weight_matrix

In [29]:
MAX_WORDS = 8000
MAX_LEN = 150

In [30]:
vocab = Tokenizer(num_words=MAX_WORDS, oov_token='UNK')
vocab.fit_on_texts(train_set.search_text.astype(str))
vocab_size = len(vocab.word_index) + 1

In [31]:
print(vocab_size)

29347


In [32]:
raw_embedding = load_embedding('embedding_word2vec.txt')
embedding_vectors = get_weight_matrix(raw_embedding, vocab.word_index, MAX_WORDS)
embedding_layer = Embedding(MAX_WORDS,300,weights=[embedding_vectors],input_length=MAX_LEN, trainable=True)

Number of loss weight: 182


In [33]:
def RNN():
    inputs = Input(shape=[MAX_LEN])
    layer = embedding_layer(inputs)
    layer = CuDNNLSTM(64, activity_regularizer=l1_l2(0,0))(layer)
    layer = Dense(256)(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(1)(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    return model

In [34]:
model = RNN()
model.summary()
model.compile(optimizer=optimizers.RMSprop(), loss='binary_crossentropy', metrics=['accuracy', 'Precision', 'Recall'])

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 150)]             0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 300)          2400000   
_________________________________________________________________
cu_dnnlstm_1 (CuDNNLSTM)     (None, 64)                93696     
_________________________________________________________________
dense_2 (Dense)              (None, 256)               16640     
_________________________________________________________________
activation_2 (Activation)    (None, 256)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257 

In [35]:
X_train = pad_sequences(vocab.texts_to_sequences(train_set.search_text.astype(str)),maxlen=MAX_LEN)
Y_train = train_set.label

In [36]:
checkpoint_path = "training/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)


ES = callbacks.EarlyStopping(monitor='val_loss',min_delta=0.0001)
CP = callbacks.ModelCheckpoint(checkpoint_path, save_weights_only=True, monitor="val_acc", mode="max", save_best_only=True, verbose=1)

cplist = [CP]

In [37]:
model.fit(X_train, Y_train, batch_size=128, epochs=20, validation_split=0.2, callbacks=cplist)

Train on 27268 samples, validate on 6817 samples
Epoch 1/20
Epoch 00001: val_acc improved from -inf to 0.86328, saving model to training/cp.ckpt
Epoch 2/20
Epoch 00002: val_acc improved from 0.86328 to 0.93091, saving model to training/cp.ckpt
Epoch 3/20
Epoch 00003: val_acc improved from 0.93091 to 0.94088, saving model to training/cp.ckpt
Epoch 4/20
Epoch 00004: val_acc did not improve from 0.94088
Epoch 5/20
Epoch 00005: val_acc did not improve from 0.94088
Epoch 6/20
Epoch 00006: val_acc did not improve from 0.94088
Epoch 7/20
Epoch 00007: val_acc improved from 0.94088 to 0.94264, saving model to training/cp.ckpt
Epoch 8/20
Epoch 00008: val_acc improved from 0.94264 to 0.94587, saving model to training/cp.ckpt
Epoch 9/20
Epoch 00009: val_acc did not improve from 0.94587
Epoch 10/20
Epoch 00010: val_acc did not improve from 0.94587
Epoch 11/20
Epoch 00011: val_acc improved from 0.94587 to 0.94836, saving model to training/cp.ckpt
Epoch 12/20
Epoch 00012: val_acc did not improve from

<tensorflow.python.keras.callbacks.History at 0x7fb4f9eda908>

In [38]:
!ls {checkpoint_dir}

checkpoint		     cp.ckpt.data-00001-of-00002
cp.ckpt.data-00000-of-00002  cp.ckpt.index


In [39]:
test_dataset = pd.read_csv('test_clean_set.csv')
X_test = test_dataset.search_text.astype(str)
Y_test = test_dataset.label
X_test_matrix = pad_sequences(vocab.texts_to_sequences(X_test), maxlen=MAX_LEN)

In [40]:
accr = model.evaluate(X_test_matrix, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n  Recall: {:0.3f}\n  Precision: {:0.3f}'.format(accr[0],accr[1],accr[2],accr[3]))

Test set
  Loss: 0.680
  Accuracy: 0.923
  Recall: 0.923
  Precision: 0.920


In [41]:
Y_predicted = model.predict(X_test_matrix).transpose()[0].round()
print(classification_report(Y_test.values, Y_predicted, digits=4))

              precision    recall  f1-score   support

           0     0.9222    0.9251    0.9237      5049
           1     0.9234    0.9204    0.9219      4951

    accuracy                         0.9228     10000
   macro avg     0.9228    0.9228    0.9228     10000
weighted avg     0.9228    0.9228    0.9228     10000



In [42]:
model.load_weights(checkpoint_path)
Y_predicted = model.predict(X_test_matrix).transpose()[0].round()
print(classification_report(Y_test.values, Y_predicted, digits=4))

              precision    recall  f1-score   support

           0     0.9306    0.9220    0.9263      5049
           1     0.9212    0.9299    0.9255      4951

    accuracy                         0.9259     10000
   macro avg     0.9259    0.9259    0.9259     10000
weighted avg     0.9259    0.9259    0.9259     10000

