In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd /content/drive/My Drive/SellDetectionDatamining

/content/drive/My Drive/SellDetectionDatamining


In [3]:
ls

baomoi.model.bin           results_file.txt              wiki.vi.model.bin
[0m[01;34mdata[0m/                      sell_detection_test.v1.0.txt  word_index.json
embeddings_index-002.json  test_data.csv                 X_train.npy
[01;34mfeatures_extraction[0m/       test_data.json                X_val.npy
[01;34m_ipynb_checkpoints[0m/        test.json                     Y_train.npy
[01;34mpre-process[0m/               train_val.json                Y_val.npy
readme.md                  weights-01-0.405.h5
requirements.txt           weights-01-0.416.h5


In [4]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import GRU
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
import numpy as np
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import json
import csv
import gensim
import pandas as pd
import tensorflow as tf
import keras.backend as K

Using TensorFlow backend.


In [0]:
def load_data_training(path):

  with open(path, 'r') as json_file:
    data = json.load(json_file)
  json_file.close()

  X_train_val, labels = data['text'], data['label']
  # X_train, X_val, Y_train, Y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

  return X_train_val, labels

In [6]:
X_train_val, labels = load_data_training('train_val.json')
print(type(X_train_val[15]))
print(X_train_val[15])

<class 'list'>
['tôi', 'yêu', 'tam_kỳ', 'hội', 'những', 'người', 'bạn', 'tam_kỳ', 'quảng_nam', 'cùng', 'sinh', 'năm', 'nhâm', 'tuất', '1982', 'người', 'quảng_nam']


In [0]:
def get_vocab():
  w2v_model = gensim.models.KeyedVectors.load_word2vec_format('baomoi.model.bin', binary=True)
  vocab = list(w2v_model.vocab.keys())
  word_index = {}
  embeddings_index = {}
  for word in vocab:
    embeddings_index[word] = w2v_model.get_vector(word)
    word_index[word] = vocab.index(word)+1
  return (word_index,embeddings_index)

In [0]:
get_vocab()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [0]:
  print('Total %s word vectors.' % len(embeddings_index))
  print(embeddings_index['la'])
  print(embeddings_index('vui vẻ'))

In [0]:
def save_data():
  with open('word_index.json', 'w') as json_file:
    json.dump(word_index, json_file)
  json_file.close()

  with open('embeddings_index.json', 'w') as json_file:
    json.dump(embeddings_index, json_file)
  json_file.close()

In [0]:
def loadData_Tokenizer(X_train_val, labels, word_index):
    sequences = []
    k = 0 # number of removed label
    for i, text in enumerate(X_train_val):
        sequence = []
        for word in text:
            if word in word_index.keys():
                sequence.append(word_index[word])
        if len(sequence) == 0:
            labels.remove(labels[i-k])
            k += 1
        else:           
          sequences.append(sequence)         
     
    X_train, X_val, Y_train, Y_val = train_test_split(sequences, labels, test_size=0.2, random_state=42)
    return X_train, X_val, Y_train, Y_val

In [0]:
X_train_val, labels = load_data_training('train_val.json')

with open('word_index.json', 'r') as json_file:
  word_index = json.load(json_file)
json_file.close()

X_train, X_val, Y_train, Y_val = loadData_Tokenizer(X_train_val, labels, word_index)
mean = np.sum([len(x) for x in X_train])/len(X_train)
max = np.max([len(x) for x in X_train])
min = np.min([len(x) for x in X_train])
X_val = pad_sequences(X_val, maxlen=100)
X_train = pad_sequences(X_train, maxlen=100)

In [11]:
import matplotlib.pyplot as  plt

print(mean)
print(max)
print(min)
print(np.shape(X_train))
print(np.shape(X_train[1]))
print(X_train[100])


21.74684764275962
5760
1
(37353, 100)
(100,)
[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0  6866 37149  5859   681   102  7035  1597
   695  2173 14020   672]


In [0]:
def build_embedding_matrix(path_embeddings_index, path_word_index, EMBEDDING_DIM=400):
  with open(path_embeddings_index, 'r') as json_file:
    embeddings_index = json.load(json_file)
  json_file.close()

  with open(path_word_index, 'r') as json_file:
    word_index = json.load(json_file)
  json_file.close()

  embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))

  for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) !=len(embedding_vector):
                exit(1)
            embedding_matrix[i] = embedding_vector
  # print(np.shape(embedding_matrix))
  # print(type(embedding_matrix))
  return embedding_matrix

In [0]:
def loss_0(y_true, y_pred):
    smooth = 1e-4
    y_pred_f = tf.dtypes.cast(tf.math.argmax(y_pred, axis=1), tf.float32)
    y_true_f = tf.reshape(y_true, [-1])
    y_pred_f = tf.reshape(y_pred_f, [-1])
    label_0 = tf.reduce_sum((1.0-y_true) * (1.0-y_pred_f))
    score_0 = (label_0+smooth)/(tf.reduce_sum(1-y_true_f)+smooth)
    return 1-score_0

def loss_1(y_true, y_pred):
    smooth = 1e-4
    y_pred_f = tf.dtypes.cast(tf.math.argmax(y_pred, axis=1), tf.float32)
    y_true_f = tf.reshape(y_true, [-1])
    y_pred_f = tf.reshape(y_pred_f, [-1])
    label_1 = tf.reduce_sum(y_true * y_pred_f)
    score_1 = (label_1+smooth)/(tf.reduce_sum(y_true_f)+smooth)
    return 1-score_1

In [0]:
def Build_Model_RCNN_Text(embedding_matrix, nclasses=2, MAX_SEQUENCE_LENGTH=100, EMBEDDING_DIM=400):
    kernel_size = 2
    filters = 256
    pool_size = 2
    gru_node = 256
    
    model = Sequential()
    model.add(Embedding(np.shape(embedding_matrix)[0],
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    model.add(Dropout(0.25))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(Conv1D(filters, kernel_size, activation='relu'))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(gru_node, return_sequences=True, recurrent_dropout=0.2))
    model.add(LSTM(gru_node, recurrent_dropout=0.2))
    model.add(Dense(1024,activation='relu'))
    model.add(Dense(nclasses))
    model.add(Activation('softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=[loss_0, loss_1])
    return model

In [0]:
embedding_matrix = build_embedding_matrix('embeddings_index-002.json', 'word_index.json', 400)

In [16]:
model_RCNN = Build_Model_RCNN_Text(embedding_matrix)
model_RCNN.summary()











Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.



Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 400)          175622800 
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 99, 256)           205056    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 49, 256)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 48, 256)           131328    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 24, 256)           0  

In [0]:
X_train_val, labels = load_data_training('train_val.json')

with open('word_index.json', 'r') as json_file:
  word_index = json.load(json_file)
json_file.close()

X_train, X_val, Y_train, Y_val = loadData_Tokenizer(X_train_val, labels, word_index)

X_val = pad_sequences(X_val, maxlen=100)
X_train = pad_sequences(X_train, maxlen=100)

In [0]:
from keras.callbacks import ModelCheckpoint

filepath="weights-{epoch:02d}-{val_loss:.3f}.h5"
checkpoint = ModelCheckpoint(filepath, monitor='val_loss_1', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [0]:
model_RCNN.fit(X_train, Y_train,
                # steps_per_epoch = len(X_train)//32,
                validation_data=(X_val, Y_val),
                # validation_steps=len(X_val)//32,
                epochs=10,
                batch_size=32,
                callbacks=callbacks_list,
               )

In [0]:
model_RCNN.load_weights('weights-01-0.416.h5')

In [19]:
with open('test_data.json', 'r') as jsonFile:
  data = json.load(jsonFile)['test']
jsonFile.close()
data = [d[0] for d in data if len(d) != 0]

data[:3]

[['béo', 'mà'],
 ['minh', 'la', 'tai', 'nguyen'],
 ['moi', 'bat', 'dc', 'con', 'nay', 'nhau', 'dc', 'hok']]

In [0]:
def loadData_Tokenizer(X_train_val, labels, word_index):
    sequences = []
    k = 0 # number of removed label
    for i, text in enumerate(X_train_val):
        sequence = []
        for word in text:
            if word in list(word_index.keys()):
                sequence.append(word_index[word])
        if len(sequence) == 0:
            labels.remove(labels[i-k])
            k += 1
        else:           
          sequences.append(sequence)         
     
    X_train, X_val, Y_train, Y_val = train_test_split(sequences, labels, test_size=0.2, random_state=42)
    return X_train, X_val, Y_train, Y_val

In [0]:
sequences = []
for d in data:
  sequence = []
  for word in d:
    if word in word_index.keys():
      sequence.append(word_index[word])
  sequences.append(sequence)

In [22]:
sequences[:3]

[[2522, 65],
 [1816, 2377, 2051, 11814],
 [11870, 24986, 8607, 78, 115, 141, 8607, 32923]]

In [0]:
X_test = pad_sequences(sequences, maxlen=100)

In [24]:
model_RCNN.predict(np.array([X_test[3]]))

array([[0.799466  , 0.20053406]], dtype=float32)

In [0]:
predict = np.random.randint(2, size=len(X_test))

In [26]:
predict[:10]

array([1, 0, 1, 0, 1, 1, 0, 0, 1, 0])

In [0]:
with open('results_file.txt','w') as results_file:
  for i in predict:
    if i == 0:
      results_file.write('__label__post_ban_hang\n')
    else:
      results_file.write('__label__post_khong_ban\n')
results_file.close()   