In [138]:
import tensorflow as tf
import pandas as pd
import numpy as np
from string import digits
from collections import Counter
from pyvi import ViTokenizer
from gensim.models.word2vec import Word2Vec
from keras.utils.np_utils import to_categorical
import re
import matplotlib.pyplot as plt
%matplotlib inline
import string
import random

In [139]:
data_train = pd.read_csv("Train2.csv")
data_test = pd.read_csv("Test2.csv")
print(data_test.shape)

(1971, 2)


In [140]:
labels = data_train.iloc[:, 1].values
reviews = data_train.iloc[:, 0].values

In [141]:
def pre_processingdata(reviews):
  reviews_processed = []
  for review in reviews:
    review_good_one = ''.join([char for char in review if char not in digits])
    reviews_processed.append(review_good_one)
  word_reviews = []
  clean_reviews = []
  for review in reviews_processed:
    review = ViTokenizer.tokenize(review.lower())
    word_reviews.append(review)
  
  for statement in word_reviews:
    clean = []
    for w in statement.split():
      new_w = w.translate(str.maketrans('','','!#$%^&*<>?,./:;"["]{\}_-+='))
      if (new_w!=''):
        clean.append(new_w)
    clean_reviews.append(clean)
  return clean_reviews

In [142]:
data_train = pre_processingdata(reviews)

In [143]:
encoded_labels = []
for label in labels:
    if label == -1:
        encoded_labels.append([1,0,0])
    else:
        encoded_labels.append([0,0,1])
encoded_labels = np.array(encoded_labels)

In [144]:
EMBEDDING_DIM = 400 # HOW BIG IS EACH WORD VECTOR
MAX_VOCAB_SIZE = 10000 # HOW MANY UNIQUE WORDS TO USE
MAX_SEQUENCE_LENGTH = 300 # MAX NUMBER OF WORDS IN A COMMENT TO USE

In [145]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer  

In [146]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(data_train)
sequences_train = tokenizer.texts_to_sequences(data_train)
word_index = tokenizer.word_index

In [147]:
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
labels = encoded_labels

In [148]:
print("Shape of X_train and X validation tensor:", data.shape)
print("Shape of label train and validation tensor:", labels.shape)

Shape of X_train and X validation tensor: (5307, 300)
Shape of label train and validation tensor: (5307, 3)


In [149]:
labels_test = data_test.iloc[:, 1].values
reviews_test = data_test.iloc[:, 0].values

In [150]:
data_test = pre_processingdata(reviews)

In [151]:
encoded_labels_test = []
for label_test in labels_test:
    if label_test == -1:
        encoded_labels_test.append([1,0,0])
    else:
        encoded_labels_test.append([0,0,1])
encoded_labels_test = np.array(encoded_labels_test)
print(labels_test)

[ 1  1  1 ... -1 -1 -1]


In [152]:
word_reviews_test = pre_processingdata(reviews_test)
print(labels_test)

[ 1  1  1 ... -1 -1 -1]


In [153]:
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = encoded_labels_test
print(labels_test)

[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]]


In [154]:
print("Shape of X_train and X validation tensor:", data_test.shape)
print("Shape of label train and validation tensor:", labels_test.shape)
print(labels_test)

Shape of X_train and X validation tensor: (1971, 300)
Shape of label train and validation tensor: (1971, 3)
[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]]


In [155]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

word_vectors = KeyedVectors.load('Voca-vi.bin')

vocabulary_size=min(len(word_index)+1,MAX_VOCAB_SIZE)
print (vocabulary_size)

word_notexist=[]

embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if i>=MAX_VOCAB_SIZE:
        continue
    try:
        embedding_vector = word_vectors [word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)
        word_notexist.append(word)
  
del (word_vectors)

from keras.layers import Embedding
print(labels_test)
embedding_layer =Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights =[embedding_matrix],
                            trainable=True)
print(labels_test)


10000
[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]]
[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]]


In [156]:
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Embedding
from keras.layers import *
from tensorflow.keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout, concatenate 
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.models import Model
from keras import regularizers
from keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow as tf
print(labels_test)


[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]]


In [157]:

sequence_length = data.shape[1]
filter_sizes = [3,4,5] 
num_filters = 100
drop = 0.5

inputs = Input(shape=(sequence_length,))
embedding =embedding_layer(inputs)
reshape=Reshape((sequence_length, 1, EMBEDDING_DIM))(embedding)

conv_0 = Conv2D(num_filters, (filter_sizes[0], 1), activation='relu', kernel_regularizer=regularizers.l2(0.01))(reshape) 
conv_1 = Conv2D(num_filters, (filter_sizes[1], 1), activation='relu', kernel_regularizer=regularizers.l2(0.01))(reshape) 
conv_2 = Conv2D(num_filters, (filter_sizes[2], 1), activation='relu', kernel_regularizer=regularizers.l2(0.01))(reshape)

maxpool_0 = MaxPooling2D ((sequence_length - filter_sizes[0]+1,1), strides=(1,1))(conv_0)
maxpool_1 = MaxPooling2D ((sequence_length - filter_sizes[1]+1,1), strides=(1,1))(conv_1)
maxpool_2 = MaxPooling2D ((sequence_length - filter_sizes[2]+1,1), strides=(1,1))(conv_2)

merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)
reshape = Reshape((3*num_filters,))(flatten)
dropout = Dropout(drop)(flatten)
output = Dense(units=3, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

cnn_model = Model(inputs, output)

adam = tf.keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, weight_decay=0.0)
cnn_model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['categorical_accuracy'])
cnn_model.summary()
print(labels_test)



Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_4 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 embedding_3 (Embedding)        (None, 300, 400)     4000000     ['input_4[0][0]']                
                                                                                                  
 reshape_6 (Reshape)            (None, 300, 1, 400)  0           ['embedding_3[0][0]']            
                                                                                                  
 conv2d_9 (Conv2D)              (None, 298, 1, 100)  120100      ['reshape_6[0][0]']              
                                                                                            

In [158]:
early_Stopping = EarlyStopping(monitor='loss', min_delta=0.1, patience=4, verbose=1)
callbacks_list = [early_Stopping]
print(labels_test)

[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]]


In [159]:
cnn_model.fit(data, labels, validation_data=(data_test, labels_test), epochs=5 ,batch_size=256, callbacks=callbacks_list, shuffle=True, verbose=2)
print(labels_test)

Epoch 1/5
21/21 - 29s - loss: 6.1537 - categorical_accuracy: 0.6017 - val_loss: 4.6486 - val_categorical_accuracy: 0.7164 - 29s/epoch - 1s/step
Epoch 2/5
21/21 - 26s - loss: 4.4969 - categorical_accuracy: 0.7117 - val_loss: 3.7266 - val_categorical_accuracy: 0.7412 - 26s/epoch - 1s/step
Epoch 3/5
21/21 - 26s - loss: 3.4309 - categorical_accuracy: 0.7795 - val_loss: 3.0797 - val_categorical_accuracy: 0.7489 - 26s/epoch - 1s/step
Epoch 4/5
21/21 - 26s - loss: 2.8031 - categorical_accuracy: 0.8020 - val_loss: 2.5864 - val_categorical_accuracy: 0.7595 - 26s/epoch - 1s/step
Epoch 5/5
21/21 - 26s - loss: 2.3072 - categorical_accuracy: 0.8280 - val_loss: 2.2697 - val_categorical_accuracy: 0.7484 - 26s/epoch - 1s/step
[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [1 0 0]
 [1 0 0]
 [1 0 0]]


In [160]:
data_test
prediction = cnn_model.predict(data_test)
label_text = ['Tich cuc','0', 'Tieu cuc']



In [172]:
for i in range (1800,1830):
    print(np.argmax(prediction[i]))

0
0
0
0
0
0
2
0
2
0
0
0
2
0
2
0
0
0
0
0
0
0
0
0
0
0
2
2
0
2
