# **Combine Word2Vec along with CNN for Text Classification**

## **Importing Libraries**

In [1]:
try:
    import tensorflow as tf
except ImportError:
    !pip install tensorflow
    import tensorflow as tf

try:
    import pandas as pd
except ImportError:
    !pip install pandas
    import pandas as pd

try:
    import numpy as np
except ImportError:
    !pip install numpy
    import numpy as np

try:
    from string import digits
except ImportError:
    !pip install string
    from string import digits

try:
    from collections import Counter
except ImportError:
    !pip install collections
    from collections import Counter

try:
    from pyvi import ViTokenizer
except ImportError:
    !pip install pyvi
    from pyvi import ViTokenizer

try:
    from gensim.models.word2vec import Word2Vec
except ImportError:
    !pip install gensim
    from gensim.models.word2vec import Word2Vec

try:
    from keras.utils import to_categorical
except ImportError:
    !pip install keras
    from keras.utils import to_categorical
%matplotlib inline

In [2]:
from utils.helper_function import *

## **Downloading the Dataset**

In [7]:
URLs = {
    "https://drive.google.com/file/d/1q3myiaORcL3fbeks8ExZZcqefFtHthPD/view?usp=drive_link": "datasets/vlsp_sentiment_train.csv",
    "https://drive.google.com/file/d/1jofip_UbAXzzJwrqacVTJ7183mmpBQXe/view?usp=drive_link": "datasets/vlsp_sentiment_test.csv",
}

for key, value in URLs.items():
    download_data(key, value)

Downloading...
From: https://drive.google.com/uc?id=1q3myiaORcL3fbeks8ExZZcqefFtHthPD
To: e:\General_Subjects\Natural Language Processing\Lab-NLP\datasets\vlsp_sentiment_train.csv
100%|██████████| 858k/858k [00:00<00:00, 1.78MB/s]
Downloading...
From: https://drive.google.com/uc?id=1jofip_UbAXzzJwrqacVTJ7183mmpBQXe
To: e:\General_Subjects\Natural Language Processing\Lab-NLP\datasets\vlsp_sentiment_test.csv
100%|██████████| 159k/159k [00:00<00:00, 637kB/s]


Get dataset

In [3]:
data_train = pd.read_csv("datasets/vlsp_sentiment_train.csv", sep='\t')
data_train.columns =['Class', 'Data']
data_test = pd.read_csv("datasets/vlsp_sentiment_test.csv", sep='\t')
data_test.columns =['Class', 'Data']

In [4]:
print(data_train.shape)
print(data_test.shape)

(5100, 2)
(1050, 2)


In [5]:
labels = data_train.iloc[:, 0].values
reviews = data_train.iloc[:, 1].values

### One-hot encoding the labels

In [6]:
encoded_labels = []

for label in labels:
    if label == -1:
        encoded_labels.append([1,0,0])
    elif label == 0:
        encoded_labels.append([0,1,0])
    else:
        encoded_labels.append([0,0,1])

encoded_labels = np.array(encoded_labels)  

In [7]:
reviews_processed = []
unlabeled_processed = [] 
for review in reviews:
    review_cool_one = ''.join([char for char in review if char not in digits])
    reviews_processed.append(review_cool_one)

Use PyVi for Vietnamese word tokenizer

In [8]:
word_reviews = []
all_words = []
for review in reviews_processed:
    review = ViTokenizer.tokenize(review.lower())
    word_reviews.append(review.split())
   

Define the parameters:
- `EMBEDDING_DIM` is the dimension of the word embeddings. It's usually set to 100, 200, 300 or higher. The higher the dimension, the more context the word embeddings can capture, but the more computationally expensive it is to train the model.
- `MAX_SEQUENCE_LENGTH` is the maximum length of the text sequences. Text sequences that are shorter than this are padded with zeros, and sequences that are longer are truncated to this length. This is done to ensure that the input to the model has a consistent shape.
- `VOCAB_SIZE` is the size of the vocabulary. This is the number of unique words in the dataset's vocabulary. It is used to specify the input size of the embedding layer.

In [9]:
EMBEDDING_DIM = 400 # how big is each word vector
MAX_VOCAB_SIZE = 10000 # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH = 300 # max number of words in a comment to use

In [10]:
try:
    from tensorflow.keras.preprocessing.text import Tokenizer
except ImportError:
    !pip install tensorflow
    from tensorflow.keras.preprocessing.text import Tokenizer

try:
    from tensorflow.keras.preprocessing.sequence import pad_sequences
except ImportError:
    !pip install tensorflow
    from tensorflow.keras.preprocessing.sequence import pad_sequences

try:
    from tensorflow.keras.utils import to_categorical
except ImportError:
    !pip install tensorflow
    from tensorflow.keras.utils import to_categorical

Tokenize the text data

In [11]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(word_reviews)
sequences_train = tokenizer.texts_to_sequences(word_reviews)
word_index = tokenizer.word_index

Pad the sequences

In [12]:
data = pad_sequences(sequences_train, maxlen=MAX_SEQUENCE_LENGTH)
labels = encoded_labels

In [13]:
print('Shape of X train and X validation tensor:', data.shape)
print('Shape of label train and validation tensor:', labels.shape)

Shape of X train and X validation tensor: (5100, 300)
Shape of label train and validation tensor: (5100, 3)


Now, we will creaate Word2Vec model

In [18]:
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors

CHECKPOINT = 'checkpoints/vi-model-CBOW.bin'
if not os.path.exists(CHECKPOINT):
    # https://drive.google.com/file/d/1ibVpNvQci2T-phUeV8aT8kfFd8eRkyqL/view?usp=sharing
    download_data(url='https://drive.google.com/file/d/1ibVpNvQci2T-phUeV8aT8kfFd8eRkyqL/view?usp=sharing', output_path=CHECKPOINT, fuzzy=True)

word_vectors = KeyedVectors.load_word2vec_format(CHECKPOINT, binary=True)

In [19]:
vocabulary_size=min(len(word_index)+1,MAX_VOCAB_SIZE)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=MAX_VOCAB_SIZE:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

Create embedding layer.

Actually, we can use the pre-trained Word2Vec model to create the embedding layer. However, in this notebook, we will train the Word2Vec model from scratch.

In [23]:
from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)

In [24]:
from tensorflow.keras.layers import Dense, Input, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Dropout,concatenate
from tensorflow.keras.layers import Reshape, Flatten
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

In [25]:
sequence_length = data.shape[1]
filter_sizes = [3,4,5]
num_filters = 100
drop = 0.5

In [27]:
inputs = Input(shape=(sequence_length,))
embedding = embedding_layer(inputs)
# reshape = Reshape((sequence_length,EMBEDDING_DIM,1))(embedding)

Create Conv1D layes with:
- activation function: ReLU
- kernel regularizer: L2

In [31]:
conv_0 = Conv1D(num_filters, filter_sizes[0],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
conv_1 = Conv1D(num_filters, filter_sizes[1],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)
conv_2 = Conv1D(num_filters, filter_sizes[2],activation='relu',kernel_regularizer=regularizers.l2(0.01))(embedding)

Create max pooling layer

In [32]:
maxpool_0 = MaxPooling1D(sequence_length - filter_sizes[0] + 1, strides=1)(conv_0)
maxpool_1 = MaxPooling1D(sequence_length - filter_sizes[1] + 1, strides=1)(conv_1)
maxpool_2 = MaxPooling1D(sequence_length - filter_sizes[2] + 1, strides=1)(conv_2)

Step by step, we will create the stacked layers to build the model

In [33]:
merged_tensor = concatenate([maxpool_0, maxpool_1, maxpool_2], axis=1)
flatten = Flatten()(merged_tensor)
reshape = Reshape((3*num_filters,))(flatten)
dropout = Dropout(drop)(flatten)
output = Dense(units=3, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)

In [35]:
# this creates a model that includes
model = Model(inputs, output)

adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [36]:
model.summary()

Define early stopping

In [37]:
#define callbacks
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=4, verbose=1)
callbacks_list = [early_stopping]

In [40]:
model.fit(data, labels, validation_split=0.2,
          epochs=10, batch_size=256, callbacks=callbacks_list, shuffle=True)

Epoch 1/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 962ms/step - accuracy: 0.4027 - loss: 7.8806 - val_accuracy: 0.1745 - val_loss: 6.3770
Epoch 2/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 1s/step - accuracy: 0.6195 - loss: 5.6594 - val_accuracy: 0.0853 - val_loss: 6.7725
Epoch 3/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1s/step - accuracy: 0.7140 - loss: 4.6781 - val_accuracy: 0.0510 - val_loss: 6.3010
Epoch 4/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1s/step - accuracy: 0.7847 - loss: 4.0193 - val_accuracy: 0.1127 - val_loss: 5.3863
Epoch 5/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1s/step - accuracy: 0.8451 - loss: 3.4938 - val_accuracy: 0.0686 - val_loss: 5.1837
Epoch 6/10
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 1s/step - accuracy: 0.8796 - loss: 3.0589 - val_accuracy: 0.0873 - val_loss: 4.8096
Epoch 7/10
[1m16/16[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x27946f51a90>

Now let's test the model

In [42]:
labels_test = data_test.iloc[:, 0].values
reviews_test = data_test.iloc[:, 1].values

Also need to one-hot encode the labels for the test set

In [44]:
encoded_labels_test = []

for label_test in labels_test:
    if label_test == -1:
        encoded_labels_test.append([1,0,0])
    elif label_test == 0:
        encoded_labels_test.append([0,1,0])
    else:
        encoded_labels_test.append([0,0,1])

encoded_labels_test = np.array(encoded_labels_test)  

In [45]:
reviews_processed_test = []
unlabeled_processed_test = [] 
for review_test in reviews_test:
    review_cool_one = ''.join([char for char in review_test if char not in digits])
    reviews_processed_test.append(review_cool_one)

In [53]:
# Use PyVi for Vietnamese word tokenizer
word_reviews_test = []
all_words = []
for review_test in reviews_processed_test:
    review_test = ViTokenizer.tokenize(review_test.lower())
    word_reviews_test.append(review_test.split())

In [47]:
sequences_test = tokenizer.texts_to_sequences(word_reviews_test)
data_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
labels_test = encoded_labels_test

In [48]:
print('Shape of X train and X validation tensor:',data_test.shape)
print('Shape of label train and validation tensor:', labels_test.shape)

Shape of X train and X validation tensor: (1050, 300)
Shape of label train and validation tensor: (1050, 3)


In [49]:
score = model.evaluate(data_test, labels_test)

[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.7606 - loss: 2.0981


Let's check the metrics. Currently we support 2 metrics: loss aand compile_metrics
- loss: This is the objective that the model will try to minimize. In this case, we are using binary crossentropy loss, which is suitable for binary classification problems.
- compile_metrics: This is a list of metrics that will be computed for the model. In this case, we are using accuracy as the metric.

In [52]:
print("%s: %.2f%%" % (model.metrics_names[0], score[0]*100))
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

loss: 248.18%
compile_metrics: 59.52%
