In [1]:
# Switch the working directory to the project folder so that the relative
# paths used by later cells ('features/...', 'model/...') resolve against it.
# NOTE(review): the path is relative, so re-running this cell without
# restarting the runtime will presumably fail - confirm before re-executing.
%cd 'drive/My Drive/NLP_project/'

/content/drive/My Drive/NLP_project


In [2]:
import os
import pickle
import numpy as np
import subprocess
import tensorflow as tf
from define import *
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout
from tensorflow.keras.layers import MaxPooling1D, Softmax, Input, Reshape, Flatten, BatchNormalization
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.preprocessing import sequence
from tensorflow import optimizers
from sklearn.preprocessing import LabelEncoder
from VectorizationWord import FeatureExtraction
from keras.utils import np_utils
from glob import glob
import pandas as pd

Using TensorFlow backend.


In [3]:
print('Loading data...')

# Read the three pre-processed splits; the CSV locations are constants that
# come from the star-import of `define`.
train = pd.read_csv(PROCESSED_DATA_TRAIN_CSV)
val = pd.read_csv(PROCESSED_DATA_VAL_CSV)
test = pd.read_csv(PROCESSED_DATA_TEST_CSV)

# Each split provides the text in 'content' and the label in 'category'.
X_train, y_train = train['content'], train['category']
X_val, y_val = val['content'], val['category']
X_test, y_test = test['content'], test['category']

# Report how many rows each split holds.
for split_name, texts in (('train', X_train), ('val', X_val), ('test', X_test)):
    print('X {}: {}'.format(split_name, len(texts)))

Loading data...
X train: 22572
X val: 2822
X test: 2821


In [4]:
print("Label Encoder...")

# Fit the encoder on the TRAINING labels only, then reuse that exact mapping
# for the validation labels. Re-fitting on the validation split (as this cell
# used to do) replaces the encoder's classes with whatever appears in that
# split, so the pickled encoder and the later `lb.inverse_transform` call
# would no longer be guaranteed to match the mapping the model was trained on.
# NOTE: this cell overwrites y_train / y_val with one-hot arrays, so it must
# be run exactly once after the data-loading cell.
lb = LabelEncoder()
y = lb.fit_transform(y_train)
num_classes = len(lb.classes_)

# Passing num_classes pins the one-hot width for both splits, even if the
# validation split happens to miss the highest-numbered class.
# tf.keras is already imported and is the framework the model is built with.
y_train = tf.keras.utils.to_categorical(y, num_classes=num_classes)

# transform() raises ValueError if validation contains a label never seen in
# training, instead of silently inventing a different mapping.
y = lb.transform(y_val)
y_val = tf.keras.utils.to_categorical(y, num_classes=num_classes)

print(y_train.shape)
print(y_val.shape)

# Persist the fitted encoder; the context manager closes the file handle
# deterministically so the pickle is fully flushed to disk.
with open('features/LabelEncoder.p', "wb") as encoder_file:
    pickle.dump(lb, encoder_file)

Label Encoder...
(22572, 6)
(2822, 6)


In [13]:
#@title Vectorization Word Method:
#@markdown
Method ="Word2Vec" #@param ['TF-IDF', 'Word2Vec', 'Weighted Word Vectors']
#@markdown ---
print("Feature Extraction...")

# One extractor per split; only the training extractor is built with
# train=True (see VectorizationWord.FeatureExtraction for what that implies).
features_train = FeatureExtraction(data=X_train, train = True)
features_test = FeatureExtraction(data=X_test)
features_val = FeatureExtraction(data=X_val)

# Numeric code for each selectable method. An unknown `Method` raises
# KeyError here, which is why no fall-through branch is needed below.
method_dict = {
    'TF-IDF' : 0,
    'Word2Vec' : 1,
    'Weighted Word Vectors' : 2
}
VectorizationWordMethod = method_dict[Method]

print('Using {} method to Vectorize'.format(Method))

# For every method code: the FeatureExtraction method to call and whether its
# result is wrapped with np.asarray (the TF-IDF result is used as returned).
_extractor_by_code = {
    0: ('get_features_tfidf', False),
    1: ('get_features_w2v', True),
    2: ('get_features_wwv', True),
}
_getter_name, _as_array = _extractor_by_code[VectorizationWordMethod]


def _vectorize(extractor):
    """Run the selected feature getter on `extractor` and return its features."""
    feats = getattr(extractor, _getter_name)()
    return np.asarray(feats) if _as_array else feats


# Same processing order as before: train, then test, then val.
x_train = _vectorize(features_train)
x_test = _vectorize(features_test)
x_val = _vectorize(features_val)

print('X train: {}'.format(x_train.shape))
print('X val: {}'.format(x_val.shape))
print('X test: {}'.format(x_test.shape))

Feature Extraction...
Using Word2Vec method to Vectorize
Building W2V features


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
100%|██████████| 22572/22572 [00:02<00:00, 10732.47it/s]


Building W2V features


100%|██████████| 2821/2821 [00:00<00:00, 10767.95it/s]


Building W2V features


100%|██████████| 2822/2822 [00:00<00:00, 10743.87it/s]


X train: (22572, 40, 300)
X val: (2822, 40, 300)
X test: (2821, 40, 300)


In [14]:
print('Build model...')


def _conv_pool_branches(source, name_prefix, pool_size):
    """Apply parallel Conv1D + MaxPooling1D branches to `source` and concatenate them.

    One branch is created per kernel width (2, 3 and 4). Every convolution has
    256 filters, 'same' padding, stride 1 and ReLU activation, and is named
    `name_prefix` followed by its kernel width. Each branch is max-pooled with
    `pool_size`, and the pooled branches are merged with a default Concatenate.
    """
    branches = []
    for width in (2, 3, 4):
        features = Conv1D(256,
                          width,
                          padding='same',
                          name=name_prefix + str(width),
                          activation='relu',
                          strides=1)(source)
        branches.append(MaxPooling1D(pool_size=pool_size)(features))
    return Concatenate()(branches)


# Input shape (40, 300) matches the Word2Vec feature arrays reported by the
# feature-extraction cell.
inp = Input(shape=(40,300), dtype='float32')

# Two stacked multi-width convolution stages, pooled by 3 and then by 4.
merged = _conv_pool_branches(inp, 'conv1_', 3)
merged = _conv_pool_branches(merged, 'conv2_', 4)

# Classification head: global max-pool, a hidden dense layer with dropout,
# and one softmax unit per one-hot label column.
pool = GlobalMaxPooling1D()(merged)
dense = Dense(128, activation='relu')(pool)
drop = Dropout(0.5)(dense)
outp = Dense(y_train.shape[1], activation='softmax')(drop)

TextCNN = Model(inputs=inp, outputs=outp)
TextCNN.compile(loss='categorical_crossentropy',
                optimizer='Adam',
                metrics=['accuracy'])

TextCNN.summary()

Build model...
Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, 40, 300)]    0                                            
__________________________________________________________________________________________________
conv1_2 (Conv1D)                (None, 40, 256)      153856      input_4[0][0]                    
__________________________________________________________________________________________________
conv1_3 (Conv1D)                (None, 40, 256)      230656      input_4[0][0]                    
__________________________________________________________________________________________________
conv1_4 (Conv1D)                (None, 40, 256)      307456      input_4[0][0]                    
_____________________________________________________________________________

In [0]:
# Run counter embedded in the checkpoint file names ("model/bestcp_W2V<temp>_...").
# The training cell increments it after every run, so the value in effect
# depends on how many times that cell has been executed since this one.
temp = 6

In [15]:
print('Train...')
# `temp` is the run counter defined in its own cell; it keeps the checkpoints
# of separate training runs apart. Keras fills in {val_accuracy} on each save.
file_path = "model/bestcp_W2V"+str(temp)+"_{val_accuracy:2.2%}.h5"

# Write a checkpoint every time the validation accuracy reaches a new maximum.
check_point = ModelCheckpoint(file_path,
                            monitor="val_accuracy",
                            verbose=1,
                            save_best_only=True,
                            mode="max")

# Stop once validation accuracy has not improved for 5 consecutive epochs.
early_stop = EarlyStopping(monitor="val_accuracy",
                        mode="max",
                        patience=5)

textcnn_history = TextCNN.fit(x_train,
                            y_train,
                            batch_size=128,
                            epochs=50,
                            validation_data=(x_val, y_val),
                            callbacks=[check_point, early_stop])

# Keep only the best checkpoint of this run.
# glob() returns paths in arbitrary order, so "the last element" is not
# reliably the best model. Instead, rebuild the file name ModelCheckpoint used
# for the highest validation accuracy recorded in the training history.
val_acc_history = textcnn_history.history.get('val_accuracy', [])
best_path = None
if val_acc_history:
    best_path = file_path.format(val_accuracy=max(val_acc_history))

if best_path is not None and os.path.exists(best_path):
    h5paths = glob('model/bestcp_W2V'+str(temp)+'_*.h5')
    for path in h5paths:
        if os.path.normpath(path) == os.path.normpath(best_path):
            continue
        # os.remove replaces shelling out to `rm`, and raises if deletion fails.
        os.remove(path)
        print('deleted model: {}'.format(path))
    print('best model: {}'.format(best_path))
else:
    # Nothing identifiable as the best checkpoint: delete nothing rather than guess.
    print('best model: not found, no checkpoint deleted')

# Advance the run counter so the next execution of this cell writes to a new
# set of file names. This makes the cell non-idempotent by design.
temp += 1



Train...
Epoch 1/50
Epoch 00001: val_accuracy improved from -inf to 0.93125, saving model to model/bestcp_W2V8_93.13%.h5
Epoch 2/50
Epoch 00002: val_accuracy improved from 0.93125 to 0.93161, saving model to model/bestcp_W2V8_93.16%.h5
Epoch 3/50
Epoch 00003: val_accuracy improved from 0.93161 to 0.94472, saving model to model/bestcp_W2V8_94.47%.h5
Epoch 4/50
Epoch 00004: val_accuracy improved from 0.94472 to 0.94791, saving model to model/bestcp_W2V8_94.79%.h5
Epoch 5/50
Epoch 00005: val_accuracy did not improve from 0.94791
Epoch 6/50
Epoch 00006: val_accuracy improved from 0.94791 to 0.95216, saving model to model/bestcp_W2V8_95.22%.h5
Epoch 7/50
Epoch 00007: val_accuracy did not improve from 0.95216
Epoch 8/50
Epoch 00008: val_accuracy did not improve from 0.95216
Epoch 9/50
Epoch 00009: val_accuracy did not improve from 0.95216
Epoch 10/50
Epoch 00010: val_accuracy did not improve from 0.95216
Epoch 11/50
Epoch 00011: val_accuracy did not improve from 0.95216
deleted model: model/

In [21]:
print('Test...')

# Parts of each checkpoint path that are stripped when printing the model tag.
_tag_start = len('model/bestcp_')
_tag_stop = -len('.h5')

# Evaluate every saved W2V checkpoint on the held-out test split.
h5paths = glob('model/bestcp_W2V*.h5')
for path in h5paths:
    model = load_model(path)

    # Highest-probability class index per sample, mapped back to label names.
    class_ids = model.predict(x_test).argmax(axis=1)
    y_pred = lb.inverse_transform(class_ids)

    # Fraction of test samples whose predicted label equals the true label.
    matches = y_pred == y_test
    acc = np.average(matches)

    model_tag = path[_tag_start:_tag_stop]
    print("accuracy of model {} is {:2.2%}".format(model_tag, acc))

Test...
accuracy of model W2V3_95.50% is 94.61%
accuracy of model W2V4_95.32% is 94.22%
accuracy of model W2V5_94.51% is 94.22%
accuracy of model W2V1_95.43% is 94.01%
accuracy of model W2V8_95.22% is 94.36%
