In [1]:
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

In [2]:
# the dataset path
TEXT_DATA_DIR = r'data'
#the path for Glove embeddings
GLOVE_DIR = r'embed'
# make the max word length to be constant
MAX_WORDS = 10000
MAX_SEQUENCE_LENGTH = 1000
# the percentage of train test split to be applied
VALIDATION_SPLIT = 0.20
# the dimension of vectors to be used
EMBEDDING_DIM = 100
# filter sizes of the different conv layers 
filter_sizes = [2,3,4]
num_filters = 512
embedding_dim = 100
# dropout probability
drop = 0.5
batch_size = 50
epochs = 20

In [3]:
## preparing dataset


texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                if sys.version_info < (3,):
                    f = open(fpath)
                else:
                    f = open(fpath, encoding='latin-1')
                t = f.read()
                i = t.find('\n\n')  # skip header
                if 0 < i:
                    t = t[i:]
                texts.append(t)
                f.close()
                labels.append(label_id)
# print(labels_index)

# print('Found %s texts.' % len(texts))

In [4]:
import pandas as pd


texts = pd.read_csv('data/yelp_1.csv')[:1000000]

In [5]:
texts

Unnamed: 0.1,Unnamed: 0,text,stars
0,0,apparently prides osteria had a rough summer a...,4
1,1,this store is pretty good not as great as walm...,4
2,2,i called wvm on the recommendation of a couple...,5
3,3,ive stayed at many marriott and renaissance ma...,2
4,4,the food is always great here the service from...,4
...,...,...,...
999995,1000001,this was my first time at seasons 52 and i hav...,4
999996,1000002,ive lived two doors south of here for 6 years ...,5
999997,1000003,love this place i always order the guac burger...,5
999998,1000004,ill be honest i really enjoyed the laser tag i...,4


In [6]:
labels = texts['stars']
texts = texts['text']
print('Labels length: ', len(labels))
print('Text length: ', len(texts))

Labels length:  1000000
Text length:  1000000


In [7]:
tokenizer  = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(texts)
sequences =  tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
print(labels)

unique words : 486228
Shape of data tensor: (1000000, 1000)
Shape of label tensor: (1000000, 6)
[[0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 ...
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]]


In [8]:
# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [9]:
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8')
for line in f:
    line = str(line)
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [10]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [11]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

In [12]:
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding = embedding_layer(inputs)

print(embedding.shape)
reshape = Reshape((MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,1))(embedding)
print(reshape.shape)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=20, activation='softmax')(dropout)
output = Dense(units=6, activation='softmax')(output)
# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


(None, 1000, 100)
(None, 1000, 100, 1)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1000)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1000, 100)    48622900    input_1[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 1000, 100, 1) 0           embedding[0][0]                  
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 999, 1, 512)  102912      reshape[0][0]                    
_______________________________________________________



In [None]:
print("Traning Model...")
history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, callbacks=[checkpoint], validation_data=(x_val, y_val))

Traning Model...
Epoch 1/20


 2265/16000 [===>..........................] - ETA: 14:38 - loss: 1.5174 - accuracy: 0.4711- ETA: 16:52 - loss: 1.7279 - accuracy: 0. - ETA: 16:51 - loss: 1.7193 - accuracy: 0.3 - ETA: 16:50 - loss: 1.7131 - accuracy: 0.3 - ETA: 16:49  - ETA: 16:42 - loss: 1.6697 - accuracy: - ETA: 16:42 - loss: 1.6656 - accuracy:  - ETA: - ETA: 16:40 - loss: 1.6537 - accuracy: 0. - ETA: 16:40 - loss: 1.6536 - accuracy: 0. - ETA: 16:40 - loss: 1.6543 - accuracy: 0.42 - ETA: 16:39 - loss: 1.6544 - accurac - ETA: 16:39 - loss: 1.6507 - accuracy: 0.4 - ETA: 16:39 - loss: 1. - ETA: 16:35 - loss: 1.6449 -  - ETA:  - ETA: 16:33 - loss: 1.6409 - accuracy:  - ETA: 16:30 - loss: 1.6369 - accuracy:  - ETA: 16:30 - loss: 1.63 - ETA: 16:26 - loss: 1.6294 - accuracy: 0. - ETA: 16:26 - loss: 1.6290 - accuracy: 0.4 - ETA: 16:26 - loss: 1.6286 - accuracy: - ETA: 16:26 - loss: 1.6278 - accuracy: 0 - ETA: 16:26 - loss: 1.6272 - accuracy: 0.432 - ETA: 16:26 - loss: 1.6270 - accura - E - ETA: 16:23 -  - ETA: 16:21 - loss 

 2730/16000 [====>.........................] - ETA: 14:10 - loss: 1.4961 - accuracy: 0.4835- ETA: 14:38 - loss: 1.516 - ETA: 14:37 - loss: 1.5158 - accur - ETA: 14:36 - loss: 1.5152 - accuracy: 0.472 - ETA: 14:36 - loss: 1.5152 - ac - ETA: 14:35 - loss: 1.5146 - accuracy: 0.4 - ETA: 14:35 - loss: 1.5145 - accuracy: 0.4 - ETA: 14:35 - loss: 1.5144 - accura - ETA: 14:34 - loss: 1.5139 - acc - ETA: 14:33 - loss: 1.5134 - accur - ETA: 14:33 - loss: 1.5129 - - ETA: 14:31 - loss: 1. - ETA: 14:30 - loss: 1.5111 - accuracy: 0.4 - ETA: 14:30 - loss: 1.5109 - accuracy: 0.475 - ETA: 14:30 - loss: 1.5108 - accuracy: 0. - ETA: 14:30 - loss: 1.5106 - accura - ETA: 14:29 -  - ETA: 14:27 - loss: 1.5087 - accu - ETA: 14:26 - loss: 1.5084 - accuracy - ETA: 14:26 - loss: 1.5080 - accuracy: 0 - ETA: 14:26 - loss: 1.5079 - accuracy: 0 - ETA: 14:25 - loss: 1. - ETA: 14:24 - loss: 1.5066 - accuracy: 0.4 - ETA: 14:24 - loss - ETA: 14:22 - loss: 1.5053 - - ETA: 14:21 - loss: 1.5045 - accuracy:  - ETA: 14:21 - 