In [370]:
import os
import pickle
import random
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model

In [371]:
# Dataset locations, assembled from path components so the separator matches the host OS.
bbc = os.path.join("data_set", "bbc")
bbc_business = os.path.join(bbc, "business")

# Hyper-parameters shared by the preprocessing and model cells.
WORD_EMBEDDING_DIMENSION = 50  # width of each word vector; also selects the GloVe file
MAX_VOCAB_SIZE = 10000         # passed to the Tokenizer as num_words
MAX_LENGTH = 500               # every document is padded to this many tokens
NUM_CLASSES = 5                # size of the softmax output layer

In [392]:
# Module-level accumulators filled by preprocess_data(); they are reset every time
# this cell is re-run.
contents = []
labels = []

def preprocess_data(folder_path):
    """Load and normalise every document found in the sub-folders of ``folder_path``.

    The folder that directly contains a file supplies that file's label. Files
    lying directly inside ``folder_path`` itself are skipped, since they belong
    to no category. Results are appended to the module-level ``contents`` and
    ``labels`` lists; nothing is returned.

    Normalisation applied to each document, in order: strip surrounding
    whitespace, delete every character in ``string.punctuation``, replace each
    run of newlines with one space, lowercase.

    Args:
        folder_path: Root directory whose sub-folders are the categories.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)

    for dir_path, dir_names, file_names in os.walk(folder_path):
        # Sorting in place makes os.walk descend in a stable order, so the
        # resulting lists do not depend on the filesystem's listing order.
        dir_names.sort()

        # Skip the root that was actually passed in (not a hard-coded path).
        if dir_path == folder_path:
            continue

        print(f"{len(file_names)} files in {dir_path} have been loaded")
        category = os.path.basename(dir_path)
        for file_name in sorted(file_names):
            file_path = os.path.join(dir_path, file_name)
            with open(file_path, "r", encoding="ISO-8859-1") as f:
                content = f.read().strip()

            content = content.translate(punctuation_remover)
            content = re.sub(r"(\n)+", " ", content)
            content = content.lower()

            contents.append(content)
            labels.append(category)
                    
# Fill `contents` / `labels` from the category folders under the BBC corpus root.
preprocess_data(bbc)

510 files in data_set\bbc\business have been loaded
386 files in data_set\bbc\entertainment have been loaded
417 files in data_set\bbc\politics have been loaded
511 files in data_set\bbc\sport have been loaded
401 files in data_set\bbc\tech have been loaded


In [373]:
# Corpus size and per-document length statistics, where a document's length is
# its number of whitespace-separated tokens.
print(f"we have total of {len(contents)} training data")

nums = np.array([len(content.split()) for content in contents])

max_num_of_words = nums.max()
min_num_of_words = nums.min()
total_num_of_words = nums.sum()
average_num_of_words = total_num_of_words // len(contents)

# How many documents would fit entirely under each candidate length cut-off.
for threshold in (500, 600, 700, 800, 900, 1000, 1200, 1400, 1600):
    shorter_count = int(np.count_nonzero(nums < threshold))
    print(f"{shorter_count} of paragraph has number of words less than {threshold}")

print(f"max number of words: {max_num_of_words}")
print(f"min number of words: {min_num_of_words}")
print(f"number of words: {total_num_of_words}")
print(f"average number of words: {average_num_of_words}")

we have total of 2225 training data
1764 of paragraph has number of words less than 500
1982 of paragraph has number of words less than 600
2095 of paragraph has number of words less than 700
2146 of paragraph has number of words less than 800
2191 of paragraph has number of words less than 900
2203 of paragraph has number of words less than 1000
2210 of paragraph has number of words less than 1200
2216 of paragraph has number of words less than 1400
2217 of paragraph has number of words less than 1600
max number of words: 4416
min number of words: 89
number of words: 851028
average number of words: 382


In [386]:
# Hold out 15% of the documents for evaluation. The fixed seed makes the split
# identical after every kernel restart, and stratifying on the labels keeps each
# category proportionally represented in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    contents,
    labels,
    test_size=0.15,
    random_state=42,
    stratify=labels,
)

In [375]:
# Turn the category names into indicator vectors. The binarizer learns its class
# list from the training labels only and is then reused as-is for the held-out labels.
labelBinarizer = LabelBinarizer().fit(Y_train)
Y_train = labelBinarizer.transform(Y_train)
Y_test = labelBinarizer.transform(Y_test)

In [376]:
# Build the vocabulary from the training texts only, so held-out documents
# contribute no words to it.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X_train)
training_word_to_index = tokenizer.word_index

# Texts -> lists of integer token ids.
sequences_train, sequences_test = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Token id lists -> fixed-width arrays, zero-padded at the end up to MAX_LENGTH.
X_train, X_test = (
    np.array(sequence.pad_sequences(token_ids, maxlen=MAX_LENGTH, padding='post'))
    for token_ids in (sequences_train, sequences_test)
)

In [377]:
def define_embedding_layer():
    """Build a Keras ``Embedding`` layer initialised from pre-trained GloVe vectors.

    Reads ``word_embedding/glove.6B.<WORD_EMBEDDING_DIMENSION>d.txt``, where each
    line holds a word followed by its vector components, and copies the vector
    of every word in ``training_word_to_index`` into the row given by that
    word's index. Words absent from the GloVe file keep an all-zero row.

    Returns:
        The ``Embedding`` layer, sized for inputs of length ``MAX_LENGTH``.

    Raises:
        OSError: If the GloVe file cannot be opened.
    """
    print("Loading word vectors...")
    embedding_file_path = os.path.sep.join(
        ["word_embedding", "glove.6B.{}d.txt".format(WORD_EMBEDDING_DIMENSION)]
    )

    word_to_vec = {}
    with open(embedding_file_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            if not values:
                # Tolerate blank lines instead of failing on values[0].
                continue
            word_to_vec[values[0]] = np.array(values[1:], dtype="float32")

    # One row per tokenizer index (index 0 is never assigned to a word by
    # word_index here, hence the +1), and never fewer rows than MAX_VOCAB_SIZE.
    vocab_size = max(
        MAX_VOCAB_SIZE,
        len(training_word_to_index) + 1
    )
    embedding_matrix = np.zeros((vocab_size, WORD_EMBEDDING_DIMENSION))

    # Only words seen in the training set get a pre-trained vector.
    for word, index in training_word_to_index.items():
        word_vec = word_to_vec.get(word)
        if word_vec is not None:
            embedding_matrix[index] = word_vec

    embedding_layer = Embedding(
        vocab_size,
        WORD_EMBEDDING_DIMENSION,
        weights=[embedding_matrix],
        input_length=MAX_LENGTH
    )

    print("Done!")
    # The original returned an undefined name (`encoder_word_embedding_layer`),
    # which raises NameError on a fresh kernel; return the layer built above.
    return embedding_layer

# Build the GloVe-initialised embedding layer once; build_model() reads this global.
training_word_embedding_layer = define_embedding_layer()

Loading word vectors...
Done!


In [379]:
# Sanity check: show the token ids of the first padded training sequence.
print(X_train[0])

[2855   11    1 2856 1317  343 5963   70    3    1  186   10  506   15
  633 4541 3831   37  784    1  704 2126    1  113   49    8    5 2648
    9  395    8  786  147 8250    4 4000 5964  133    2  270    2 4001
  398 6385  393   55  313    6 1172   52    3  213   49    8   82  270
 3326  339 2855   83  113  253 1674  591  453   15 1522 4747 6896   60
  210 3832 6385   99   31 1030  270   23   15 2261 4164 6896  268    9
   12   35  412    2   92   44  317  139   16    1  786 4360   72  186
  441 8251  186    2  271 1675   10  398 4967  537   10 5598    9   19
  787    6    1  315 4542 9181   40 7479 6897   99 5965  270 1052  249
  398    2  335 4543    4 4361 1675   22 1676    4  321  335  597  704
    2  271 5247    7  330 2921    3 6385 2033   59  182 1916 1029   43
 3549 2310 1586   23    9   13    5 1766 1917   36   13 3103  167   17
   12   13   26  260   17   87    6    1 9182 1658    5  252   11    1
 1767    3    1   46 1545   13    2  788    5 3550  858    3 2034   90
   17 

In [380]:
def build_model():
    """Assemble the (uncompiled) LSTM text classifier.

    The graph is: token ids of length ``MAX_LENGTH`` -> shared embedding layer
    -> LSTM(128) -> Dense(256) + ReLU -> batch normalisation -> dropout(0.5)
    -> Dense(``NUM_CLASSES``) + softmax.

    Returns:
        A Keras ``Model`` mapping padded token id sequences to class probabilities.
    """
    inputs = Input(name='inputs', shape=(MAX_LENGTH,))

    # Layers are created in the order they are applied, so automatically
    # generated layer names come out the same as when built one by one.
    layer_pipeline = [
        LSTM(128),
        Dense(256, name='FC1'),
        Activation('relu'),
        BatchNormalization(),
        Dropout(0.5),
        Dense(NUM_CLASSES, name='out_layer'),
        Activation('softmax'),
    ]

    tensor = training_word_embedding_layer(inputs)
    for layer in layer_pipeline:
        tensor = layer(tensor)

    return Model(inputs=inputs, outputs=tensor)

# Instantiate the classifier; later cells compile, fit, and save this object.
model = build_model()

In [381]:
# Write the architecture diagram to disk, then configure training.
# `plot_model` is imported with the other dependencies at the top of the notebook.
plot_model(model, to_file='model1.png')

# Categorical cross-entropy pairs with the softmax output and the indicator-vector targets.
model.compile(
    loss='categorical_crossentropy',
    optimizer=RMSprop(),
    metrics=['acc']
)

In [382]:
# Confirm that features and targets line up in each partition.
for features, targets in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape)
    print(len(targets))

(1891, 500)
1891
(334, 500)
334


In [383]:
# Train the classifier. The returned History is kept so the per-epoch metrics
# stay available for inspection or plotting, instead of being discarded with
# only its object repr echoed as the cell output.
# NOTE(review): the held-out split doubles as validation data here, so the
# validation metrics are not an independent test estimate.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    batch_size=64,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x230772630d0>

In [384]:
# Persist the trained classifier and the fitted tokenizer side by side.
# open(..., 'wb') does not create missing folders, so make sure the target exists.
os.makedirs("./output", exist_ok=True)

model.save("./output/classifier.hdf5")
plot_model(model, to_file='model2.png')

# The tokenizer must be saved too: the model only understands its word -> index mapping.
with open('./output/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [387]:
# Reload the saved classifier from disk, replacing the in-memory model.
model = load_model("./output/classifier.hdf5")

# Clear the in-memory tokenizer first so the name can only refer to the reloaded one.
tokenizer = None
# NOTE(review): unpickling can execute arbitrary code; only load pickle files this notebook wrote itself.
with open('./output/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [388]:
# Reverse lookup table: integer token id -> word, inverted from the tokenizer's vocabulary.
tokenizer_index_word = {
    token_id: token for token, token_id in tokenizer.word_index.items()
}

In [389]:
def _normalize_paragraph(paragraph):
    """Clean raw text with the same steps preprocess_data() applies to corpus files."""
    cleaned = paragraph.strip()
    cleaned = cleaned.translate(str.maketrans("", "", string.punctuation))
    cleaned = re.sub(r"(\n)+", " ", cleaned)
    return cleaned.lower()


def predict_paragraph_category(paragraph):
    """Predict the category name of a single paragraph of text.

    The paragraph is first cleaned the same way the training corpus was
    (punctuation deleted, newline runs collapsed, lowercased), so raw text
    tokenizes the same way as the training documents. Cleaning is idempotent,
    so text that is already cleaned passes through unchanged.

    Args:
        paragraph: The text to classify, as a ``str``.

    Returns:
        The entry of ``labelBinarizer.classes_`` with the highest predicted probability.
    """
    token_ids = tokenizer.texts_to_sequences([_normalize_paragraph(paragraph)])
    seq = sequence.pad_sequences(token_ids, maxlen=MAX_LENGTH, padding='post')
    probabilities = model.predict(seq)
    # A single paragraph is predicted, so argmax over the flattened output is the class index.
    return labelBinarizer.classes_[np.argmax(probabilities)]

In [390]:
# Classify one document from the corpus. `contents` holds the cleaned text; by
# this point `X_train` has been overwritten with padded integer sequences, which
# the tokenizer cannot consume as text on a fresh top-to-bottom run.
predict_paragraph_category(contents[10])

'business'

In [396]:
# Spot-check up to ten randomly chosen documents against their true labels.
# Sampling spans the whole corpus rather than a hard-coded first 2000 indices,
# so documents at the end of the list can be drawn too.
# NOTE(review): the corpus includes the documents the model was trained on, so
# this is a sanity check, not an accuracy estimate.
for index in random.sample(range(len(contents)), min(10, len(contents))):
    content = contents[index]
    label = labels[index]
    print(index)
    print("prediction", predict_paragraph_category(content))
    print("answer", label)
    print("------------")

76
prediction business
answer business
------------
354
prediction business
answer business
------------
772
prediction entertainment
answer entertainment
------------
816
prediction entertainment
answer entertainment
------------
468
prediction business
answer business
------------
982
prediction politics
answer politics
------------
51
prediction business
answer business
------------
828
prediction entertainment
answer entertainment
------------
536
prediction entertainment
answer entertainment
------------
672
prediction entertainment
answer entertainment
------------
