In [0]:
import numpy as np # matrix manuplation library
np.random.seed(1000)
import tensorflow as tf # tensorflow gpu or cpu based neural network library
import string
import os

In [1]:
!wget "https://raw.githubusercontent.com/StarBoy01/IndabaX-Sudan-2019/master/nlp/TextClassification.zip"
!unzip "TextClassification.zip"

--2019-10-25 07:54:32--  https://raw.githubusercontent.com/StarBoy01/IndabaX-Sudan-2019/master/nlp/TextClassification.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24064147 (23M) [application/zip]
Saving to: ‘TextClassification.zip’


2019-10-25 07:54:33 (226 MB/s) - ‘TextClassification.zip’ saved [24064147/24064147]

Archive:  TextClassification.zip
   creating: bbc/
   creating: bbc/business/
  inflating: bbc/business/001.txt    
  inflating: bbc/business/002.txt    
  inflating: bbc/business/003.txt    
  inflating: bbc/business/004.txt    
  inflating: bbc/business/005.txt    
  inflating: bbc/business/006.txt    
  inflating: bbc/business/007.txt    
  inflating: bbc/business/008.txt    
  inflating: bbc/business/009.txt    
  inflating: bbc/business/0

In [0]:
# The BBC news articles are organized into these five classes.
# The folders of the dataset are named after the classes.
folders = [
    'tech',
    'business',
    'entertainment',
    'sport',
    'politics',
]

In [0]:
# Articles grouped by topic: every key is one of the class names in `folders`,
# every value starts as its own empty list that will receive the article
# texts belonging to that class.
data = dict((topic, []) for topic in folders)

In [0]:
# Read every article, strip punctuation from it, and store the cleaned text in
# `data` under its topic. `vocab` maps every space-separated token seen in the
# corpus to itself.
vocab = {}
exclude = set(string.punctuation)
for folder in folders:
    # Relative path: the archive is unzipped into the current working directory.
    folder_path = os.path.join('bbc', folder)
    texts = []
    # os.listdir returns entries in arbitrary order; sort them so the corpus
    # order (and therefore the seeded shuffle later on) is reproducible.
    for file in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, file)
        # read() returns a single string, so the filter below runs per character.
        # (readlines() yields whole lines, so punctuation inside them was never
        # removed.) The context manager closes the file handle.
        # NOTE(review): 'unicode_escape' also interprets backslash sequences;
        # a plain single-byte encoding may be what is intended - confirm.
        with open(file_path, encoding='unicode_escape') as article:
            text = article.read()
        text = ''.join(ch for ch in text if ch not in exclude)
        texts.append(text)
        for word in text.split(' '):
            vocab[word] = word
    # Assign instead of append so re-running this cell does not duplicate articles.
    data[folder] = texts
            

In [0]:
# Load the word vectors from the text file `bbc_vec`.
# Layout as parsed below: a header line "<vocab_size> <embed_size>", then one
# line per word holding the word followed by its vector components, separated
# by single spaces.
word2vec = {}
with open('bbc_vec', encoding='utf-8') as wordvec_file:
    # split() ignores the trailing newline, so no character has to be chopped off.
    v, e = wordvec_file.readline().split()
    vocab_size, embed_size = int(v), int(e)
    for i in range(vocab_size):
        line = wordvec_file.readline()
        if not line:
            # The file is shorter than its header claims; stop instead of
            # storing an empty word with an empty vector.
            break
        # rstrip() removes the newline (when present) and trailing blanks, so a
        # final line without newline keeps its last digit and a trailing space
        # does not end up as an empty string passed to float().
        line = line.rstrip().split(' ')
        word = line[0]
        vec = [float(x) for x in line[1:]]
        word2vec[word] = np.array(vec)


In [0]:
def get_mean(words, word2vec):
    """
    Return the unit-length mean of the vectors of the known words.

    words: iterable of words; words that are not in word2vec are ignored
    word2vec: a dictionary that maps a word to its vector

    Returns the mean vector divided by its Euclidean norm, or None when no
    word has a vector or when the mean is the zero vector (it has no
    direction that could be normalized).
    """
    vecs = [word2vec[word] for word in words if word in word2vec]
    if not vecs:
        return None
    vec = np.mean(np.vstack(vecs), axis=0)
    norm = np.linalg.norm(vec)
    if norm == 0:
        # Dividing by a zero norm would hand a NaN vector to the caller.
        return None
    return vec / norm
        

In [0]:
# X collects one mean word vector per article, Y the matching one-hot label
# (position = index of the topic in `folders`). Articles for which no vector
# can be computed are left out of both.
X, Y = [], []
for topic, articles in data.items():  # every topic with its articles
    for article in articles:  # every article of that topic
        vec = get_mean(article.split(' '), word2vec)
        if vec is None:
            continue
        one_hot = [0] * len(folders)
        one_hot[folders.index(topic)] = 1
        Y.append(one_hot)
        X.append(vec)
# Stack the samples into arrays: one row per article.
X = np.vstack(X)
Y = np.array(Y, dtype=np.int32)

In [11]:
# Shuffle samples and labels with one shared random permutation, then use the
# first 70% as training set and the remainder as test set.
indexes = np.arange(len(X), dtype=np.int32)
np.random.shuffle(indexes)
X, Y = X[indexes], Y[indexes]
n_train = int(.7 * len(X))
n_test = len(X) - n_train
print("Train Set: ", n_train, "Test Set: ", n_test)
x_train, x_test = X[:n_train], X[n_train:]
y_train, y_test = Y[:n_train], Y[n_train:]

Train Set:  1557 Test Set:  668


In [0]:
# a generator function that yields a batch of vectors and their labels each time it is advanced
def gen_data(X, Y, batch_size=32):
    """
    Endlessly yield (batch_x, batch_y) mini-batches taken from X and Y.

    X: array of samples, indexable with an integer index array
    Y: array of labels aligned with X
    batch_size: maximum number of samples per batch

    The first pass walks through the data in its given order. Whenever a pass
    is exhausted the index order is reshuffled with np.random.shuffle and the
    next pass starts from the beginning. The last batch of a pass is shorter
    when len(X) is not a multiple of batch_size.
    """
    indexes = np.arange(len(X))
    current = 0
    while True:
        bs = indexes[current:current + batch_size]
        # Gather the batch before a possible reshuffle below changes `indexes`.
        batch_x = X[bs]
        batch_y = Y[bs]
        current += batch_size
        # >= rather than >: when len(X) is an exact multiple of batch_size the
        # position lands exactly on len(X), and continuing from there would
        # yield an empty batch.
        if current >= len(X):
            np.random.shuffle(indexes)
            current = 0
        yield batch_x, batch_y


In [0]:
# Number of samples per training batch.
batch_size = 32
# Endless stream of (vectors, labels) batches drawn from the training set.
gen = gen_data(x_train, y_train, batch_size)
# Number of full batches that fit into the training set.
steps_in_epoch = x_train.shape[0] // batch_size

In [14]:
def base_model(n_classes=None, input_dim=None):
    """
    Build and compile a single-layer softmax classifier.

    n_classes: number of output units; defaults to len(folders) so the model
               always matches the list of classes
    input_dim: length of the input vectors; defaults to embed_size

    Returns a tf.keras.Sequential model with one Dense softmax layer, compiled
    with categorical cross-entropy loss, the 'SGD' optimizer and the accuracy
    metric.
    """
    if n_classes is None:
        n_classes = len(folders)
    if input_dim is None:
        input_dim = embed_size
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(n_classes, input_dim=input_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='SGD', metrics=['accuracy'])
    return model
model = base_model()

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [15]:
# Train for 100 epochs on batches drawn from the generator; the held-out test
# split is passed as validation data.
model.fit_generator(
    gen,
    steps_per_epoch=steps_in_epoch,
    epochs=100,
    validation_data=(x_test, y_test)
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7feefcb117f0>

In [0]:
def predict(text):
    """
    Return the name of the most probable class for a piece of text.

    text: string; it is split on single spaces and the vectors of the words
          found in word2vec are averaged with get_mean

    Returns the entry of `folders` with the highest predicted probability.
    Raises ValueError when get_mean cannot produce a vector for the text
    (for example when none of its words is in word2vec).
    """
    vec = get_mean(text.split(' '), word2vec)
    if vec is None:
        # Without this check the reshape below fails with an opaque AttributeError.
        raise ValueError("cannot classify text: none of its words has a known vector")
    # The model expects a batch dimension: a single sample of length embed_size.
    vec = vec.reshape((1, embed_size))
    probs = model.predict(vec)
    label = probs.argmax()
    return folders[label]

In [17]:
# Quick sanity check: classify a short made-up sentence and display the predicted class.
predict("this is a test news with atheletics")

'sport'