## **Import the needed packages**

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
import os, re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.corpus import words
from sklearn.feature_extraction.text import CountVectorizer
import string
import keras
import tensorflow

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manderson\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manderson\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Using TensorFlow backend.


## **Load the dataset**

In [8]:
#-- The BBC dataset: one row per article; the article text is read from
#-- the `news` column by the cleaning cell further down.
news_data = pd.read_csv('articles.csv')

#-- let's look at some of the data
# The row count must come from `news_data`; no variable called `df` is ever
# defined in this notebook, so the original line raised NameError on a fresh
# kernel.
print("Number of rows : " + str(news_data.shape[0]))
news_data.sample(10)

Number of rows : 2225


Unnamed: 0,news,type
302,Bank holds interest rate at 4.75%\n \n The Ban...,business
1521,Moya suffers shock loss\n \n Fifth seed Carlos...,sport
1391,Mexicans tracking unhappy Juninho\n \n Mexican...,sport
1187,Galloway plea for hostage release\n \n Ex-Labo...,politics
322,Karachi stocks hit historic high\n \n The Kara...,business
1655,Liverpool pledge to keep Gerrard\n \n Liverpoo...,sport
1968,Security warning over 'FBI virus'\n \n The US ...,tech
684,Branson show flops on US screens\n \n Entrepre...,entertainment
1443,Jones doping probe begins\n \n An investigatio...,sport
347,Banker loses sexism claim\n \n A former execut...,business


## **Clean our dataset**

In [9]:
# data cleaning
def clean_line(t):
    """Normalise the whitespace of one raw document and lower-case it.

    The sequence space-newline-space, carriage returns and tabs are each
    replaced by a single space, every run of consecutive spaces is collapsed
    to one space, and the result is stripped and lower-cased. A newline that
    is not surrounded by spaces on both sides is left in place.

    Parameters
    ----------
    t : str
        Raw document text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    flattened = (t.replace(' \n ', ' ')
                  .replace('\r', ' ')
                  .replace('\t', ' '))
    # A single str.replace('  ', ' ') pass only shortens a run of spaces, so
    # three or more consecutive spaces still left a double space behind.
    # Collapsing whole runs in one regex pass removes them completely.
    return re.sub(r' {2,}', ' ', flattened).strip().lower()

#-- Keras text helpers and NLTK's sentence tokenizer used below
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from nltk import tokenize

# Created empty and not filled in this cell.
labels = []
# texts[i]: the cleaned full text of article i.
texts = [clean_line(article) for article in news_data['news']]
# paras[i]: the list of sentences that article i was split into.
paras = [tokenize.sent_tokenize(cleaned) for cleaned in texts]

## **TF-IDF Weight**

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

#-- function to create document term matrix
def createDTM(text):
    """Build a dense TF-IDF document-term matrix as a DataFrame.

    Parameters
    ----------
    text : iterable of str
        The documents to vectorise, one string per document.

    Returns
    -------
    pandas.DataFrame
        One row per input document and one column per vocabulary term found
        by a default-configured ``TfidfVectorizer``; each cell holds the
        TF-IDF weight of that term in that document.
    """
    vect = TfidfVectorizer()
    dtm = vect.fit_transform(text)
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older releases.
    if hasattr(vect, 'get_feature_names_out'):
        terms = vect.get_feature_names_out()
    else:
        terms = vect.get_feature_names()
    #-- create pandas dataframe of DTM
    return pd.DataFrame(dtm.toarray(), columns=terms)

#-- build the TF-IDF document-term matrix from the raw `news` column
#-- (nothing is displayed here; `documents` is looked up per word when the
#-- embedding tensor is filled in a later cell)
documents = createDTM(news_data['news'])

# **GloVe Embeddings**

In [None]:
WORD_SIZE = 300  # number of vector components read for each word

# Maps each word to a float32 vector of WORD_SIZE components.
embeddings_index = {}
# `with` closes the file even when parsing fails part-way through.
with open(os.path.join(os.getcwd(), 'glove.840B.300d.txt'), encoding='UTF-8') as f:
    for line in f:
        # Split from the right so that exactly WORD_SIZE numeric fields are
        # peeled off and everything before them is the token. Splitting from
        # the left mis-parses any token that itself contains a space
        # (NOTE(review): the 840B file is reported to contain a few such
        # tokens - confirm against the downloaded file).
        parts = line.rstrip('\n').rsplit(' ', WORD_SIZE)
        if len(parts) != WORD_SIZE + 1:
            continue  # blank or truncated line: no usable vector
        # Store real floats, not strings, so the vector can later be scaled
        # by a TF-IDF weight.
        embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')

print('Found %s word vectors.' % len(embeddings_index))

# **Preprocessing with Keras**

In [15]:
MAX_SENTENCE_NUM = 9    # sentences kept per document; later sentences are dropped
MAX_WORD_NUM = 40       # words kept per sentence; later words are dropped
MAX_FEATURES = 200000   # only words whose tokenizer index is below this are embedded

# oov_token is a placeholder word that gets added to word_index, so pass a
# string rather than the boolean True.
tokenizer = Tokenizer(num_words=MAX_FEATURES, oov_token='<OOV>')
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index

# Positional access to the TF-IDF weights: resolving a DataFrame column by
# name for every single word is slow, and raises KeyError for words the
# TF-IDF vocabulary does not contain.
tfidf_values = documents.values
tfidf_column = {term: col for col, term in enumerate(documents.columns)}

# create raw data by mapping each word to its embedding*tf-idf weighting
# data[i, j, k] is the weighted vector of word k in sentence j of document i;
# a slot stays all-zero when the word is unknown to the tokenizer, the
# TF-IDF vocabulary or the embedding index, or when the position is padding.
data = np.zeros((len(texts), MAX_SENTENCE_NUM, MAX_WORD_NUM, WORD_SIZE), dtype='float32')
for i, document in enumerate(paras):
    for j, sentence in enumerate(document[:MAX_SENTENCE_NUM]):
        for k, word in enumerate(text_to_word_sequence(sentence)[:MAX_WORD_NUM]):
            rank = word_index.get(word)
            col = tfidf_column.get(word)
            if rank is None or rank >= MAX_FEATURES or col is None or word not in embeddings_index:
                continue
            # np.asarray also converts vectors that were stored as lists of
            # numeric strings, so the multiplication below is always numeric.
            vector = np.asarray(embeddings_index[word], dtype='float32')
            data[i, j, k] = vector * tfidf_values[i, col]

NameError: name 'embeddings_index' is not defined

In [8]:
# One-hot encode the article category, one column per class.
# The sample of `news_data` displayed after loading shows the category column
# is named `type`; `Label` is kept as a fallback for CSV exports that use the
# older column name.
label_column = 'type' if 'type' in news_data.columns else 'Label'
topics = pd.get_dummies(news_data[label_column])
print('Shape of data tensor:', data.shape)
print('Shape of topics tensor:', topics.shape)

Shape of data tensor: (2225, 9, 40)
Shape of topics tensor: (2225, 5)


In [9]:
# train-validation split
VALIDATION_SPLIT = 0.2   # fraction of the documents held out for validation
SPLIT_SEED = 42          # fixed seed so the shuffle is reproducible across runs

# Shuffle features and one-hot labels with the same permutation so rows stay
# aligned.
indices = np.random.RandomState(SPLIT_SEED).permutation(data.shape[0])
data = data[indices]
topics = topics.iloc[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# An explicit boundary avoids the `[:-0]` pitfall, which would yield an
# empty training set if the validation size ever rounded down to zero.
split_at = data.shape[0] - nb_validation_samples

x_train = data[:split_at]
y_train = topics.iloc[:split_at]
x_val = data[split_at:]
y_val = topics.iloc[split_at:]

# Integer class id per document, taken from the position of its one-hot
# column. The ids are derived from the actual column names instead of a
# hard-coded map, so they are defined whatever spelling the labels use.
label_to_id = {label: idx for idx, label in enumerate(topics.columns)}
actual_labels = topics.idxmax(axis=1).map(label_to_id)

# **Attention Layer**

In [12]:
import tensorflow as tf
import os
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras import backend as K

class AttentionLayer(Layer):
    """
    Attention pooling layer as described by "Hierarchical Attention Networks
    for Document Classification" (2016) - Yang et al.
    Source: https://www.cs.cmu.edu/~hovy/papers/16HLT-hierarchical-attention-networks.pdf

    Input:  a 3-D tensor (batch, timesteps, features).
    Output: the attention-weighted sum over the timestep axis, shape
            (batch, features). With ``return_coefficients=True`` the layer
            returns ``[context, coefficients]`` where ``coefficients`` has
            shape (batch, timesteps, 1).

    Parameters
    ----------
    attention_dim : int
        Size of the hidden projection used to score each timestep.
    return_coefficients : bool
        Whether to also return the attention coefficients.
    """
    def __init__(self,attention_dim=100,return_coefficients=False,**kwargs):
        # Imported locally so the class does not rely on a later notebook
        # cell having imported `initializers` first.
        from tensorflow.python.keras import initializers
        super(AttentionLayer, self).__init__(**kwargs)
        # Set after the base constructor so that the base class's own
        # initialisation cannot overwrite the flag.
        self.supports_masking = True
        self.return_coefficients = return_coefficients
        self.init = initializers.get('glorot_uniform') # initializes values with uniform distribution
        self.attention_dim = attention_dim

    def build(self, input_shape):
        # Builds all weights
        # W = Weight matrix, b = bias vector, u = context vector
        assert len(input_shape) == 3
        # int() accepts both a plain int and a TF1 `Dimension`, whereas
        # `.value` only exists on the latter.
        feature_dim = int(input_shape[-1])
        # add_weight registers the variables with the layer, so they are
        # tracked as trainable weights without touching private attributes.
        self.W = self.add_weight(name='W', shape=(feature_dim, self.attention_dim),
                                 initializer=self.init, trainable=True)
        self.b = self.add_weight(name='b', shape=(self.attention_dim,),
                                 initializer=self.init, trainable=True)
        self.u = self.add_weight(name='u', shape=(self.attention_dim, 1),
                                 initializer=self.init, trainable=True)

        super(AttentionLayer, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # The timestep axis is summed away in call(), so no mask is passed on.
        return None

    def call(self, hit, mask=None):
        # Here, the actual calculation is done
        # uit = tanh(hit . W + b): hidden representation of every timestep
        uit = K.bias_add(K.dot(hit, self.W),self.b)
        uit = K.tanh(uit)

        # Score each timestep against the context vector u, then exponentiate
        # (the numerator of a softmax over the timestep axis).
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)

        if mask is not None:
            # Masked timesteps get zero weight before normalisation.
            ait *= K.cast(mask, K.floatx())

        # Normalise over timesteps; epsilon guards against division by zero
        # when every timestep is masked.
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = hit * ait

        if self.return_coefficients:
            return [K.sum(weighted_input, axis=1), ait]
        else:
            return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        if self.return_coefficients:
            # The coefficients carry one weight per timestep, so their middle
            # axis is input_shape[1] (timesteps), not the feature axis.
            return [(input_shape[0], input_shape[-1]), (input_shape[0], input_shape[1], 1)]
        else:
            return input_shape[0], input_shape[-1]

    def get_config(self):
        # Expose the constructor arguments so a model containing this layer
        # can be serialised and rebuilt.
        config = super(AttentionLayer, self).get_config()
        config.update({'attention_dim': self.attention_dim,
                       'return_coefficients': self.return_coefficients})
        return config

# **Model**

In [None]:
EMBED_SIZE = 300
import tensorflow.keras
from tensorflow.python.keras.layers import Embedding
from tensorflow.python.keras.layers import Input
from tensorflow.python.keras.layers import Bidirectional, GRU, Dense
from tensorflow.python.keras.models import Model
from tensorflow.python.keras import initializers
from tensorflow.python.keras.layers import TimeDistributed
from tensorflow.python.keras.layers import Dropout

# One output unit per one-hot label column instead of a hard-coded 5.
NUM_CLASSES = topics.shape[1]

# No Embedding layer is built here: the network consumes vectors that were
# already looked up and TF-IDF weighted when `data` was filled. The previous
# `embedding_layer` was never connected to the model and referenced the
# undefined names `embed_size` and `embedding_matrix`, so the cell raised
# NameError before any model was created.

# Words level attention model
word_input = Input(shape=(MAX_WORD_NUM, WORD_SIZE,), dtype='float32',name='word_input')
word_gru = Bidirectional(GRU(50, return_sequences=True),name='word_gru')(word_input)
word_dense = Dense(100, activation='relu', name='word_dense')(word_gru)
word_att,word_coeffs = AttentionLayer(EMBED_SIZE,return_coefficients=True,name='word_attention')(word_dense)
# Only the pooled sentence vector is exposed; the word coefficients are not
# part of the encoder's outputs.
wordEncoder = Model(inputs = word_input,outputs = word_att)


# Sentence level attention model
sent_input = Input(shape=(MAX_SENTENCE_NUM, MAX_WORD_NUM, WORD_SIZE), dtype='float32',name='sent_input')
print("sent_input ",sent_input)
# Apply the word encoder to every sentence of a document independently.
sent_encoder = TimeDistributed(wordEncoder,name='sent_linking')(sent_input)
sent_gru = Bidirectional(GRU(50, return_sequences=True),name='sent_gru')(sent_encoder)
sent_dense  = Dense(100, activation='relu', name='sent_dense')(sent_gru)
sent_att,sent_coeffs = AttentionLayer(EMBED_SIZE,return_coefficients=True,name='sent_attention')(sent_dense)
sent_drop = Dropout(0.5,name='sent_dropout')(sent_att)
preds = Dense(NUM_CLASSES, activation='softmax',name='output')(sent_drop)

# Model compile
model = Model(sent_input, preds)
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])
print(wordEncoder)
print("-------------------")
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [None]:
# Train for 50 epochs in batches of 400 samples, passing the held-out split
# as validation data; the returned object is kept for the plots below.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=400,
    epochs=50,
    validation_data=(x_val, y_val),
)

In [None]:
# summarize history for accuracy
import matplotlib.pyplot as plt
# Draw the training curve first, then the validation curve, so the order
# matches the legend entries below.
for curve in ('acc', 'val_acc'):
    plt.plot(history.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')

In [None]:
# summarize history for loss
# Draw the training curve first, then the validation curve, so the order
# matches the legend entries below.
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')