In [25]:
import numpy as np
import os.path
from gensim.models import KeyedVectors
import time
from gensim.models import Word2Vec
import string
import train_embeddings_bbc
import preprocessing_bbc
import prepare_bbc_data
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.initializers import RandomNormal
from keras.regularizers import L2
from utils import create_inf_sents, featurize_X_from_text, featurize_embed_from_df

In [2]:
train_file = preprocessing_bbc.cleaned_train_f
test_file = preprocessing_bbc.cleaned_test_f
val_file = preprocessing_bbc.cleaned_val_f

wv_model = train_embeddings_bbc.load_gensim(train_embeddings_bbc.model_file)
wv_from_text = train_embeddings_bbc.load_embeddings(train_embeddings_bbc.embedding_file)

In [3]:
train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
val_data = pd.read_csv(val_file)

In [4]:
EMBED_SIZE = train_embeddings_bbc.EMBEDDINGS_SIZE

In [5]:
MAX_WORD_SIZE = preprocessing_bbc.MAX_THRESH

In [6]:
train_X = np.zeros((len(train_data), (EMBED_SIZE * MAX_WORD_SIZE)))
test_X = np.zeros((len(test_data), (EMBED_SIZE * MAX_WORD_SIZE)))
val_X = np.zeros((len(val_data), (EMBED_SIZE * MAX_WORD_SIZE)))

train_Y = np.zeros((len(train_data), 1))
test_Y = np.zeros((len(test_data), 1))
val_Y = np.zeros((len(val_data), 1))

In [7]:
print(train_X.shape)
print(test_X.shape)
print(val_X.shape)

(32722, 3250)
(3872, 3250)
(4725, 3250)


In [8]:
featurize_embed_from_df(train_data, train_X, train_Y, wv_from_text)
featurize_embed_from_df(test_data, test_X, test_Y, wv_from_text)
featurize_embed_from_df(val_data, val_X, val_Y, wv_from_text)

In [9]:
print(train_X.shape)
print(test_X.shape)
print(val_X.shape)

print(train_Y.shape)
print(test_Y.shape)
print(val_Y.shape)

(32722, 3250)
(3872, 3250)
(4725, 3250)
(32722, 1)
(3872, 1)
(4725, 1)


In [50]:
# Define the model architecture using Keras Sequential API
class NNModel:
    '''
    Initizializes a 3 layer model with activation functions as 
    softmax at the final layer and relu for other layers.
    
    Uses Adam optimizer and cross entropy loss.
    '''
    def __init__(self, hidden_dim, feature_dim, output_size):
        if output_size <= 0 or output_size is None:
            raise Exception("output size is required to create the model.")
        if feature_dim <= 0 or feature_dim is None:
            raise Exception("feature dim mut be specified.")
        model = Sequential()
        for dim in hidden_dim:
            model.add(Dense(dim, input_dim=feature_dim, 
                            activation='relu',
                            kernel_initializer=RandomNormal(stddev=0.01),
                            kernel_regularizer=L2(0.5),
                           ))
    
        model.add(Dense(output_size, 
                        activation="softmax",
                        kernel_initializer=RandomNormal(stddev=0.01),
                        kernel_regularizer=L2(0.5),
                       ))
        self.model = model
        print(model.summary())
    
    def train(self, trainX, trainY, valX, valY, num_epochs=10, b_size=200, optim=None):
        model = self.model
        if optim is None:
            optim = Adam(learning_rate=0.2)
        model.compile(loss="binary_crossentropy", optimizer=optim, metrics=["accuracy"])
        history = model.fit(trainX, trainY, batch_size=b_size, epochs=num_epochs, verbose=1, validation_data=(valX, valY), shuffle=True, validation_batch_size=b_size)
        #history = model.fit(trainX, trainY, batch_size=b_size, epochs=num_epochs, verbose=1, shuffle=True, validation_batch_size=b_size)
        return history
    
    def test_batch(self, testX):
        '''
        gets predictions for a batch of results.
        '''
        model = self.model
        predictions = model.predict(testX)
        return predictions
    
    def test_single(self, testX):
        '''
        gets prediction for a single instance
        '''
        model = self.model
        predictions = model([np.array(testX).reshape(1,-1)], training=False).numpy()
        return predictions

In [51]:
b_size = 300
num_epochs = 50
model1 = NNModel([64, 32], train_X.shape[1], 1)
moldel1_H = model1.train(train_X, train_Y, val_X, val_Y, num_epochs=num_epochs, b_size=b_size)

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_46 (Dense)            (None, 64)                208064    
                                                                 
 dense_47 (Dense)            (None, 32)                2080      
                                                                 
 dense_48 (Dense)            (None, 1)                 33        
                                                                 
Total params: 210,177
Trainable params: 210,177
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epo

In [56]:
def accuracy(y, y_hat):
    """
    Measure the accuracy of our model, print the results.
    Parameters:
    y (array): true labels
    y (array): model estimates
    Returns:
    None
    """
    count = 0
    for i in range(len(y)):
        guess = 1 if y_hat[i] > 0.5 else 0
        if guess == y[i]:
            count += 1
    print("Accuracy:", round((count / y.shape[0]) * 100, 2))

In [57]:
pred = model1.test_batch(test_X)
accuracy(test_Y, pred.reshape(test_Y.shape))

Accuracy: 41.89


In [58]:
inf_file = prepare_bbc_data.out_test_file
inf_df = pd.read_csv(inf_file)

out_headers = {'article': [],
              'summary': []}

#parent_dir = preprocessing_bbc.parent
#out_file = os.path.join(parent_dir, 'data', 'logr_results.csv')
out_file = 'NN_results.csv'

out_df = pd.DataFrame(out_headers)

inf_start = time.time()
for i in range(len(inf_df)):
    art_txt = inf_df.at[i, 'article'].strip()
    
    orig_sent, prepped_sent = create_inf_sents(art_txt)

    summary = []
    for j, sent in enumerate(prepped_sent):
        inf_X = featurize_X_from_text(sent, wv_from_text)
        pred = model1.test_single(inf_X)
        if pred[0] > 0.5:
            summary.append(orig_sent[j])
    out_df.at[i, 'article'] = art_txt
    out_df.at[i, 'summary'] = " ".join(summary)

out_df.to_csv(out_file, index=False)
print("inference completed for the test set\nTime taken: " + str(time.time()-inf_start))

inference completed for the test set
Time taken: 4.753982067108154
