In [59]:
import numpy as np
import os.path
from gensim.models import KeyedVectors
import time
import string
import train_embeddings_bbc
import preprocessing_bbc
import prepare_bbc_data
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.initializers import RandomNormal
from keras.regularizers import L2
from utils import create_inf_sents, featurize_X_from_text, featurize_embed_from_df

In [60]:
# load preprocessed files
train_file = preprocessing_bbc.cleaned_train_f
test_file = preprocessing_bbc.cleaned_test_f
val_file = preprocessing_bbc.cleaned_val_f

wv_from_text = train_embeddings_bbc.load_embeddings(train_embeddings_bbc.embedding_file)

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
val_data = pd.read_csv(val_file)

In [61]:
EMBED_SIZE = train_embeddings_bbc.EMBEDDINGS_SIZE

MAX_WORD_SIZE = preprocessing_bbc.MAX_THRESH

In [62]:
# initialize features with zeros
train_X = np.zeros((len(train_data), (EMBED_SIZE * MAX_WORD_SIZE)))
test_X = np.zeros((len(test_data), (EMBED_SIZE * MAX_WORD_SIZE)))
val_X = np.zeros((len(val_data), (EMBED_SIZE * MAX_WORD_SIZE)))

train_Y = np.zeros((len(train_data), 1))
test_Y = np.zeros((len(test_data), 1))
val_Y = np.zeros((len(val_data), 1))

In [63]:
# fill the features based on the learned embeddings
featurize_embed_from_df(train_data, train_X, train_Y, wv_from_text)
featurize_embed_from_df(test_data, test_X, test_Y, wv_from_text)
featurize_embed_from_df(val_data, val_X, val_Y, wv_from_text)

In [64]:
print("Shape of train X:")
print(train_X.shape)
print("Shape of train Y:")
print(train_Y.shape)

print("Shape of test X:")
print(test_X.shape)
print("Shape of test Y:")
print(test_Y.shape)

print("Shape of val X:")
print(test_X.shape)
print("Shape of val Y:")
print(test_Y.shape)

Shape of train X:
(32722, 3250)
Shape of train Y:
(32722, 1)
Shape of test X:
(3872, 3250)
Shape of test Y:
(3872, 1)
Shape of val X:
(3872, 3250)
Shape of val Y:
(3872, 1)


In [65]:
# Define the model architecture using Keras Sequential API
class NNModel:
    '''
    Initizializes a model with number of hidden layers as specified in the hidden_dim list
    with activation functions as softmax at the final layer and relu for other layers.
    
    Uses Adam optimizer and binary cross entropy loss.
    '''
    def __init__(self, hidden_dim, feature_dim):
        '''
        Parameters:
            hidden_dim (list): each item specified the number of nodes in each hidden layer
            feature_dim (int): number of features in a sample
        '''
        
        if feature_dim <= 0 or feature_dim is None:
            raise Exception("feature dim mut be specified.")
        output_size = 1
        model = Sequential()
        for dim in hidden_dim:
            model.add(Dense(dim, input_dim=feature_dim, 
                            activation='relu',
                            kernel_initializer=RandomNormal(stddev=0.01),
                            kernel_regularizer=L2(0.5),
                           ))
    
        model.add(Dense(output_size, 
                        activation="softmax",
                        kernel_initializer=RandomNormal(stddev=0.01),
                        kernel_regularizer=L2(0.5),
                       ))
        self.model = model
        print(model.summary())
    
    def train(self, trainX, trainY, valX, valY, num_epochs=10, b_size=200, optim=None):
        '''
        trains the model with the provided parameters
        '''
        model = self.model
        if optim is None:
            optim = Adam(learning_rate=0.2)
        model.compile(loss="binary_crossentropy", optimizer=optim, metrics=["accuracy"])
        history = model.fit(trainX, trainY, batch_size=b_size, epochs=num_epochs, verbose=1, validation_data=(valX, valY), shuffle=True, validation_batch_size=b_size)
        return history
    
    def test_batch(self, testX):
        '''
        gets predictions for a batch of results.
        '''
        model = self.model
        predictions = model.predict(testX)
        return predictions
    
    def test_single(self, testX):
        '''
        gets prediction for a single instance
        '''
        model = self.model
        predictions = model([np.array(testX).reshape(1,-1)], training=False).numpy()
        return predictions

In [66]:
b_size = 300
num_epochs = 50
model1 = NNModel([64, 32], train_X.shape[1])
moldel1_H = model1.train(train_X, train_Y, val_X, val_Y, num_epochs=num_epochs, b_size=b_size)

Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_49 (Dense)            (None, 64)                208064    
                                                                 
 dense_50 (Dense)            (None, 32)                2080      
                                                                 
 dense_51 (Dense)            (None, 1)                 33        
                                                                 
Total params: 210,177
Trainable params: 210,177
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epo

In [68]:
def accuracy(y, y_hat):
    """
    Citation: taken from lecture notebook
    
    Measure the accuracy of our model, print the results.
    Parameters:
        y (array): true labels
        y_hat (array): model estimates
    Returns:
        None
    """
    count = 0
    for i in range(len(y)):
        guess = 1 if y_hat[i] > 0.5 else 0
        if guess == y[i]:
            count += 1
    print("Accuracy:", round((count / y.shape[0]) * 100, 2))

In [69]:
pred = model1.test_batch(test_X)
accuracy(test_Y, pred.reshape(test_Y.shape))

Accuracy: 41.89


In [70]:
# create summaries for the test set using the trained model.
# original test data

inf_file = prepare_bbc_data.out_test_file
inf_df = pd.read_csv(inf_file)

out_headers = {'article': [],
               'original_summary': [],
               'model_summary': [],
              }

# summaries generated from the model will be written to this file.
out_file = 'NN_results.csv'

out_df = pd.DataFrame(out_headers)

inf_start = time.time()
for i in range(len(inf_df)):
    art_txt = inf_df.at[i, 'article'].strip()
    # from raw article text, create sentences. 
    # prepped sent has the preprocessed sentence while orig_sent has the actual sentence
    orig_sent, prepped_sent = create_inf_sents(art_txt)

    summary = []
    # loops to estimate if the sentence is a highlight or not 
    # If it is a highlight, appends the original sentence to the summary string.
    for j, sent in enumerate(prepped_sent):
        inf_X = featurize_X_from_text(sent, wv_from_text)
        pred = model1.test_single(inf_X)
        if pred[0] > 0.5:
            summary.append(orig_sent[j])
    out_df.at[i, 'article'] = art_txt
    out_df.at[i, 'original_summary'] = inf_df.at[i, 'summary'].strip()
    if len(summary) == 0:
        out_df.at[i, 'model_summary'] = "__BLANK__"
    else:
        out_df.at[i, 'model_summary'] = " ".join(summary)

out_df.to_csv(out_file, index=False)
print("inference completed for the test set\nTime taken: " + str(time.time()-inf_start))

inference completed for the test set
Time taken: 5.384002447128296
