In [1]:
import numpy as np
import os.path
from gensim.models import KeyedVectors
import time
import string
import train_embeddings_bbc
import preprocessing_bbc
import prepare_bbc_data
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.initializers import RandomNormal
from keras.regularizers import L2
from utils import create_inf_sents, featurize_X_from_text, featurize_embed_from_df

In [2]:
# load preprocessed files
train_file = preprocessing_bbc.cleaned_train_f
test_file = preprocessing_bbc.cleaned_test_f
val_file = preprocessing_bbc.cleaned_val_f

wv_from_text = train_embeddings_bbc.load_embeddings(train_embeddings_bbc.embedding_file)

train_data = pd.read_csv(train_file)
test_data = pd.read_csv(test_file)
val_data = pd.read_csv(val_file)

In [3]:
EMBED_SIZE = train_embeddings_bbc.EMBEDDINGS_SIZE

MAX_WORD_SIZE = preprocessing_bbc.MAX_THRESH

In [4]:
# initialize features with zeros
train_X = np.zeros((len(train_data), (EMBED_SIZE * MAX_WORD_SIZE)))
test_X = np.zeros((len(test_data), (EMBED_SIZE * MAX_WORD_SIZE)))
val_X = np.zeros((len(val_data), (EMBED_SIZE * MAX_WORD_SIZE)))

train_Y = np.zeros((len(train_data), 1))
test_Y = np.zeros((len(test_data), 1))
val_Y = np.zeros((len(val_data), 1))

In [5]:
# fill the features based on the learned embeddings
featurize_embed_from_df(train_data, train_X, train_Y, wv_from_text)
featurize_embed_from_df(test_data, test_X, test_Y, wv_from_text)
featurize_embed_from_df(val_data, val_X, val_Y, wv_from_text)

In [6]:
print("Shape of train X:")
print(train_X.shape)
print("Shape of train Y:")
print(train_Y.shape)

print("Shape of test X:")
print(test_X.shape)
print("Shape of test Y:")
print(test_Y.shape)

print("Shape of val X:")
print(val_X.shape)
print("Shape of val Y:")
print(val_Y.shape)

Shape of train X:
(32722, 3250)
Shape of train Y:
(32722, 1)
Shape of test X:
(3872, 3250)
Shape of test Y:
(3872, 1)
Shape of val X:
(4725, 3250)
Shape of val Y:
(4725, 1)


In [7]:
# Define the model architecture using Keras Sequential API
class NNModel:
    '''
    Initizializes a model with number of hidden layers as specified in the hidden_dim list
    with activation functions as softmax at the final layer and relu for other layers.
    
    Uses Adam optimizer and binary cross entropy loss.
    '''
    def __init__(self, hidden_dim, feature_dim):
        '''
        Parameters:
            hidden_dim (list): each item specified the number of nodes in each hidden layer
            feature_dim (int): number of features in a sample
        '''
        
        if feature_dim <= 0 or feature_dim is None:
            raise Exception("feature dim mut be specified.")
        output_size = 1
        model = Sequential()
        for dim in hidden_dim:
            model.add(Dense(dim, input_dim=feature_dim, 
                            activation='relu',
                            kernel_initializer=RandomNormal(stddev=0.01),
                            kernel_regularizer=L2(0.5),
                           ))
    
        model.add(Dense(output_size, 
                        activation="softmax",
                        kernel_initializer=RandomNormal(stddev=0.01),
                        kernel_regularizer=L2(0.5),
                       ))
        self.model = model
        print(model.summary())
    
    def train(self, trainX, trainY, valX, valY, num_epochs=10, b_size=200, optim=None):
        '''
        trains the model with the provided parameters
        '''
        model = self.model
        if optim is None:
            optim = Adam(learning_rate=0.2)
        model.compile(loss="binary_crossentropy", optimizer=optim, metrics=["accuracy"])
        history = model.fit(trainX, trainY, batch_size=b_size, epochs=num_epochs, verbose=1, validation_data=(valX, valY), shuffle=True, validation_batch_size=b_size)
        return history
    
    def test_batch(self, testX):
        '''
        gets predictions for a batch of results.
        '''
        model = self.model
        predictions = model.predict(testX)
        return predictions
    
    def test_single(self, testX):
        '''
        gets prediction for a single instance
        '''
        model = self.model
        predictions = model([np.array(testX).reshape(1,-1)], training=False).numpy()
        return predictions

In [8]:
# variables to tune hyper parameters
best_model = None
best_val_avg_acc = -1
best_val_avg_loss = 999999999

In [9]:
# hyper parameters 1
b_size = 300
num_epochs = 50
model1 = NNModel([64, 32], train_X.shape[1])
moldel1_H = model1.train(train_X, train_Y, val_X, val_Y, num_epochs=num_epochs, b_size=b_size)

avg_acc = sum(moldel1_H.history['val_accuracy']) / len(moldel1_H.history['val_accuracy'])
avg_loss = sum(moldel1_H.history['val_loss']) / len(moldel1_H.history['val_loss'])

if avg_acc > best_val_avg_acc:
    print("selected the trained model as best model")
    print("average val accuracy: " + str(avg_acc))
    print("average val loss: " + str(avg_loss))
    best_model = model1
    best_val_avg_acc = avg_acc
    best_val_avg_loss = avg_loss
elif avg_acc == best_val_avg_acc:
    if avg_loss < best_val_avg_loss:
        print("selected the trained model as best model")
        print("average val accuracy: " + str(avg_acc))
        print("average val loss: " + str(avg_loss))
        best_model = model1
        best_val_avg_acc = avg_acc
        best_val_avg_loss = avg_loss

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                208064    
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 1)                 33        
                                                                 
Total params: 210,177
Trainable params: 210,177
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 

In [10]:
# hyper parameters 2
b_size = 100
num_epochs = 100
model1 = NNModel([128, 32], train_X.shape[1])
moldel1_H = model1.train(train_X, train_Y, val_X, val_Y, num_epochs=num_epochs, b_size=b_size)

avg_acc = sum(moldel1_H.history['val_accuracy']) / len(moldel1_H.history['val_accuracy'])
avg_loss = sum(moldel1_H.history['val_loss']) / len(moldel1_H.history['val_loss'])

if avg_acc > best_val_avg_acc:
    print("selected the trained model as best model")
    print("average val accuracy: " + str(avg_acc))
    print("average val loss: " + str(avg_loss))
    best_model = model1
    best_val_avg_acc = avg_acc
    best_val_avg_loss = avg_loss
elif avg_acc == best_val_avg_acc:
    if avg_loss < best_val_avg_loss:
        print("selected the trained model as best model")
        print("average val accuracy: " + str(avg_acc))
        print("average val loss: " + str(avg_loss))
        best_model = model1
        best_val_avg_acc = avg_acc
        best_val_avg_loss = avg_loss

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 128)               416128    
                                                                 
 dense_4 (Dense)             (None, 32)                4128      
                                                                 
 dense_5 (Dense)             (None, 1)                 33        
                                                                 
Total params: 420,289
Trainable params: 420,289
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoc

Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [11]:
# hyper parameters 3
b_size = 300
num_epochs = 50
model1 = NNModel([64, 32], train_X.shape[1])
moldel1_H = model1.train(train_X, train_Y, val_X, val_Y, num_epochs=num_epochs, b_size=b_size, optim=Adam(learning_rate=0.01))

avg_acc = sum(moldel1_H.history['val_accuracy']) / len(moldel1_H.history['val_accuracy'])
avg_loss = sum(moldel1_H.history['val_loss']) / len(moldel1_H.history['val_loss'])

if avg_acc > best_val_avg_acc:
    print("selected the trained model as best model")
    print("average val accuracy: " + str(avg_acc))
    print("average val loss: " + str(avg_loss))
    best_model = model1
    best_val_avg_acc = avg_acc
    best_val_avg_loss = avg_loss
elif avg_acc == best_val_avg_acc:
    if avg_loss < best_val_avg_loss:
        print("selected the trained model as best model")
        print("average val accuracy: " + str(avg_acc))
        print("average val loss: " + str(avg_loss))
        best_model = model1
        best_val_avg_acc = avg_acc
        best_val_avg_loss = avg_loss

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 64)                208064    
                                                                 
 dense_7 (Dense)             (None, 32)                2080      
                                                                 
 dense_8 (Dense)             (None, 1)                 33        
                                                                 
Total params: 210,177
Trainable params: 210,177
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoc

In [12]:
def accuracy(y, y_hat):
    """
    Citation: taken from lecture notebook
    
    Measure the accuracy of our model, print the results.
    Parameters:
        y (array): true labels
        y_hat (array): model estimates
    Returns:
        None
    """
    count = 0
    for i in range(len(y)):
        guess = 1 if y_hat[i] > 0.5 else 0
        if guess == y[i]:
            count += 1
    print("Accuracy:", round((count / y.shape[0]) * 100, 2))

In [13]:
pred = best_model.test_batch(test_X)
accuracy(test_Y, pred.reshape(test_Y.shape))

Accuracy: 41.89


In [14]:
# create summaries for the test set using the trained model.
# original test data

inf_file = prepare_bbc_data.out_test_file
inf_df = pd.read_csv(inf_file)

out_headers = {'article': [],
               'original_summary': [],
               'model_summary': [],
              }

# summaries generated from the model will be written to this file.
out_file = 'NN_results.csv'

out_df = pd.DataFrame(out_headers)

inf_start = time.time()
for i in range(len(inf_df)):
    art_txt = inf_df.at[i, 'article'].strip()
    # from raw article text, create sentences. 
    # prepped sent has the preprocessed sentence while orig_sent has the actual sentence
    orig_sent, prepped_sent = create_inf_sents(art_txt)

    summary = []
    # loops to estimate if the sentence is a highlight or not 
    # If it is a highlight, appends the original sentence to the summary string.
    for j, sent in enumerate(prepped_sent):
        inf_X = featurize_X_from_text(sent, wv_from_text)
        pred = best_model.test_single(inf_X)
        if pred[0] > 0.5:
            summary.append(orig_sent[j])
    out_df.at[i, 'article'] = art_txt
    out_df.at[i, 'original_summary'] = inf_df.at[i, 'summary'].strip()
    if len(summary) == 0:
        out_df.at[i, 'model_summary'] = "__BLANK__"
    else:
        out_df.at[i, 'model_summary'] = " ".join(summary)

out_df.to_csv(out_file, index=False)
print("inference completed for the test set\nTime taken: " + str(time.time()-inf_start))

inference completed for the test set
Time taken: 4.929999589920044
