In [33]:
import numpy as np
import os.path
from gensim.models import KeyedVectors
import time
import string
import train_embeddings_bbc
import preprocessing_bbc
import prepare_bbc_data
import pandas as pd
from sklearn.linear_model import LogisticRegression
from utils import create_inf_sents, featurize_X_from_text, featurize_embed_from_df

In [34]:
# Paths of the cleaned train / test / validation CSVs, as exposed by the
# preprocessing module.
train_file, test_file, val_file = (
    preprocessing_bbc.cleaned_train_f,
    preprocessing_bbc.cleaned_test_f,
    preprocessing_bbc.cleaned_val_f,
)

# Embeddings object that is later handed to the featurize helpers.
wv_from_text = train_embeddings_bbc.load_embeddings(
    train_embeddings_bbc.embedding_file
)

# Read every split into its own DataFrame (train, test, val - in that order).
train_data, test_data, val_data = (
    pd.read_csv(csv_path) for csv_path in (train_file, test_file, val_file)
)

In [35]:
# Sizes exported by the project modules. Their product
# (EMBED_SIZE * MAX_WORD_SIZE) is the number of columns of every feature
# matrix allocated in this notebook.
# NOTE(review): presumably the per-word embedding dimensionality and the
# maximum number of words kept per sentence - confirm in the source modules.
MAX_WORD_SIZE = preprocessing_bbc.MAX_THRESH
EMBED_SIZE = train_embeddings_bbc.EMBEDDINGS_SIZE

In [36]:
# Allocate zero-filled feature matrices: one row per DataFrame row and
# EMBED_SIZE * MAX_WORD_SIZE columns, for each of the three splits.
train_X, test_X, val_X = (
    np.zeros((len(split), EMBED_SIZE * MAX_WORD_SIZE))
    for split in (train_data, test_data, val_data)
)

# Matching zero-filled label columns of shape (rows, 1).
train_Y, test_Y, val_Y = (
    np.zeros((len(split), 1))
    for split in (train_data, test_data, val_data)
)

In [37]:
# Populate the pre-allocated feature / label arrays of each split from the
# learned embeddings. Splits are processed in the order train, test, val.
for split_df, split_X, split_Y in (
    (train_data, train_X, train_Y),
    (test_data, test_X, test_Y),
    (val_data, val_X, val_Y),
):
    featurize_embed_from_df(split_df, split_X, split_Y, wv_from_text)

In [38]:
# Validation data is not used for hyperparameter tuning with linear models,
# so its rows are appended to the training set instead.
#
# Guard: train_X was allocated with len(train_data) rows, so the stacking is
# only performed while it still has that row count. Without the guard,
# re-running this cell would append the validation rows a second time.
if len(train_X) == len(train_data):
    # np.vstack replaces np.row_stack, an alias that is deprecated as of
    # NumPy 2.0; both stack the arrays row-wise.
    train_X = np.vstack((train_X, val_X))
    train_Y = np.vstack((train_Y, val_Y))

print("Shape of train X:")
print(train_X.shape)
print("Shape of train Y:")
print(train_Y.shape)

print("Shape of test X:")
print(test_X.shape)
print("Shape of test Y:")
print(test_Y.shape)

Shape of train X:
(37447, 3250)
Shape of train Y:
(37447, 1)
Shape of test X:
(3872, 3250)
Shape of test Y:
(3872, 1)


In [39]:
# Fit a logistic-regression classifier on the training features and evaluate
# it on the test split.
#
# The NumPy arrays are passed to scikit-learn directly: calling .tolist() on
# them first builds a very large nested Python list that the estimator
# immediately converts back into an array, costing time and memory for no
# benefit.
# The labels are stored as (rows, 1) columns, so they are flattened to 1-D
# with np.ravel before being handed to fit() / score().
logisticRegr = LogisticRegression(max_iter=10000)
logisticRegr.fit(train_X, np.ravel(train_Y))
predictions = logisticRegr.predict(test_X)
score = logisticRegr.score(test_X, np.ravel(test_Y))

In [40]:
# Report the test-set score as a percentage rounded to two decimals.
# `!s` applies str() to the rounded value, exactly as explicit concatenation would.
print(f"\nOverall accuracy: {round(score * 100, 2)!s} %")


Overall accuracy: 63.3 %


In [43]:
# create summaries for the test set using the trained model.

# original test data
inf_file = prepare_bbc_data.out_test_file
inf_df = pd.read_csv(inf_file)

# column layout (and column order) of the results file
out_headers = {'article': [],
               'original_summary': [],
               'model_summary': [],
              }

# summaries generated from the model will be written to this file.
out_file = 'logr_results.csv'

# One dict per article. The rows are collected in a plain list and turned into
# a DataFrame once after the loop, instead of enlarging a DataFrame cell by
# cell with .at (slow, and prone to dtype-upcast warnings on an empty frame).
out_rows = []

inf_start = time.time()
for i in range(len(inf_df)):
    art_txt = inf_df.at[i, 'article'].strip()
    # from raw article text, create sentences.
    # prepped_sent has the preprocessed sentences while orig_sent has the actual sentences
    orig_sent, prepped_sent = create_inf_sents(art_txt)

    summary = []
    # estimate for every sentence whether it is a highlight or not.
    # If it is a highlight, keep the original (un-preprocessed) sentence.
    for j, sent in enumerate(prepped_sent):
        inf_X = featurize_X_from_text(sent, wv_from_text)
        pred = logisticRegr.predict(inf_X)
        if pred[0] == 1:
            summary.append(orig_sent[j])

    out_rows.append({
        'article': art_txt,
        'original_summary': inf_df.at[i, 'summary'].strip(),
        # "__BLANK__" marks articles for which no sentence was selected.
        'model_summary': " ".join(summary) if summary else "__BLANK__",
    })

# Passing the columns explicitly keeps the header row (and its order) even
# when there are no articles at all.
out_df = pd.DataFrame(out_rows, columns=list(out_headers))

out_df.to_csv(out_file, index=False)
print("inference completed for the test set\nTime taken: " + str(time.time()-inf_start))

inference completed for the test set
Time taken: 1.0380022525787354
