In [1]:
import numpy as np
import os.path
from gensim.models import KeyedVectors
import time
from gensim.models import Word2Vec
import string
import train_embeddings_bbc
import preprocessing_bbc
import prepare_bbc_data
import pandas as pd
from sklearn import svm
from utils import create_inf_sents, featurize_X_from_text, featurize_embed_from_df

In [2]:
# Paths of the cleaned train / test / validation CSVs, as exposed by preprocessing_bbc.
train_file, test_file, val_file = (
    preprocessing_bbc.cleaned_train_f,
    preprocessing_bbc.cleaned_test_f,
    preprocessing_bbc.cleaned_val_f,
)

# Load both embedding artefacts through the train_embeddings_bbc helpers.
# `wv_from_text` is the lookup handed to the featurizers further down;
# `wv_model` is not referenced by any other cell visible in this notebook.
wv_model = train_embeddings_bbc.load_gensim(
    train_embeddings_bbc.model_file
)
wv_from_text = train_embeddings_bbc.load_embeddings(
    train_embeddings_bbc.embedding_file
)

In [3]:
# Read the three cleaned splits into DataFrames (same order: train, test, val).
train_data, test_data, val_data = map(
    pd.read_csv, (train_file, test_file, val_file)
)

In [4]:
# Local alias for train_embeddings_bbc.EMBEDDINGS_SIZE.
# Presumably the dimensionality of a single word vector — TODO confirm against train_embeddings_bbc.
EMBED_SIZE = train_embeddings_bbc.EMBEDDINGS_SIZE

In [5]:
# Local alias for preprocessing_bbc.MAX_THRESH.
# Presumably the maximum number of words kept per sentence — TODO confirm against preprocessing_bbc.
# Each feature row in this notebook is EMBED_SIZE * MAX_WORD_SIZE columns wide.
MAX_WORD_SIZE = preprocessing_bbc.MAX_THRESH

In [6]:
# Pre-allocate zero-filled feature matrices, one row per example and
# EMBED_SIZE * MAX_WORD_SIZE columns, for the train / test / validation splits.
train_X, test_X, val_X = (
    np.zeros((len(split), (EMBED_SIZE * MAX_WORD_SIZE)))
    for split in (train_data, test_data, val_data)
)

# Matching zero-filled label column vectors of shape (n_examples, 1).
train_Y, test_Y, val_Y = (
    np.zeros((len(split), 1))
    for split in (train_data, test_data, val_data)
)

In [7]:
# Sanity check: one shape per line for the train, test and validation feature matrices.
print(train_X.shape, test_X.shape, val_X.shape, sep="\n")

(32722, 3250)
(3872, 3250)
(4725, 3250)


In [8]:
# Run the featurizer over every split with the text-format embeddings.
# The return value is discarded, so the helper presumably fills the
# pre-allocated X / Y arrays in place — verify against utils.featurize_embed_from_df.
for split_df, split_X, split_Y in (
    (train_data, train_X, train_Y),
    (test_data, test_X, test_Y),
    (val_data, val_X, val_Y),
):
    featurize_embed_from_df(split_df, split_X, split_Y, wv_from_text)

In [9]:
# Shapes after featurization, one per line: the three feature matrices
# first, then the three label vectors.
print(
    train_X.shape, test_X.shape, val_X.shape,
    train_Y.shape, test_Y.shape, val_Y.shape,
    sep="\n",
)

(32722, 3250)
(3872, 3250)
(4725, 3250)
(32722, 1)
(3872, 1)
(4725, 1)


In [10]:
# Train a polynomial-kernel SVM on the training split and evaluate it on the test split.
train_start = time.time()
model = svm.SVC(kernel='poly', max_iter=10000)
# Pass the NumPy arrays directly: scikit-learn accepts array-likes, and calling
# .tolist() would first materialise the whole matrix as nested Python lists only
# for scikit-learn to convert it straight back into an array.
# np.ravel turns the (n, 1) label column into the 1-D vector that fit() expects.
model.fit(train_X, np.ravel(train_Y))
train_end = time.time()
print("Training completed\nTime taken: " + str(train_end-train_start))
predictions = model.predict(test_X)
print("Predictions done\nTime taken: " + str(time.time()- train_end))
# Accuracy is the fraction of matching labels. Derive it from the predictions
# already computed instead of model.score(), which would repeat the whole
# (slow) prediction pass over the test set.
score = float(np.mean(predictions == np.ravel(test_Y)))



Training completed
Time taken: 687.4806685447693
Predictions done
Time taken: 74.06600022315979


In [11]:
# Report test accuracy as a percentage rounded to two decimals.
accuracy_pct = round(score * 100, 2)
print(f"\nOverall accuracy: {accuracy_pct} %")


Overall accuracy: 66.63 %


In [14]:
# Extractive-summary inference: for every article in the inference CSV, classify
# each prepared sentence with the trained SVM and keep the original sentences
# predicted as class 1. Results are written to `out_file` as (article, summary) rows.
inf_file = prepare_bbc_data.out_test_file
inf_df = pd.read_csv(inf_file)

# Column layout of the output CSV.
out_headers = {'article': [],
              'summary': []}

# Written relative to the current working directory.
out_file = 'svm_results.csv'

inf_start = time.time()
records = []
for raw_article in inf_df['article']:
    art_txt = raw_article.strip()

    # create_inf_sents returns the original sentences and their prepared
    # counterparts; they are indexed in parallel below.
    orig_sent, prepped_sent = create_inf_sents(art_txt)

    summary = []
    for j, sent in enumerate(prepped_sent):
        inf_X = featurize_X_from_text(sent, wv_from_text)
        pred = model.predict(inf_X)
        if pred[0] == 1:
            summary.append(orig_sent[j])

    # Record the row once per article, outside the sentence loop, so that an
    # article yielding no sentences still gets a row (with an empty summary)
    # and the output stays aligned with inf_df.
    records.append({'article': art_txt, 'summary': " ".join(summary)})

# Build the frame in one go instead of growing it cell by cell.
out_df = pd.DataFrame(records, columns=list(out_headers))

out_df.to_csv(out_file, index=False)
print("inference completed for the test set\nTime taken: " + str(time.time()-inf_start))

inference completed for the test set
Time taken: 76.61219120025635
