In [1]:
#utils check 
import utils_updated
from rnn import processText

#import packages
import string
# import glove
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
import warnings
warnings.filterwarnings("ignore")
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# word2vec
from gensim.models import Word2Vec

# # RNN
# from keras.callbacks import LambdaCallback
# from keras.layers.recurrent import LSTM
# from keras.layers.embeddings import Embedding
# from keras.layers import Dense, Activation
# from keras.models import Sequential

from gensim.summarization import summarize

In [2]:
#load dataset
data = pd.read_csv('../data/papers.csv')
#preprocessing
dataNew = utils_updated.preprocessing(data, dropnan = True)
#check data## I. Data Preprocessing
dataNew.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text,aLen,tLen
0,1,1987,self-organization of associative database and ...,,1-self-organization-of-associative-database-an...,an efficient method of self-organizing associa...,767 self-organization of associative database...,73,7
1,10,1987,a mean field theory of layer iv of visual cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,a single cell theory for the development of se...,683 a mean field theory of layer iv of visual...,91,17
2,100,1988,storing covariance by the associative long-ter...,,100-storing-covariance-by-the-associative-long...,in modeling studies or memory based on neural...,394 storing covariance by the associative lon...,300,15
3,1000,1994,bayesian query construction for neural network...,,1000-bayesian-query-construction-for-neural-ne...,"if data collection is costly, there is much to...",bayesian query construction for neural network...,94,7
4,1001,1994,"neural network ensembles, cross validation, an...",,1001-neural-network-ensembles-cross-validation...,learning of continuous valued functions using ...,"neural network ensembles, cross validation, an...",129,8


In [3]:
#for now we use articles with a length less than 250
data250 = dataNew[dataNew.aLen <= 250]
data250.shape

(4638, 9)

In [4]:
#tokenize data
prep = processText(data250[['title', 'abstract']].values.T)
#get dictionaries of word and tags
prep.getDictionary()
#update sequence length
prep.updateMaxLen()
print('Number of unique words: ', prep.nUnique)
print('Maxmimum sequence length: ', prep.maxLen)

Number of unique words:  32468
Maxmimum sequence length:  250


In [5]:
#get tokenized vector of text
txtTokenized = prep.tokenize()
titles = txtTokenized[0]
abstracts = txtTokenized[1]
print('Example of tokenized title:\n {0} => {1}'.format(titles[0], [prep.idx2word[i] for i in titles[0]]))
print('Example of tokenized abstract:\n {0} => {1}'.format(abstracts[0],[prep.idx2word[i] for i in abstracts[0]]))

Example of tokenized title:
 [3, 4, 5, 6, 7, 8, 9] => ['self-organization', 'of', 'associative', 'database', 'and', 'its', 'applications']
Example of tokenized abstract:
 [42, 466, 64, 4, 580, 5, 5497, 431, 5498, 5499, 51, 9, 19, 321, 5500, 5501, 58, 5498, 5497, 176, 5502, 3251, 503, 51, 309, 5503, 75, 58, 619, 5504, 1743, 4, 5505, 42, 61, 4, 3, 431, 5506, 368, 42, 1019, 4, 5507, 1727, 5508, 10, 289, 4072, 4, 21, 5509, 75, 58, 5510, 5504, 5511, 42, 5512, 19, 187, 1181, 92, 7, 122, 19, 42, 319, 320, 321, 159, 1391, 5513] => ['an', 'efficient', 'method', 'of', 'self-organizing', 'associative', 'databases', 'is', 'proposed', 'together', 'with', 'applications', 'to', 'robot', 'eyesight', 'systems.', 'the', 'proposed', 'databases', 'can', 'associate', 'any', 'input', 'with', 'some', 'output.', 'in', 'the', 'first', 'half', 'part', 'of', 'discussion,', 'an', 'algorithm', 'of', 'self-organization', 'is', 'proposed.', 'from', 'an', 'aspect', 'of', 'hardware,', 'it', 'produces', 'a', 'new', 'st

In [6]:
#split data into train, validation, and test set
trainX, testX, trainY, testY = train_test_split(abstracts, titles, test_size = 0.2 , random_state = 209)
trainX, valX, trainY, valY = train_test_split(trainX, trainY, test_size = 0.1 , random_state = 209)

print('Number of training samples: ', len(trainX))
print('Number of validation samples: ', len(valX))
print('Number of test samples: ', len(testX))

Number of training samples:  3339
Number of validation samples:  371
Number of test samples:  928


## GENSIM - Summarization 

In [10]:
# Create a nested list of tokenized word in each abstract in the training set
X_train_list_word = []
for i in range(len(trainX)):
    X_train_list_word.append([prep.idx2word[word] for word in trainX[i]])

In [63]:
# Create a nested list of tokenized word in each abstract in the training set
X_test_list_word = []
for i in range(len(testX)):
    X_test_list_word.append([prep.idx2word[`word] for word in testX[i]])

In [69]:
# Use gensim.summarize to generate "summary" of abstracts as a proxy of title
gen_summary = []
for i, abstract in enumerate(X_train_list_word):
    print(i)
    temp_abs = ' '.join(abstract)
    word_count = 1
    
    try:
        temp_sum = summarize(temp_abs, ratio=None, word_count = word_count)

    except:
        temp_sum = []
        gen_summary.append(temp_sum)
        continue
        
    while temp_sum == '':
        word_count += 1 # keep adding "word_count" until a summary is produced
        temp_sum = summarize(temp_abs, ratio=None, word_count = word_count)
        if word_count >= len(temp_abs):
            temp_sum = []
            gen_summary.append(temp_sum)
            continue
    
    gen_summary.append(temp_sum)
        
## Test-set
gen_summary_test = []
for i, abstract in enumerate(X_test_list_word):
    print(i)
    temp_abs = ' '.join(abstract)
    word_count = 1
    
    try:
        temp_sum = summarize(temp_abs, ratio=None, word_count = word_count)

    except:
        temp_sum = []
        gen_summary.append(temp_sum)
        continue
        
    while temp_sum == '':
        word_count += 1
        temp_sum = summarize(temp_abs, ratio=None, word_count = word_count)
        if word_count >= len(temp_abs):
            temp_sum = []
            gen_summary.append(temp_sum)
            continue
    
    gen_summary_test.append(temp_sum)
        

In [62]:
import pickle
pickle_path = 'baseline-gensim/gen_training'
with open(pickle_path,'wb') as fp:
    pickle.dump(gen_summary, fp)

pickle_path = 'baseline-gensim/gen_test'
with open(pickle_path,'wb') as fp:
    pickle.dump(gen_summary_test, fp)

In [68]:
np.sum(list(map(lambda x: x == [], gen_summary)))/len(gen_summary)

0.07132585562266322

In [65]:
np.sum(list(map(lambda x: x == [], gen_summary_test)))/len(gen_summary_test)

0.0390032502708559