In [1]:
import pandas as pd
import numpy as np
import nltk
import spacy
import gensim
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from nltk import word_tokenize
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Activation, TimeDistributed, Dense, RepeatVector, LSTM, Flatten, Conv1D, Dropout, Conv2D

Using TensorFlow backend.


In [2]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
# get the data
author_text = []
paragraphs = []
for fileid in nltk.corpus.gutenberg.fileids():
    work = nltk.corpus.gutenberg.paras(fileid)
    for paragraph in work:
        author_text.append(fileid.split('.')[0])
        sentences = []
        for sentence in paragraph:
            sentences += sentence
        paragraphs.append(sentences)

In [0]:
print('Number of paragraphs: %d' %len(paragraphs))

Number of paragraphs: 47887


In [0]:
paragraphs[100]

['"',
 'Six',
 'years',
 'hence',
 '!',
 'Dear',
 'Miss',
 'Woodhouse',
 ',',
 'he',
 'would',
 'be',
 'thirty',
 'years',
 'old',
 '!"']

In [0]:
# concatenate list of words in to a sentence
paragraphs_concatenated = [' '.join(sentence) for sentence in paragraphs]

In [0]:
# simple preprocess by gensim (include lowercase, lemmatize, stem and tokenize, punctuation remove....)
paragraphs = [gensim.utils.simple_preprocess(sen) for sen in paragraphs_concatenated]

In [0]:
paragraphs[100]

['six',
 'years',
 'hence',
 'dear',
 'miss',
 'woodhouse',
 'he',
 'would',
 'be',
 'thirty',
 'years',
 'old']

In [0]:
print('Total number of tokens: %d'%np.sum([len(i) for i in paragraphs]))

Total number of tokens: 2053253


In [0]:
print('Average paragraph length: %d'%np.average([len(i) for i in paragraphs]))

Average paragraph length: 42


In [0]:
# describe on the length of paragraph
pd.Series([len(i) for i in paragraphs]).describe()

count    47887.000000
mean        42.877044
std        113.336532
min          0.000000
25%         16.000000
50%         27.000000
75%         47.000000
max       8783.000000
dtype: float64

In [0]:
# 95% quantile of length
pd.Series([len(i) for i in paragraphs]).quantile(0.95)

127.0

In [0]:
# pick length = 130
MAX_LEN = 130

In [0]:
# create Word2Vec object
# size: Dimensionality of the word vectors.
# window: Maximum distance between the current and predicted word within a sentence.
# min_count: Ignores all words with total frequency lower than this.
# sg: Training algorithm: 1 for skip-gram; otherwise CBOW.
# hs: If 1, hierarchical softmax will be used for model training. If 0, and `negative` is non-zero, negative sampling will be used.
# negative: If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
#           should be drawn (usually between 5-20).
#           If set to 0, no negative sampling is used.
# seed: Seed for the random number generator.
# compute_loss: If True, computes and stores loss value which can be retrieved using
#               :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`.
# callbacks: Sequence of callbacks to be executed at specific stages during training.

model = gensim.models.Word2Vec(paragraphs, size=150, window=5, min_count=1, seed=0, compute_loss=True)

In [9]:
model.train(paragraphs, total_examples=len(paragraphs), epochs=10)

(15186447, 20532530)

In [0]:
print('Latest training loss: %.2f'%model.get_latest_training_loss())

Latest training loss: 0.00


In [0]:
model.wv.most_similar('king', topn=6)

  if np.issubdtype(vec.dtype, np.int):


[('governor', 0.5235145688056946),
 ('esther', 0.5107651352882385),
 ('haman', 0.5050639510154724),
 ('jehoshaphat', 0.5002504587173462),
 ('david', 0.4888436198234558),
 ('solomon', 0.48661303520202637)]

In [0]:
paragraphs_embedding = []

for paragraph in paragraphs:
  paragraph_embedding = []
  for word in paragraph:
    word_embedding = model.wv.get_vector(word)
    paragraph_embedding.append(word_embedding)
  paragraphs_embedding.append(paragraph_embedding)

In [0]:
# trim and pad embeddign paragraph

paragraphs_embedding = sequence.pad_sequences(paragraphs_embedding, maxlen=MAX_LEN, padding='post', truncating='post', value=0.0)

In [0]:
# shape after padding & trimming
paragraphs_embedding.shape

(47887, 130, 150)

In [0]:
del paragraphs

In [0]:
# transform label data into vector
# factorize labels and keep a save record for later reference
factorize = pd.factorize(author_text)                                            
author_text = np_utils.to_categorical(factorize[0], len(nltk.corpus.gutenberg.fileids()))         

In [0]:
author_text.shape

(47887, 18)

In [0]:
# split train test
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(paragraphs_embedding, author_text, test_size=0.2, random_state=42)

In [0]:

# CONVOLUTIONAL NEURAL NETWORK

cnn = Sequential()
cnn.add(Conv1D(150, kernel_size=3, strides=1, padding='same', activation='relu', input_shape=(paragraphs_embedding.shape[1], paragraphs_embedding.shape[2])))
cnn.add(Conv1D(100, kernel_size=3, strides=1, padding='same', activation='relu'))
cnn.add(Flatten())
cnn.add(Dropout(0.5,seed=0))
cnn.add(Dense(150))
cnn.add(Dense(len(nltk.corpus.gutenberg.fileids())))
cnn.add(Activation('softmax'))

cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 130, 150)          67650     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 130, 100)          45100     
_________________________________________________________________
flatten_1 (Flatten)          (None, 13000)             0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 13000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 150)               1950150   
_________________________________________________________________
dense_2 (Dense)              (None, 18)                2718      
_________________________________________________________________
activation_1 (Activation)    (None, 18)                0         
Total para

In [0]:
cnn.fit(X_train, Y_train, batch_size = 128, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f9a3b132da0>

In [0]:
# predict on test dataset
predict_label_cnn = cnn.predict(X_test, batch_size = 128, verbose = 1)



In [0]:
from sklearn.metrics import classification_report
# turn the predict vector to final predict base on argmax
predict_label_cnn = [np.argmax(i) for i in predict_label_cnn]
# turn the test labels vector to final label base on argmax
Y_test_cnn = [np.argmax(i) for i in Y_test]
# classification report
print(classification_report(Y_test_cnn, predict_label_cnn))

              precision    recall  f1-score   support

           0       0.74      0.75      0.75       480
           1       0.83      0.49      0.61       211
           2       0.70      0.65      0.68       371
           3       0.94      0.99      0.97      4880
           4       0.32      0.11      0.16        66
           5       0.68      0.30      0.41       269
           6       0.98      0.62      0.76        68
           7       0.72      0.72      0.72       174
           8       0.83      0.50      0.63       330
           9       0.52      0.39      0.45       232
          10       0.50      0.59      0.54       232
          11       0.58      0.78      0.66       722
          12       0.73      0.68      0.70       550
          13       0.00      0.00      0.00         8
          14       0.69      0.49      0.57       165
          15       0.66      0.62      0.64       202
          16       0.27      0.39      0.32       117
          17       0.63    

  'precision', 'predicted', average, warn_for)


In [15]:
# STACKED LSTM

HIDDEN_SIZE = 256

stacked_lstm = Sequential()
stacked_lstm.add(LSTM(HIDDEN_SIZE, input_shape=(paragraphs_embedding.shape[1], paragraphs_embedding.shape[2])))
stacked_lstm.add(RepeatVector(MAX_LEN))
stacked_lstm.add(LSTM(HIDDEN_SIZE, return_sequences=True))
stacked_lstm.add(LSTM(HIDDEN_SIZE, return_sequences=True))

stacked_lstm.add(TimeDistributed(Dense(len(nltk.corpus.gutenberg.fileids()))))
stacked_lstm.add(Flatten())
stacked_lstm.add(Dense(len(nltk.corpus.gutenberg.fileids())))
stacked_lstm.add(Activation('softmax'))

stacked_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

stacked_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 256)               416768    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 130, 256)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 130, 256)          525312    
_________________________________________________________________
lstm_3 (LSTM)                (None, 130, 256)          525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 130, 18)           4626      
_________________________________________________________________
flatten_1 (Flatten)          (None, 2340)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 18)                42138     
__________

In [16]:
stacked_lstm.fit(X_train, Y_train, batch_size = 512, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fe6bea8ff60>

In [17]:
# predict on test dataset
predict_label_lstm = stacked_lstm.predict(X_test, batch_size = 512, verbose = 1)



In [18]:
from sklearn.metrics import classification_report
# turn the predict vector to final predict base on argmax
predict_label_lstm = [np.argmax(i) for i in predict_label_lstm]
# turn the test labels vector to final label base on argmax
Y_test_lstm = [np.argmax(i) for i in Y_test]
# classification report
print(classification_report(Y_test_lstm, predict_label_lstm))

              precision    recall  f1-score   support

           0       0.83      0.76      0.79       480
           1       0.92      0.60      0.72       211
           2       0.84      0.66      0.74       371
           3       0.96      0.99      0.97      4880
           4       0.20      0.06      0.09        66
           5       0.46      0.38      0.42       269
           6       0.84      0.68      0.75        68
           7       0.90      0.47      0.62       174
           8       0.80      0.48      0.60       330
           9       0.52      0.51      0.51       232
          10       0.52      0.59      0.55       232
          11       0.61      0.76      0.68       722
          12       0.71      0.65      0.68       550
          13       0.00      0.00      0.00         8
          14       0.34      0.72      0.46       165
          15       0.36      0.06      0.11       202
          16       0.27      0.05      0.09       117
          17       0.50    

  'precision', 'predicted', average, warn_for)
