In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

%matplotlib inline

import re
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

import time
from numba import jit, njit

from pprint import pprint
from rouge import Rouge

from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, Bidirectional
from keras.optimizers import Adam, Nadam
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [25]:
import tensorflow
np.__version__

'1.20.2'

In [2]:
# Read the CSV at ../data/wikihowAll_cleaned.csv; nrows=1000 limits the load to the first 1000 rows.
df = pd.read_csv('../data/wikihowAll_cleaned.csv', delimiter=',', nrows=1000)

In [3]:
df.head()

Unnamed: 0,headline,text
0,keep related supplies in the same area make an...,photographer keep necessary lens cord battery ...
1,create a sketch in the neopoprealist manner of...,see image draw develops stepbystep however imp...
2,get a bachelors degree enroll in a studiobased...,possible become vfx artist without college deg...
3,start with some experience or interest in art ...,best art investor research piece art buy someo...
4,keep your reference materials sketches article...,start planning project work shall likely gathe...


Розділимо датасет на тренувальний, тестувальний і валідаційний

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
# Hold out 15% of the rows as the test set; the fixed random_state keeps the shuffle reproducible.
train, test = train_test_split(df, test_size=0.15, random_state=42, shuffle=True)

In [6]:
# Carve the validation set out of what is left after the test split:
# 0.1765 of the remaining 85% is roughly 15% of the full dataset.
# NOTE(review): this rebinds `train`, so running this cell a second time without
# re-running the train/test split first shrinks the training set again.
train, val = train_test_split(train, test_size=0.1765, random_state=42, shuffle=True)

In [7]:
train.shape

(699, 2)

In [8]:
test.shape

(150, 2)

In [9]:
val.shape

(151, 2)

In [10]:
def calc_vocab(texts):
    """Count the distinct whitespace-separated tokens across all texts.

    Tokens are compared exactly as produced by ``str.split()``: the count is
    case-sensitive and punctuation attached to a word is part of the token.

    Args:
        texts: iterable of strings.

    Returns:
        int: number of unique tokens; 0 for an empty iterable.
    """
    # A set gives O(1) membership checks; the previous list-based lookup was
    # quadratic in the vocabulary size.
    unique_words = set()
    for text in texts:
        unique_words.update(text.split())
    return len(unique_words)

In [11]:
# Vocabulary size is measured on the training texts only (validation/test rows are not counted).
vocab = calc_vocab(train['text'])

In [12]:
vocab

10881

In [13]:
# Model / training hyperparameters shared by the cells below.
maxlen = 400  # sequence length: padding target, encoder input length, decoder repeat count
embed_dim = 220  # output dimension of the Embedding layer
batch_size = 32  # mini-batch size intended for model training
latent_dim = 200  # number of units in every LSTM layer

In [14]:
# Fit the tokenizer on the training texts only, so the word index is not
# influenced by validation or test data.
tokenizer = Tokenizer(num_words=vocab)
tokenizer.fit_on_texts(train['text'])
# Convert each training text to a list of token ids and bring every sequence
# to length `maxlen` (pad_sequences' default padding/truncating mode is used).
seqs = tokenizer.texts_to_sequences(train['text'])
pad_seqs = pad_sequences(seqs, maxlen)

# Same conversion for the validation split, reusing the tokenizer fitted above.
seqs_val = tokenizer.texts_to_sequences(val['text'])
pad_seqs_val = pad_sequences(seqs_val, maxlen)

In [15]:
# Encoder: token ids -> embeddings -> three stacked LSTMs -> one fixed-size vector.
encoder_inputs = Input(shape=(maxlen,))
emb_layer = Embedding(vocab, embed_dim, input_length = maxlen)
x = emb_layer(encoder_inputs)
# The first two LSTMs return the full sequence so the next LSTM can consume it.
# NOTE(review): activation='relu' overrides the LSTM's default activation — confirm this is
# intentional, since training stability may differ from the default.
encoder_lstm_1 = LSTM(latent_dim, activation='relu', return_sequences=True, dropout=0.2, recurrent_dropout=0.1)(x)
encoder_lstm_2 = LSTM(latent_dim, activation='relu', return_sequences=True, dropout=0.3)(encoder_lstm_1)
# The last LSTM does not set return_sequences, so it yields a single (batch, latent_dim)
# output, which serves as the encoded representation of the whole text.
state_h = LSTM(latent_dim, activation='relu')(encoder_lstm_2)
# Wrap the encoder as its own Model, then call it on the same input tensor so it
# appears as a single nested layer in the full model.
encoder_model = Model(inputs=encoder_inputs, outputs=state_h)
seq2seq_encoder_out = encoder_model(encoder_inputs)

In [16]:
# Decoder: repeat the encoded vector `maxlen` times so the LSTM receives it at every timestep.
decoded = RepeatVector(maxlen)(seq2seq_encoder_out)
decoder_lstm = LSTM(latent_dim, return_sequences=True)
decoder_lstm_output = decoder_lstm(decoded)
# Softmax over the vocabulary, producing one distribution per timestep.
decoder_dense = Dense(vocab, activation='softmax')  # no explicit TimeDistributed wrapper is used
decoder_outputs = decoder_dense(decoder_lstm_output)

In [17]:
# Full model: padded token ids in, per-timestep vocabulary distributions out.
seq2seq_Model = Model(encoder_inputs, decoder_outputs)
seq2seq_Model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 400)               0         
_________________________________________________________________
model_1 (Model)              (None, 200)               3372220   
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 400, 200)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 400, 200)          320800    
_________________________________________________________________
dense_1 (Dense)              (None, 400, 10881)        2187081   
Total params: 5,880,101
Trainable params: 5,880,101
Non-trainable params: 0
_________________________________________________________________


In [18]:
# The targets passed to fit() are integer token ids, hence the sparse
# categorical cross-entropy loss.
seq2seq_Model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop')

# Stop training when the validation loss stops decreasing (patience of 2 epochs).
es = EarlyStopping(monitor='val_loss', mode='min', patience=2, verbose=1)

# Write the model to 'best_model.h5' only when the validation loss reaches a new minimum.
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)

In [19]:
# Train the model to reproduce its own input: the padded token ids are both the
# input and, with a trailing axis added for the sparse loss, the target.
# batch_size comes from the hyperparameter cell instead of a duplicated literal.
history = seq2seq_Model.fit(
    pad_seqs,
    np.expand_dims(pad_seqs, -1),
    batch_size=batch_size,
    callbacks=[es, mc],
    epochs=15,
    validation_data=(pad_seqs_val, np.expand_dims(pad_seqs_val, -1)),
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 699 samples, validate on 151 samples
Epoch 1/15
 32/699 [>.............................] - ETA: 12:09 - loss: 9.2946

KeyboardInterrupt: 

In [None]:
from matplotlib import pyplot

# Learning curves: training loss vs. validation loss per epoch.
pyplot.figure(figsize=(11, 8))
pyplot.plot(history.history['loss'], label='train', color='blue')
# 'val_loss' is computed on the validation_data passed to fit(), not on the
# test split, so it is labelled accordingly.
pyplot.plot(history.history['val_loss'], label='validation', color='orange')
pyplot.xlabel('epoch')
pyplot.ylabel('loss')
pyplot.title('Training vs. validation loss')

pyplot.legend()
pyplot.show()

In [None]:
def get_scores_rouge(text, prediction):
    """Compute ROUGE scores of `prediction` measured against `text`.

    Args:
        text: reference string.
        prediction: hypothesis string to evaluate.

    Returns:
        The result of ``Rouge().get_scores(prediction, text)``; callers in this
        notebook index it as ``scores[0][rouge_name][metric]``.
    """
    return Rouge().get_scores(prediction, text)

In [None]:
def get_predicted_text(text, maxlen=maxlen):
    """Run `text` through the trained model and decode its output to words.

    Args:
        text: a single input string.
        maxlen: length the tokenized input is padded/truncated to; must match
            the sequence length the model was built with.

    Returns:
        str: the words corresponding to the most probable token at each output
        timestep, joined by single spaces. May be an empty string if no
        predicted id maps to a word (presumably e.g. all-padding output —
        verify against the tokenizer's behavior).
    """
    seqs = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seqs, maxlen)
    # Model output is one vocabulary distribution per timestep: (1, maxlen, vocab).
    probs = seq2seq_Model.predict(padded)
    # Decode the model's prediction. Previously the padded *input* was decoded
    # and the prediction was discarded, so the function just echoed its input.
    token_ids = np.argmax(probs, axis=-1)
    decoded = tokenizer.sequences_to_texts(token_ids.tolist())
    return ' '.join(decoded)

In [None]:
pred = get_predicted_text(list(test['text'])[0])

In [None]:
pred

In [None]:
list(test['text'])[0]

In [None]:
pprint(get_scores_rouge(list(test['text'])[0], pred))

In [None]:
pred = get_predicted_text(list(train['text'])[0])

In [None]:
pred

In [None]:
list(train['text'])[0]

In [None]:
pprint(get_scores_rouge(list(train['text'])[0], pred))

In [None]:
def rouge_min(texts, rouge, metric):
    """Return the lowest ROUGE value obtained over `texts`.

    Each text is passed through `get_predicted_text` and the prediction is
    scored against that same text with `get_scores_rouge`.

    Args:
        texts: iterable of input strings.
        rouge: ROUGE variant key in the score dict, e.g. 'rouge-l'.
        metric: metric key within that variant, e.g. 'p', 'r' or 'f'.

    Returns:
        The minimum score found, or ``np.inf`` when `texts` is empty.
    """
    lowest = np.inf
    for sample in texts:
        predicted = get_predicted_text(sample)
        first_result = get_scores_rouge(sample, predicted)[0]
        value = first_result.get(rouge).get(metric)
        lowest = min(lowest, value)
    return lowest

In [None]:
rouge_min(list(test['text']), 'rouge-l', 'p')

In [None]:
rouge_min(list(test['text']), 'rouge-l', 'r')

In [None]:
rouge_min(list(test['text']), 'rouge-l', 'f')