In [1]:
!wget --no-check-certificate \
    https://storage.googleapis.com/protonx-cloud-storage/data.txt
data = open('data.txt').read()

--2021-01-14 10:54:21--  https://storage.googleapis.com/protonx-cloud-storage/data.txt
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.20.128, 74.125.195.128, 74.125.142.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.20.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93578 (91K) [text/plain]
Saving to: ‘data.txt’


2021-01-14 10:54:21 (114 MB/s) - ‘data.txt’ saved [93578/93578]



In [2]:
# One lower-cased entry per non-empty line of the raw text.
# splitlines() recognises "\n", "\r\n" and "\r", so a carriage return never
# stays attached to the last word of a line; blank lines are dropped because
# they contain no tokens and therefore contribute no training examples.
corpus = [line for line in data.lower().splitlines() if line.strip()]

## Yêu cầu: Sinh từ tiếp theo khi người dùng gõ một câu bất kỳ, với chiều dài tùy ý.

In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Build the word <-> index vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# +1 because the vocabulary size must also cover index 0, which is not
# assigned to any word in word_index.
total_words = 1 + len(tokenizer.word_index)

# Turn every line into (prefix, next word) training pairs: for a line
# [w0, w1, w2] this yields ([w0], w1) and ([w0, w1], w2).
train_sequences = []
train_labels = []
for token_list in tokenizer.texts_to_sequences(corpus):
    for i, next_token in enumerate(token_list[1:], start=1):
        train_sequences.append(token_list[:i])
        train_labels.append(next_token)

### 1. Xử lý dữ liệu. Chia features, label

In [6]:
# Length of the longest prefix; every prefix is left-padded up to this length
# so that all model inputs share one shape.
max_sequence_len = max(len(prefix) for prefix in train_sequences)
padded_train_sequences = pad_sequences(
    train_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

# One-hot encode the next-word ids over the whole vocabulary.
# NOTE: this overwrites train_labels, so the cell is not safe to re-run on its
# own -- re-run the cell that builds train_labels first.
train_labels = tf.keras.utils.to_categorical(
    train_labels,
    num_classes=total_words,
)

### 2. Xây dựng model
#### Yêu cầu độ chính xác: 80%

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, Dropout, LSTM, Dense

In [47]:
# Next-word prediction network, declared as a single ordered list of layers.
model = Sequential([
    # Embedding layer: maps each word index to a 100-dimensional vector.
    Embedding(total_words, 100, input_length=max_sequence_len),
    # Bidirectional layer wrapping an LSTM cell; it returns the full sequence
    # so the following LSTM receives one vector per time step.
    Bidirectional(LSTM(150, return_sequences=True)),
    # Dropout layer.
    Dropout(0.2),
    # Second LSTM layer; emits only its final output.
    LSTM(100),
    # Two Dense layers that identify the next word: an L2-regularised hidden
    # layer followed by a softmax over the whole vocabulary.
    Dense(total_words // 2, activation='relu', kernel_regularizer='l2'),
    Dense(total_words, activation='softmax'),
])

# Compile: categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

In [48]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 10, 100)           321100    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 10, 300)           301200    
_________________________________________________________________
dropout_7 (Dropout)          (None, 10, 300)           0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 100)               160400    
_________________________________________________________________
dense_14 (Dense)             (None, 1605)              162105    
_________________________________________________________________
dense_15 (Dense)             (None, 3211)              5156866   
Total params: 6,101,671
Trainable params: 6,101,671
Non-trainable params: 0
____________________________________________

In [49]:
# Train on the padded prefixes for 100 epochs with per-epoch progress output;
# the value returned by fit() is kept in `history` for later inspection.
history = model.fit(
    x=padded_train_sequences,
    y=train_labels,
    epochs=100,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### 3. Dự đoán 10 từ tiếp theo

In [50]:
test_seq = 'despite of wrinkles'
# Generate the next 10 words after test_seq.
# Sample continuation (the exact words depend on the trained weights):
# despite of wrinkles this thy golden time to heart's sight ' must '
num_words = 10


def generate_next_words(seed_text, n_words):
    """Greedily extend ``seed_text`` with up to ``n_words`` predicted words.

    At each step the text generated so far is tokenised, left-padded (and, if
    it is too long, truncated) to ``max_sequence_len``, and fed to ``model``;
    the most probable word is appended to the text.

    Args:
        seed_text: Starting text.
        n_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        entry in ``tokenizer.index_word``.
    """
    generated = seed_text
    for _ in range(n_words):
        token_ids = tokenizer.texts_to_sequences([generated])[0]
        model_input = pad_sequences([token_ids], maxlen=max_sequence_len, padding='pre')
        # Sequential.predict_classes() was removed in TensorFlow 2.6; taking the
        # argmax over the softmax output is the equivalent operation.
        probabilities = model.predict(model_input, verbose=0)
        predicted_id = int(probabilities.argmax(axis=-1)[0])
        next_word = tokenizer.index_word.get(predicted_id)
        if next_word is None:
            # Index 0 (the padding value) maps to no word; stop instead of
            # raising KeyError.
            break
        generated += ' ' + next_word
    return generated


test_seq = generate_next_words(test_seq, num_words)
print(test_seq)



despite of wrinkles this thy golden time or ever did place old lie
