In [71]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, InputLayer
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import numpy as np
import joblib as jb
import re

In [14]:
# Load the raw text file as a list of lines (trailing newlines are kept).
with open('deneme.txt', 'r', encoding='utf8') as handle:
    content = handle.readlines()

# Peek at the first few raw lines.
content[:5]

['The Project Gutenberg eBook of Pride and Prejudice\n',
 '    \n',
 'This ebook is for the use of anyone anywhere in the United States and\n',
 'most other parts of the world at no cost and with almost no restrictions\n',
 'whatsoever. You may copy it, give it away or re-use it under the terms\n']

In [26]:
# Trim surrounding whitespace from every line and drop the lines that end up empty.
content_prepare = list(filter(None, map(str.strip, content)))
content_prepare[:10]

['The Project Gutenberg eBook of Pride and Prejudice',
 'This ebook is for the use of anyone anywhere in the United States and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. You may copy it, give it away or re-use it under the terms',
 'of the Project Gutenberg License included with this ebook or online',
 'at www.gutenberg.org. If you are not located in the United States,',
 'you will have to check the laws of the country where you are located',
 'before using this eBook.',
 'Title: Pride and Prejudice',
 'Author: Jane Austen']

In [27]:
# Build the single-string corpus directly from `content` instead of from
# `content_prepare` itself. The original form read and overwrote the same
# name (list -> str), so running the cell a second time would join the
# individual characters of the already-joined string with spaces.
content_prepare = ' '.join(line.strip() for line in content if line.strip())
content_prepare[:600]

'The Project Gutenberg eBook of Pride and Prejudice This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this ebook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title: Pride and Prejudice Author: Jane Austen Release date: June 1, 1998 [eBook #1342] Most recently upda'

In [28]:
def preprocess(text):
    """Strip URLs, punctuation and digits from ``text`` and normalise whitespace.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string. Characters that are neither word characters nor
        whitespace are deleted (so ``re-use`` becomes ``reuse``), digits are
        deleted, and every run of whitespace -- including carriage returns
        and newlines -- becomes a single space. Underscores count as word
        characters and are kept. Leading/trailing whitespace is not trimmed.
    """
    # Remove URLs (starting with 'www.' or 'http' or 'https')
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Remove digits and special characters (quotes are covered by [^\w\s]).
    # Line breaks are deliberately NOT deleted here: deleting them would fuse
    # the last word of one line with the first word of the next.
    text_cleaned = re.sub(r'[^\w\s]|\d', '', text)
    # Replace every whitespace run (spaces, tabs, \r, \n) with a single space
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned)
    return text_cleaned

# Clean the joined corpus, then show the first 600 characters of the result.
content_prepare = preprocess(text=content_prepare)
content_prepare[0:600]

'The Project Gutenberg eBook of Pride and Prejudice This ebook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever You may copy it give it away or reuse it under the terms of the Project Gutenberg License included with this ebook or online at If you are not located in the United States you will have to check the laws of the country where you are located before using this eBook Title Pride and Prejudice Author Jane Austen Release date June eBook Most recently updated June Language English Credits Chuck Gre'

In [30]:
# Fit a tokenizer on the cleaned text, passed as a one-document corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([content_prepare])

# One document goes in, so exactly one id sequence comes back; unpack it.
(data_sequences,) = tokenizer.texts_to_sequences([content_prepare])

In [31]:
# One more than the number of entries in the tokenizer's word index
# (presumably because Tokenizer ids start at 1, leaving 0 unused -- confirm
# against the Keras docs). Used as the Embedding input size and the width
# of the output layer.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

7451


In [67]:
# Number of context tokens fed to the model for each prediction.
sequence_length = 5

# Slide a window of `sequence_length + 1` token ids over the corpus:
# the first `sequence_length` ids are the input, the last id is the target.
sequences = np.array([
    data_sequences[end - sequence_length: end + 1]
    for end in range(sequence_length, len(data_sequences))
])

# Split every window into its input part (x) and its next-token target (y).
x = np.array([window[:sequence_length] for window in sequences])
y = np.array([window[sequence_length] for window in sequences])

In [72]:
# Show up to the first five (input window, target id) pairs as a sanity check.
for k in range(min(5, len(x), len(y))):
    data, response = x[k], y[k]
    print(f'data: {data}  ->  response: {response}')

data: [  1 182 188 991   3]  ->  response: 300
data: [182 188 991   3 300]  ->  response: 4
data: [188 991   3 300   4]  ->  response: 890
data: [991   3 300   4 890]  ->  response: 41
data: [  3 300   4 890  41]  ->  response: 991


In [83]:
import scipy.sparse

def to_sparse_categorical(y, num_classes):
    """Encode integer labels as a sparse one-hot matrix.

    Args:
        y: Sequence of integer class labels, one per sample.
        num_classes: Number of columns of the resulting matrix.

    Returns:
        A ``scipy.sparse.coo_matrix`` of shape ``(len(y), num_classes)`` with
        float32 entries, holding a single 1.0 in row ``i`` at column ``y[i]``.
    """
    n_samples = len(y)
    # One stored value per sample ...
    ones = np.ones_like(y, dtype=np.float32)
    # ... placed at (sample index, label).
    coords = (np.arange(n_samples), y)
    return scipy.sparse.coo_matrix((ones, coords), shape=(n_samples, num_classes))

# Sparse one-hot version of the targets, one column per vocabulary index.
# NOTE(review): the training cell visible in this notebook fits on the
# integer labels, not on y_sparse -- confirm whether this is still needed.
y_sparse = to_sparse_categorical(y, vocab_size)

In [92]:
# Next-word model: token ids -> 100-d embeddings -> two stacked 600-unit
# LSTMs -> 600-unit ReLU layer -> softmax over the whole vocabulary.
model = Sequential([
    InputLayer(input_shape=(sequence_length,)),
    Embedding(vocab_size, 100),
    # return_sequences=True so the second LSTM receives the per-step outputs.
    LSTM(600, return_sequences=True),
    LSTM(600),
    Dense(600, activation='relu'),
    Dense(vocab_size, activation='softmax'),
])

model.summary()

In [93]:
# Adam optimiser with sparse categorical cross-entropy; accuracy is tracked
# as an additional metric.
model.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)

In [94]:
# EarlyStopping watches 'val_loss' unless told otherwise, but the training
# cell in this notebook passes no validation data to fit(), so that metric
# is never logged and the callback could never trigger. Watch the training
# loss instead: stop once it has not improved for 3 consecutive epochs.
early_stopping_monitor = EarlyStopping(monitor='loss', patience=3)

In [95]:
# Train on only the first 10,000 examples
x_small, y_small = x[:10000], y[:10000]

history = model.fit(
    x=x_small,
    y=y_small,
    batch_size=32,
    epochs=15,
    callbacks=[early_stopping_monitor],
)


Epoch 1/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 248ms/step - accuracy: 0.0355 - loss: 7.2511
Epoch 2/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 217ms/step - accuracy: 0.0457 - loss: 6.2067
Epoch 3/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 218ms/step - accuracy: 0.0471 - loss: 6.0505
Epoch 4/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 308ms/step - accuracy: 0.0682 - loss: 5.8566
Epoch 5/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 164ms/step - accuracy: 0.0752 - loss: 5.5944
Epoch 6/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 185ms/step - accuracy: 0.0858 - loss: 5.3494
Epoch 7/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 188ms/step - accuracy: 0.0977 - loss: 5.0949
Epoch 8/15
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 198ms/step - accuracy: 0.1131 - loss: 4.8525
Epoch 9/15
[1m