In [None]:
import string
import pickle
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split

In [None]:
# Turkish function words (conjunctions, particles, question words) that are
# removed from the corpus before tokenisation. Kept as a set for O(1) lookups.
turkish_stopwords = {
    "acaba", "ama", "ancak", "bence", "böyle", "böylece", "çünkü",
    "da", "daha", "de", "defa", "değil",
    "e", "eğer", "en",
    "gibi",
    "hem", "her", "hiç",
    "için", "ile", "ise", "işte",
    "ki", "kim",
    "mi",
    "nasıl", "ne", "neden", "niçin", "nitekim",
    "oysa", "öyle",
    "şu",
    "veya",
    "ya", "yahut", "yani",
}

def process_file(filename, stopwords=None):
    """Read a UTF-8 text file and return it as one cleaned, space-separated string.

    Cleaning steps, in order:
      1. line breaks become spaces; the BOM and curly double quotes are removed
      2. the text is lower-cased with Turkish casing rules (I -> ı, İ -> i)
      3. sentence-level marks (: ; ? ! -) are turned into '.'
      4. every other ASCII punctuation character is deleted
      5. runs of '.' are collapsed to a single '.'
      6. stopwords are dropped (a '.' attached to a word is ignored for the
         comparison, and is dropped together with the stopword)

    Args:
        filename: path of the text file to read.
        stopwords: optional collection of lower-case words to drop. Defaults to
            the module-level ``turkish_stopwords``.

    Returns:
        The cleaned text with words separated by exactly one space.
    """
    if stopwords is None:
        stopwords = turkish_stopwords

    with open(filename, "r", encoding="utf8") as file:
        data = file.read()

    data = data.replace('\n', ' ').replace('\r', ' ').replace('\ufeff', '')
    data = data.replace('“', '').replace('”', '')

    # str.lower() maps 'I' to 'i' and 'İ' to 'i' + a combining dot, which is
    # wrong for Turkish; fix both letters by hand before lower-casing the rest.
    data = data.replace('İ', 'i').replace('I', 'ı').lower()

    # Normalise sentence marks BEFORE stripping punctuation: these characters
    # are all part of string.punctuation, so doing it afterwards never matches.
    for mark in ':;?!-':
        data = data.replace(mark, '.')

    translator = str.maketrans('', '', string.punctuation.replace('.', ''))
    data = data.translate(translator)

    # Collapse '..', '...', '....' etc. (including runs created by the
    # deletions above) into one period.
    while '..' in data:
        data = data.replace('..', '.')

    words = [word for word in data.split() if word.strip('.') not in stopwords]

    return ' '.join(words)

# Clean both corpus files (in this order) and glue them into one training text.
data, data1 = map(process_file, ("fts1.txt", "calikusu.txt"))

combined_data = " ".join([data, data1])

In [None]:
# Build the word -> integer id vocabulary from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([combined_data])

# Persist the fitted tokenizer so inference can reuse the exact same ids.
# The context manager guarantees the file is flushed and closed right away.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Number of context words the model sees before the word it has to predict.
sequence_length = 5

# The whole corpus as one list of word ids, in reading order.
sequence_data = tokenizer.texts_to_sequences([combined_data])[0]

# Slide a window of sequence_length + 1 ids over the corpus, one step at a
# time: the first sequence_length ids are the input, the last one the target.
sequences = np.array([
    sequence_data[start:start + sequence_length + 1]
    for start in range(len(sequence_data) - sequence_length)
])

X, y = sequences[:, :-1], sequences[:, -1]

In [None]:
# Number of classes for the embedding table and the softmax layer: one per
# known word, plus one because the Keras Tokenizer numbers words starting at 1
# (index 0 is never assigned to a word).
vocab_size = len(tokenizer.word_index) + 1
vocab_size

In [None]:
# One-hot encode the target word ids for categorical cross-entropy.
# The ids are read from `sequences` rather than from `y` itself, so running
# this cell a second time does not re-encode an already one-hot array.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [None]:
# Next-word model: embed the context word ids, summarise them with a single
# LSTM layer, then score every vocabulary entry with a softmax layer.
model = Sequential()
# The input width comes from `sequence_length` (the window size used to build
# X) so the model cannot silently drift away from the shape of the data.
model.add(Embedding(vocab_size, 100, input_length=sequence_length))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))
model.summary()

In [None]:
# Stop training once validation loss has failed to improve for 5 epochs in a
# row, and roll the model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Multi-class objective over one-hot targets, Adam with its default settings.
model.compile(
    optimizer=Adam(),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for at most 35 epochs, holding out 20% of the samples for validation;
# early stopping may end the run sooner.
history = model.fit(
    X,
    y,
    validation_split=0.2,
    epochs=35,
    callbacks=[early_stopping],
)

In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
    """Draw training vs. validation curves for loss and accuracy side by side.

    Args:
        history: object whose ``history`` attribute maps 'loss', 'val_loss',
            'accuracy' and 'val_accuracy' to per-epoch value sequences
            (for example the value returned by ``model.fit``).
    """
    # (key in history.history, human-readable label) for each panel, left to right.
    panels = (
        ('loss', 'Loss'),
        ('accuracy', 'Accuracy'),
    )

    plt.figure(figsize=(12, 5))

    for position, (metric, label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'Training {label}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {label}')
        plt.title(f'Training and Validation {label}')
        plt.xlabel('Epochs')
        plt.ylabel(label)
        plt.legend()

    plt.show()

In [None]:
# Show the loss and accuracy curves recorded by model.fit.
plot_history(history)

In [None]:
# Write the trained model to model.h5 in the current working directory.
model.save('model.h5')

In [None]:
!cp 'model.h5' "/content/drive/MyDrive/testt/"
!cp 'token.pkl' "/content/drive/MyDrive/testt/"

In [None]:
def Predict_Next_Words(model, tokenizer, text, sequence_length=None):
  """Return the word the model considers most likely to follow `text`.

  Args:
    model: trained model exposing ``predict``; it receives one row of word ids.
    tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and ``word_index``.
    text: the context, either a string or a list of word tokens.
    sequence_length: number of word ids the model expects per sample. When
      omitted it is read from ``model.input_shape[1]`` if the model has one.

  Returns:
    The predicted word, or "" when none of the context words are known to the
    tokenizer or the predicted index has no word assigned.
  """
  token_ids = list(tokenizer.texts_to_sequences([text])[0])
  if not token_ids:
    # Every context word was out of vocabulary: there is nothing to feed the
    # model, so give the same "" the lookup below returns on a miss.
    return ""

  if sequence_length is None:
    input_shape = getattr(model, "input_shape", None)
    if input_shape is not None and len(input_shape) > 1:
      sequence_length = input_shape[1]

  if sequence_length:
    # Unknown words are dropped by the tokenizer, which can leave fewer ids
    # than the model's fixed input width. Keep the most recent ids and
    # left-pad with 0 (an id no word uses) so the shape always matches.
    # NOTE(review): the model was not trained on padded input, so padded
    # predictions are best-effort only.
    token_ids = token_ids[-sequence_length:]
    token_ids = [0] * (sequence_length - len(token_ids)) + token_ids

  sequence = np.array([token_ids])
  probabilities = np.asarray(model.predict(sequence))
  # Row 0 is the only sample in the batch; argmax over it is the word id.
  preds = int(np.argmax(probabilities[0]))

  return next(
      (word for word, index in tokenizer.word_index.items() if index == preds),
      "",
  )

In [None]:
def word_processing(text, sequence_length=5):
  """Turn raw user text into the list of context words for prediction.

  The text is lower-cased with Turkish casing rules, punctuation is removed so
  that tokens such as 'mehmet,' can match the tokenizer vocabulary, and only
  the last `sequence_length` words are kept.

  Args:
    text: raw input sentence.
    sequence_length: maximum number of trailing words to return (default 5).

  Returns:
    A list of at most `sequence_length` cleaned, lower-case words.
  """
  # str.lower() maps 'I' to 'i' and 'İ' to 'i' + a combining dot, which is
  # wrong for Turkish; fix both letters by hand before lower-casing the rest.
  text = text.replace('İ', 'i').replace('I', 'ı').lower()

  # Quotes and other punctuation are deleted; sentence-level marks become a
  # word break instead, so "evet.hayır" still yields two words.
  cleanup = {ord(mark): None for mark in string.punctuation + '“”'}
  cleanup.update({ord(mark): ' ' for mark in '.:;?!-'})

  # split() without arguments also swallows repeated spaces, tabs and
  # newlines, so no empty tokens are produced.
  words = text.translate(cleanup).split()

  if sequence_length <= 0:
    return []
  return words[-sequence_length:]

In [None]:
# Demo: predict the word that follows a sample sentence. Only its last five
# words are passed to the model as context.
text = 'Merhaba benim adım mehmet, bugün'

Predict_Next_Words(model, tokenizer, word_processing(text))

In [None]:
# Second demo: a five-word seed sentence, so the whole sentence is the context.
seed_text = 'Kütüphanede saatlerce çalıştıktan sonra biraz'
Predict_Next_Words(model, tokenizer, word_processing(seed_text))