In [None]:
# Import necessary libraries

import keras
import numpy as pd
import numpy as np
import tensorflow as tf

In [None]:
# Load the raw lyrics and split them into individual mesra' strings.
# The encoding is passed explicitly so reading the (Persian) text does not
# depend on the platform's default encoding.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open("manocheri_lyric.txt", "r", encoding="utf-8") as f:
    data = f.read()
# Separate each mesra': the text is split on "#"; the replace() first puts a
# newline in front of every "#", so each piece ends with a newline.
data = data.replace("#", "\n#")
corpus = data.split("#")

In [None]:
# Fit a word-level tokenizer on the corpus and report the vocabulary size.

# Characters removed from the text before it is split into words.
stripped_chars = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters=stripped_chars)
tokenizer.fit_on_texts(corpus)

# One more than the number of distinct words: index 0 is not assigned to any
# word (it is the value pad_sequences fills with later on).
total_word = 1 + len(tokenizer.word_index)
print(total_word)

In [None]:
# Turn every mesra' into its growing prefixes of token ids.
# For a mesra' encoded as [a, b, c] this yields [a, b] and [a, b, c];
# single-token and empty lines contribute nothing.

input_mesra = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(encoded) + 1)
]

In [None]:
## Example of input_mesra before padding (useful for seeing how the labels are derived)
# print(input_mesra)
# [[1057, 75],
# [1057, 75, 1],
# [1057, 75, 1, 805],
# [1057, 75, 1, 805, 34],
# [1057, 75, 1, 805, 34, 1],
# [1057, 75, 1, 805, 34, 1, 1526],
# [69, 230],
# [69, 230, 806],
# [69, 230, 806, 1],
# [69, 230, 806, 1, 316],
# [69, 230, 806, 1, 316, 90],
# [69, 230, 806, 1, 316, 90, 2618],
# ...

In [None]:
# Bring every prefix sequence to the same length.

# Length of the longest sequence; all others are padded up to it.
max_mesra = max(map(len, input_mesra))

# Zeros are added at the front ("pre") so the last column always holds a real
# token, which is the one later used as the label.
input_mesra = np.array(
    keras.preprocessing.sequence.pad_sequences(
        input_mesra, maxlen=max_mesra, padding="pre"
    )
)

In [None]:
# Features are every column but the last; the label is the last column.
Xs, Ys = input_mesra[:, :-1], input_mesra[:, -1]

In [None]:
# One-hot encode the labels.
# The labels are read from input_mesra rather than from Ys itself so this cell
# can be re-run safely: feeding the already one-hot Ys back into
# to_categorical would add another class dimension to it.
Ys = keras.utils.to_categorical(x=input_mesra[:, -1], num_classes=total_word)

In [None]:
# Show the shape of the feature matrix; its second dimension is the sequence
# length that the model's input layer has to accept.
Xs.shape

In [None]:
# model
# Embedding -> bidirectional LSTM -> softmax over the vocabulary.

# The input length is read from the training data instead of being
# hard-coded, so the network keeps matching Xs when the corpus changes.
sequence_length = Xs.shape[1]

model = keras.models.Sequential()

model.add(keras.layers.InputLayer(shape=(sequence_length,)))

model.add(
    keras.layers.Embedding(
        total_word,
        output_dim=100,
    )
)
model.add(keras.layers.Bidirectional(keras.layers.LSTM(150, activation="relu")))

# One output unit per vocabulary index, matching the one-hot labels.
model.add(keras.layers.Dense(units=total_word, activation="softmax"))


# Display model architecture summary
model.summary()
keras.utils.plot_model(model, show_shapes=True)
model.compile(
    loss="categorical_crossentropy",
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    metrics=["acc"],
)

In [None]:
# Train the model on the full in-memory arrays.
#### IMPORTANT NOTE:
#### fitting the model this way needs about 4 GB of RAM,
#### which is not available in my Docker container.
#### The next cell reduces it to ~600 MB.

history = model.fit(
    x=Xs,
    y=Ys,
    batch_size=128,
    epochs=100,
    verbose=1,
)

In [None]:
## NICE JOB DEEPSEEK!!!

from tensorflow.keras.utils import Sequence


class DataGenerator(Sequence):
    """Serve (x, y) mini-batches by slicing two aligned, sliceable collections.

    Args:
        x: Model inputs; must support ``len()`` and slicing.
        y: Targets aligned with ``x``; must support slicing.
        batch_size: Maximum number of samples per batch.
        **kwargs: Forwarded to ``Sequence.__init__`` (in Keras 3 this accepts
            ``workers``, ``use_multiprocessing`` and ``max_queue_size``).
    """

    def __init__(self, x, y, batch_size, **kwargs):
        # Keras 3 warns when a Sequence/PyDataset subclass skips the base
        # initialiser, so always chain up.
        super().__init__(**kwargs)
        self.x = x
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches per epoch (the last one may be short)."""
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(batch_x, batch_y)`` pair."""
        start = idx * self.batch_size
        stop = start + self.batch_size
        return self.x[start:stop], self.y[start:stop]


# Wrap the training arrays so Keras pulls them one batch of 128 at a time.
train_generator = DataGenerator(x=Xs, y=Ys, batch_size=128)

# Train the model from the generator.
model.fit(x=train_generator, epochs=100)

In [None]:
# predict: extend a seed text one word at a time with the trained model
# NOTE(review): the tokenizer was built without an oov_token, so seed words
# missing from the vocabulary are presumably dropped — verify the seed text.
input_text = "گقتم درین سرایم گفنی برخیز ز جایت"
next_words = 90

# The model was trained on the rows of Xs (the padded sequences with the label
# column removed), so prompts must be padded/truncated to that same length,
# not to max_mesra.
model_input_len = Xs.shape[1]

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([input_text])[0]
    token_list = keras.preprocessing.sequence.pad_sequences(
        [token_list], maxlen=model_input_len, padding="pre"
    )

    ## Make predictions
    predict = model.predict(token_list, verbose=0)  # too many logs
    ## Index of the most probable next word (one entry per input row)
    predicted_classes = np.argmax(predict, axis=1)
    predicted_index = int(predicted_classes[0])

    # Decode the index back to a word with the tokenizer's reverse mapping;
    # an index with no word (e.g. 0) yields an empty string.
    output_word = tokenizer.index_word.get(predicted_index, "")
    input_text += " " + output_word
print("\n\n\n" + input_text)

In [None]:
# Persist the trained model to disk in the native .keras format.
model.save(filepath="manochehri.keras")