<a href="https://colab.research.google.com/github/omiid-n99/Persian-Text-Prediction/blob/main/Text_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SimpleRNN, GRU
from tensorflow.keras.callbacks import EarlyStopping
# !pip install hazm
from hazm import *
import re
from google.colab import drive

# Constants
# Regex matching any single character outside the range آ–ی; preprocess_text
# passes it to re.sub and replaces every match with a space.
TEXT_CLEANING_RE = r"[^آ-ی]"
# Encoding handed to pd.read_csv when the dataset CSV is loaded.
DATASET_ENCODING = "utf-8"

In [None]:
# Load and preprocess the dataset
def load_dataset(file_path):
    """Mount Google Drive, read the CSV at ``file_path`` and clean its ``text`` column.

    Args:
        file_path: Path of the CSV file to read. The file is decoded with
            ``DATASET_ENCODING`` and must contain a ``text`` column.

    Returns:
        The loaded DataFrame, with every value of ``text`` replaced by the
        result of ``preprocess_text``.
    """
    # The dataset lives on Google Drive, so the drive has to be mounted first
    drive.mount('/content/drive')
    df = pd.read_csv(file_path, encoding=DATASET_ENCODING)
    # Series.apply returns a new Series and leaves the column untouched, so the
    # result must be assigned back; otherwise the cleaning is silently discarded
    # and training sees raw text while prediction sees cleaned text.
    df['text'] = df['text'].apply(preprocess_text)
    return df

In [None]:
# Preprocess text
def preprocess_text(text):
    """Normalise a raw string so that only characters in the آ–ی range and spaces remain.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of removed characters are not collapsed, so
        the result may contain consecutive spaces.
    """
    # Single-character substitutions applied in one pass before the regex:
    # punctuation becomes a space, 'ي' / 'ك' are mapped to 'ی' / 'ک', and
    # hyphens are dropped outright (joining the text on both sides).
    char_map = str.maketrans({
        '.': ' ',
        'ي': 'ی',
        'ك': 'ک',
        '-': None,
        '(': ' ',
        ')': ' ',
        '«': ' ',
        '»': ' ',
    })
    normalized = text.translate(char_map)

    # Everything still outside the allowed character range becomes a space
    return re.sub(TEXT_CLEANING_RE, " ", normalized)

In [None]:
def get_sequence_of_tokens(tokens):
  """Expand token sequences into (prefix, next-token) training pairs.

  For every sequence and every position ``cut`` from 1 to ``len(seq) - 1``,
  the prefix ``seq[:cut]`` is paired with the token that follows it,
  ``seq[cut]``. Sequences with fewer than two tokens contribute nothing.

  Args:
    tokens: Iterable of token-id sequences.

  Returns:
    A tuple ``(input_sequences, output_sequences)`` of two lists of equal
    length: the prefixes and the token following each prefix.
  """
  pairs = [
      (seq[:cut], seq[cut])
      for seq in tokens
      for cut in range(1, len(seq))
  ]
  input_sequences = [prefix for prefix, _ in pairs]
  output_sequences = [target for _, target in pairs]
  return input_sequences, output_sequences

In [None]:
# Load the dataset from its location on the mounted Google Drive
dataset_path = '/content/drive/My Drive/RNN Project/part_1.csv'
df = load_dataset(dataset_path)

Mounted at /content/drive


In [None]:
# Build the vocabulary from the text column
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['text'])

# word -> integer id mapping; the extra slot accounts for index 0, which the
# tokenizer never assigns to a word and which pad_sequences uses as filler
word_index = tokenizer.word_index
total_words = 1 + len(word_index)


In [None]:
# Tokenize the texts first so that lengths are measured in tokenizer tokens
sequences = tokenizer.texts_to_sequences(df['text'])

# Maximum sequence length, taken from the tokenized sequences themselves.
# Counting whitespace-separated words of the raw string can disagree with the
# tokenizer (its default filters split on punctuation such as '-' and drop
# punctuation-only tokens), and an underestimate would make pad_sequences
# silently truncate the longest training prefixes.
max_sequence_length = max(len(seq) for seq in sequences)

# Pad the text sequences to that common length
padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

In [None]:
# Build (prefix -> next token) training pairs from the tokenized texts
input_sequences, output_sequences = get_sequence_of_tokens(sequences)

# Pad every prefix at the front to a fixed length (this is pad_sequences' default)
x_train = pad_sequences(
    input_sequences,
    maxlen=max_sequence_length - 1,
    padding='pre',
)
# One-hot encode the target token ids over the whole vocabulary
y_train = to_categorical(output_sequences, num_classes=total_words)

In [None]:
# Create the model: embedding -> recurrent layer -> softmax over the vocabulary
model = Sequential([
    Embedding(total_words, 64, input_length=max_sequence_length - 1),
    LSTM(128),
    # Alternative recurrent layer: GRU(64, activation="tanh")
    Dense(total_words, activation='softmax'),
])

In [None]:
# Compile the model; categorical cross-entropy matches the one-hot targets
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [None]:
# Define early stopping: watch the validation loss, allow 3 epochs without
# improvement, then roll back to the best weights seen
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)


In [None]:
# Train the model
# 20% of the samples are held out for validation, which produces the val_loss
# that early_stop monitors. The callback has to be passed here: without it,
# early stopping and best-weight restoration never take effect and the model
# keeps whatever weights the final epoch produced.
model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)

Epoch 1/20


In [None]:
def predict_next_words(model, tokenizer, max_sequence_length, text, num_words=1):
    """Return the most probable candidates for the word that follows ``text``.

    Args:
        model: Trained model whose ``predict`` returns one probability per
            vocabulary index for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        max_sequence_length: Sequence length used when the training data was
            built; the model input has ``max_sequence_length - 1`` positions.
        text: Raw input text; it is cleaned with ``preprocess_text`` first.
        num_words: How many candidate words to return.

    Returns:
        A list of up to ``num_words`` words ordered from most to least
        probable. These are alternative candidates for the single next
        position, not a multi-word continuation. Indices without a vocabulary
        entry map to ``''``. An empty list is returned when ``num_words`` is
        not positive.
    """
    # arr[-0:] is the whole array, so a zero (or negative) request must be
    # handled explicitly instead of returning almost the entire vocabulary
    if num_words <= 0:
        return []

    # Preprocess the input text
    cleaned_text = preprocess_text(text)

    # Tokenize and pad the preprocessed text. Padding goes at the front, the
    # same way the training inputs were padded (pad_sequences' default), so the
    # real tokens sit at the end of the window as the model saw during training.
    text_tokenized = tokenizer.texts_to_sequences([cleaned_text])
    text_padded = pad_sequences(text_tokenized, maxlen=max_sequence_length - 1, padding='pre')

    # Predict the probability of every vocabulary index
    prediction = model.predict(text_padded)

    # argsort is ascending; reverse it so the most probable index comes first
    top_indices = np.argsort(prediction[0])[::-1][:num_words]

    # Convert the predicted word indices back to words
    predicted_words = [tokenizer.index_word.get(idx, '') for idx in top_indices]

    return predicted_words

# User input function
def user_input(model, tokenizer, max_sequence_length):
    """Interactively read sentences and print them with predicted words appended.

    Loops until the user enters ``q``. For each sentence the user is asked how
    many words to predict; the words returned by ``predict_next_words`` are
    joined with spaces and appended to the sentence.

    Args:
        model: Trained model forwarded to ``predict_next_words``.
        tokenizer: Fitted tokenizer forwarded to ``predict_next_words``.
        max_sequence_length: Sequence length forwarded to ``predict_next_words``.
    """
    while True:
        text = input("Enter a broken sentence (type 'q' to quit): ")
        # Tolerate surrounding whitespace and an upper-case 'Q'
        if text.strip().lower() == 'q':
            break

        # A non-numeric answer must not crash the whole loop; ask again instead
        try:
            num_words = int(input("Enter the number of words to predict: "))
        except ValueError:
            print("Please enter a whole number.")
            continue
        if num_words < 1:
            print("The number of words must be at least 1.")
            continue

        predicted_words = predict_next_words(model, tokenizer, max_sequence_length, text, num_words)

        # Combine the input sentence with the predicted words to form a complete sentence
        completed_sentence = text + ' ' + ' '.join(predicted_words)
        print("Completed Sentence:", completed_sentence)

# Start the interactive prediction loop with the trained model and fitted tokenizer
user_input(
    model=model,
    tokenizer=tokenizer,
    max_sequence_length=max_sequence_length,
)


# def generate_sequence(model, tokenizer, max_sequence_length, seed_text, sequence_length):
#     # Preprocess the seed text
#     cleaned_text = preprocess_text(seed_text)

#     # Tokenize and pad the preprocessed text
#     seed_tokenized = tokenizer.texts_to_sequences([cleaned_text])
#     seed_padded = pad_sequences(seed_tokenized, maxlen=max_sequence_length - 1, padding='post')

#     generated_sequence = seed_padded.copy()

#     for _ in range(sequence_length):
#         # Predict the next word
#         prediction = model.predict(seed_padded)
#         predicted_word_idx = np.argmax(prediction[0])

#         # Append the predicted word index to the generated sequence
#         generated_sequence[0][-1] = predicted_word_idx

#         # Update the seed for the next iteration
#         seed_padded = generated_sequence

#     # Convert the generated sequence back to text
#     generated_text = ' '.join(tokenizer.index_word.get(idx, '') for idx in generated_sequence[0])
#     return generated_text

# def user_input(model, tokenizer, max_sequence_length):
#     while True:
#         text = input("Enter a seed text (type 'q' to quit): ")
#         if text == 'q':
#             break
#         sequence_length = int(input("Enter the number of sequence length: "))
#         generated_sequence = generate_sequence(model, tokenizer, max_sequence_length, text, sequence_length)
#         print("Generated Sequence:", generated_sequence)

# # Example usage:
# user_input(model, tokenizer, max_sequence_length)
