In [13]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau
import nltk
import re
import unicodedata
from nltk.tokenize import word_tokenize
import pyarabic.araby as araby
from collections import Counter

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [14]:
input_file = 'data.txt'
# input_file = 'cleaned_data.txt'

with open(input_file, 'r', encoding='utf-8') as infile:
    data = infile.read()

In [15]:
corpus = data.split('\n')
print(corpus[:5])  # Print first 5 lines of the corpus to check the data


['في البلد', ', اهل البلد متجمعين ومستنين الامل الجديد اللي هيخلصهم من الكابوس اللي عايشين فيه', ', مدير الشرطه: المفروض انه علي وصول', ', واحد من كبار البلد(عم محمود): بس واحد هيعمل ايه يعني؟ المفروض كانوا بعتوا فريق ولا كتيبه لكن واحد؟', ', مدير الشرطه: اللي اعرفه عن اللي باعتينه ان هو بيمثل كتيبه كامله لوحده وهو من اكفأ الظباط هو لوحده كفايه']


# Clean Text

In [16]:
def clean_text(text):
    # Remove emojis
    text = re.sub(r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF]+', ' ', text) 
    text = re.sub(r'\s+', ' ', text).strip() # Remove extra spaces
    text = re.sub(' +', ' ', text)

    
    return text

In [17]:


def tokenize_arabic(text):
    # Tokenize using NLTK's Arabic-aware tokenizer
    text = clean_text(text)
    
    tokens = word_tokenize(text)
    
    # Remove stopwords (optional)
    from nltk.corpus import stopwords
    arabic_stopwords = set(stopwords.words('arabic'))
    tokens = [token for token in tokens if token not in arabic_stopwords]
    
    return tokens

In [18]:
# corpus = [clean_text(text) for text in corpus]
corpus = corpus[:7000]# Limit to 5000 samples for demonstration
# Step 2: Tokenize the Text
tokenizer = Tokenizer() #oov_token='<oov>
tokenizer.fit_on_texts([tokenize_arabic(text) for text in corpus])
vocab_size = len(tokenizer.word_index) + 1  # Add 1 for padding token

tokenizer.word_counts

OrderedDict([('البلد', 70),
             ('اهل', 10),
             ('متجمعين', 3),
             ('ومستنين', 2),
             ('الامل', 6),
             ('الجديد', 4),
             ('اللي', 670),
             ('هيخلصهم', 1),
             ('الكابوس', 8),
             ('عايشين', 4),
             ('مدير', 18),
             ('الشرطه', 17),
             ('المفروض', 30),
             ('انه', 225),
             ('علي', 586),
             ('وصول', 2),
             ('كبار', 2),
             ('عم', 375),
             ('محمود', 393),
             ('هيعمل', 7),
             ('ايه', 471),
             ('يعني؟', 17),
             ('كانوا', 12),
             ('بعتوا', 1),
             ('فريق', 3),
             ('كتيبه', 2),
             ('واحد؟', 2),
             ('اعرفه', 7),
             ('باعتينه', 1),
             ('ان', 316),
             ('بيمثل', 1),
             ('كامله', 3),
             ('لوحده', 23),
             ('اكفأ', 4),
             ('الظباط', 1),
             ('كفايه', 58),
         

In [19]:
from collections import Counter

word_counts = Counter(tokenizer.word_counts)
print("Most common words:", word_counts.most_common(10))
print("Number of words with count ≥5:", sum(1 for cnt in word_counts.values() if cnt >= 5))
print("Number of words with count <5:", sum(1 for cnt in word_counts.values() if cnt < 5))

Most common words: [('ادهم', 2104), ('ليلي', 1683), ('مش', 1046), ('انا', 834), ('اللي', 670), ('انت', 591), ('علي', 586), ('ده', 505), ('ايه', 471), ('محمود', 393)]
Number of words with count ≥5: 1686
Number of words with count <5: 10831


In [20]:
# def optimize_vocabulary(tokenizer, min_count):
#     # Get words that meet the min_count threshold
#     kept_words = [word for word, cnt in tokenizer.word_counts.items() if cnt >= min_count]
    
#     # Rebuild tokenizer only on these words
#     new_tokenizer = Tokenizer()
#     new_tokenizer.fit_on_texts([" ".join(kept_words)])  # Simpler approach
    
#     print(f"Vocabulary reduced from {len(tokenizer.word_index)} to {len(new_tokenizer.word_index)}")
#     return new_tokenizer

# # Apply vocabulary optimization
# tokenizer = optimize_vocabulary(tokenizer, min_count=2)
# vocab_size = len(tokenizer.word_index) + 1
sequences = tokenizer.texts_to_sequences(corpus)

In [21]:
vocab_size

12518

In [22]:
tokenizer.word_index

{'ادهم': 1,
 'ليلي': 2,
 'مش': 3,
 'انا': 4,
 'اللي': 5,
 'انت': 6,
 'علي': 7,
 'ده': 8,
 'ايه': 9,
 'محمود': 10,
 'عم': 11,
 'ايه؟': 12,
 'ان': 13,
 'كده': 14,
 'حاجه': 15,
 'انتي': 16,
 'علشان': 17,
 'دي': 18,
 'نقطة': 19,
 'انه': 20,
 'او': 21,
 'عايز': 22,
 'حسين': 23,
 'حنان': 24,
 'بقي': 25,
 'اي': 26,
 'حد': 27,
 'تاني': 28,
 'ليه': 29,
 'عارف': 30,
 'وانا': 31,
 'اني': 32,
 'يوم': 33,
 'الدكتور': 34,
 'عايزه': 35,
 'ام': 36,
 'ممكن': 37,
 'كانت': 38,
 'انك': 39,
 'كنت': 40,
 'اكتر': 41,
 'لحد': 42,
 'ابدا': 43,
 '؟': 44,
 'وهيا': 45,
 'يعني': 46,
 'يبقي': 47,
 'زي': 48,
 'شويه': 49,
 'انها': 50,
 'ومش': 51,
 'البيت': 52,
 'صبري': 53,
 'وبعدين': 54,
 'طول': 55,
 'مره': 56,
 'ليه؟': 57,
 'اول': 58,
 'دلوقتي': 59,
 'حتي': 60,
 'بره': 61,
 'مني': 62,
 'قوي': 63,
 'جدا': 64,
 'خلاص': 65,
 'احمد': 66,
 'طيب': 67,
 'ابوها': 68,
 'احنا': 69,
 'عليها': 70,
 'لازم': 71,
 'كمان': 72,
 'الكل': 73,
 'مصطفي': 74,
 'بعيد': 75,
 'طبعا': 76,
 'وادهم': 77,
 'كتير': 78,
 'شيئ': 79,
 'منك': 80,
 '

In [23]:
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

# pad sequences 
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# create predictors and label
x, labels = input_sequences[:,:-1],input_sequences[:,-1]

y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=42)

In [25]:
embedding_dim = 100 # Size of word embeddings
hidden_dim = 64 # Size of LSTM hidden state

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_len-1),
    Bidirectional(LSTM(hidden_dim, return_sequences=True)),  # Bidirectional LSTM 
    Dropout(0.3),
    Bidirectional(LSTM(hidden_dim//2)),
    Dense(vocab_size, activation='softmax')   # Output layer with softmax for vocabulary
])
adam = Adam(learning_rate=0.01)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])



In [None]:
epochs = 30

callbacks = [
    EarlyStopping(patience=5, monitor='val_accuracy', mode='max', verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5, verbose=1)
]

history = model.fit(X_train, y_train, epochs=epochs, validation_data=(X_val,y_val) , callbacks=callbacks, verbose=1)

Epoch 1/30
[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 142ms/step - accuracy: 0.0233 - loss: 8.4368 - val_accuracy: 0.0255 - val_loss: 8.6229 - learning_rate: 0.0100
Epoch 2/30
[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 128ms/step - accuracy: 0.0270 - loss: 8.5243 - val_accuracy: 0.0337 - val_loss: 8.7362 - learning_rate: 0.0100
Epoch 3/30
[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 130ms/step - accuracy: 0.0324 - loss: 7.7865 - val_accuracy: 0.0329 - val_loss: 8.7886 - learning_rate: 0.0100
Epoch 4/30
[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 0.0346 - loss: 7.6117
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
[1m1101/1101[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 132ms/step - accuracy: 0.0346 - loss: 7.6117 - val_accuracy: 0.0378 - val_loss: 8.8553 - learning_rate: 0.0100
Epoch 5/30
[1m1101/1101[0m [32m━━━━━━━━━

In [None]:
seq_len = max_sequence_len - 1

In [None]:
import sys
import numpy as np
from PyQt5.QtWidgets import (QApplication, QMainWindow, QVBoxLayout, QHBoxLayout, 
                            QLabel, QTextEdit, QPushButton, QWidget, QComboBox,QFileDialog)
from PyQt5.QtCore import Qt
from PyQt5.QtGui import QFont, QColor, QPalette

# Add these methods to your GUI class
def save_model(self):
    options = QFileDialog.Options()
    filename, _ = QFileDialog.getSaveFileName(self, "Save Model", "", "H5 Files (*.h5);;All Files (*)", options=options)
    if filename:
        self.model.save(filename)
        self.statusBar().showMessage(f"Model saved to {filename}")

def load_model(self):
    options = QFileDialog.Options()
    filename, _ = QFileDialog.getOpenFileName(self, "Load Model", "", "H5 Files (*.h5);;All Files (*)", options=options)
    if filename:
        self.model = tf.keras.models.load_model(filename)
        self.statusBar().showMessage(f"Model loaded from {filename}")

class ArabicTextPredictorGUI(QMainWindow):
    def __init__(self, model, tokenizer, seq_len):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.initUI()
        
    def initUI(self):
        # Main window settings
        self.setWindowTitle('Arabic Text Predictor')
        self.setGeometry(100, 100, 800, 600)
        
        # Set Arabic-friendly font
        font = QFont()
        font.setFamily("Arial")  # Or use "Traditional Arabic" if available
        font.setPointSize(14)
        
        # Central widget
        central_widget = QWidget()
        self.setCentralWidget(central_widget)
        
        # Main layout
        main_layout = QVBoxLayout()
        central_widget.setLayout(main_layout)
        
        # Title label
        title_label = QLabel("Arabic Text Prediction")
        title_label.setFont(QFont("Arial", 18, QFont.Bold))
        title_label.setAlignment(Qt.AlignCenter)
        main_layout.addWidget(title_label)
        
        # Text input
        self.text_input = QTextEdit()
        self.text_input.setFont(font)
        self.text_input.setPlaceholderText("Type Arabic text here...")
        self.text_input.setAlignment(Qt.AlignRight)  # Right-align for Arabic
        main_layout.addWidget(self.text_input)
        
        # Prediction controls layout
        controls_layout = QHBoxLayout()
        
        # Prediction type dropdown
        self.pred_type = QComboBox()
        self.pred_type.addItems(["Next Word", "Top 5 Suggestions"])
        self.pred_type.setFont(font)
        controls_layout.addWidget(self.pred_type)
        
        # Predict button
        predict_btn = QPushButton("Predict")
        predict_btn.setFont(font)
        predict_btn.clicked.connect(self.predict_text)
        controls_layout.addWidget(predict_btn)
        
        # Clear button
        clear_btn = QPushButton("Clear")
        clear_btn.setFont(font)
        clear_btn.clicked.connect(self.clear_text)
        controls_layout.addWidget(clear_btn)
        
        save_btn = QPushButton("Save Model")
        save_btn.setFont(font)
        save_btn.clicked.connect(self.save_model)
        controls_layout.addWidget(save_btn)

        load_btn = QPushButton("Load Model")
        load_btn.setFont(font)
        load_btn.clicked.connect(self.load_model)
        controls_layout.addWidget(load_btn)
                
        main_layout.addLayout(controls_layout)
        
        # Prediction output
        self.prediction_output = QTextEdit()
        self.prediction_output.setFont(font)
        self.prediction_output.setAlignment(Qt.AlignRight)
        self.prediction_output.setReadOnly(True)
        
        # Set background color for output
        palette = self.prediction_output.palette()
        palette.setColor(QPalette.Base, QColor(240, 240, 240))
        self.prediction_output.setPalette(palette)
        
        main_layout.addWidget(self.prediction_output)
        
        # Status bar
        self.statusBar().showMessage("Ready")
    
    def predict_text(self):
        input_text = self.text_input.toPlainText().strip()
        
        if not input_text:
            self.statusBar().showMessage("Please enter some text first")
            return
        
        try:
            # Preprocess and tokenize input
            tokens = self.tokenizer.texts_to_sequences([input_text])[0]
            
            if len(tokens) < self.seq_len:
                self.statusBar().showMessage(f"Please enter at least {self.seq_len} words")
                return
            
            # Take the last seq_len tokens
            seq = tokens[-self.seq_len:]
            padded_seq = pad_sequences([seq], maxlen=self.seq_len, padding='pre')
            
            # Make prediction
            pred_type = self.pred_type.currentText()
            
            if pred_type == "Next Word":
                predictions = self.model.predict(padded_seq, verbose=0)
                predicted_idx = np.argmax(predictions, axis=-1)[0]
                predicted_word = self.tokenizer.index_word.get(predicted_idx, "<UNK>")
                
                self.prediction_output.setPlainText(predicted_word)
                self.statusBar().showMessage("Prediction complete")
                
            elif pred_type == "Top 5 Suggestions":
                predictions = self.model.predict(padded_seq, verbose=0)[0]
                top_indices = np.argsort(predictions)[-5:][::-1]
                top_words = [self.tokenizer.index_word.get(idx, "<UNK>") for idx in top_indices]
                top_probs = [predictions[idx] for idx in top_indices]
                
                result = "\n".join([f"{word} ({prob:.2%})" for word, prob in zip(top_words, top_probs)])
                self.prediction_output.setPlainText(result)
                self.statusBar().showMessage("Top 5 predictions generated")
                
        except Exception as e:
            self.statusBar().showMessage(f"Error: {str(e)}")
            self.prediction_output.setPlainText("")
    
    def clear_text(self):
        self.text_input.clear()
        self.prediction_output.clear()
        self.statusBar().showMessage("Cleared")


app = QApplication(sys.argv)

# Set application style (optional)
app.setStyle('Fusion')

# For better Arabic text handling
app.setLayoutDirection(Qt.RightToLeft)

# Create and show the GUI
gui = ArabicTextPredictorGUI(model, tokenizer, seq_len)
gui.show()

sys.exit(app.exec_())

NameError: name 'model' is not defined

In [None]:
def predict_next_word(model, tokenizer, text, seq_len):
    # Tokenize and pad the input text
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    padded_text = pad_sequences([tokenized_text[-seq_len:]], maxlen=seq_len, padding='pre')
    
    # Predict the next word
    predictions = model.predict(padded_text)
    predicted_index = np.argmax(predictions, axis=-1)[0]
    
    # Map index back to word
    for word, index in tokenizer.word_index.items():
        if index == predicted_index:
            return word
    return "<UNK>"

def predict_top_five_words(model, tokenizer, text, seq_len):
    # Tokenize and pad the input text
    tokenized_text = tokenizer.texts_to_sequences([text])[0]
    padded_text = pad_sequences([tokenized_text[-seq_len:]], maxlen=seq_len, padding='pre')

    # Predict the next word probabilities
    predictions = model.predict(padded_text, verbose=0)[0]

    # Get top 5 indices
    top_five_indexes = np.argsort(predictions)[::-1][:5]

    # Map indices back to words
    top_five_words = []
    for idx in top_five_indexes:
        for word, index in tokenizer.word_index.items():
            if index == idx:
                top_five_words.append(word)
                break

    return top_five_words



In [None]:
test_sentence = "   اذهب "  # Example sentence
next_word = predict_next_word(model, tokenizer, test_sentence, seq_len)
print(f"Next word after '{test_sentence}': {next_word}")
print("Top 5 predictions: ", predict_top_five_words(model, tokenizer, test_sentence, seq_len))

In [None]:
# from tensorflow.keras.models import model_from_json

# model_json = model.to_json()
# with open("lstm_model2.json", "w") as json_file:
#     json_file.write(model_json)

In [None]:
# from tensorflow.keras.models import model_from_json

# with open("lstm_model2.json", "r") as json_file:
#     loaded_model_json = json_file.read()

# # Create model from loaded architecture
# loaded_model = model_from_json(loaded_model_json)

# print("Model architecture loaded successfully from JSON file.")

In [None]:
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Plot Accuracy
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()