In [1]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Function to load data from files
def load_data(directory):
    """Read a one-folder-per-class text corpus into two parallel lists.

    Every immediate sub-directory of ``directory`` is treated as one class;
    its name becomes the label of each regular file found directly inside it.
    Entries of ``directory`` that are not directories, and entries of a class
    folder that are not regular files, are skipped.

    Args:
        directory: Path of the folder whose sub-directories are the classes.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. ``texts[i]`` is the
        full content of one file decoded as UTF-8 and ``labels[i]`` is the
        name of the class folder it was read from. Classes, and files within
        a class, are visited in sorted name order.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    labels = []
    # os.listdir() returns names in arbitrary order; sorting keeps the sample
    # order stable from one run (and one machine) to the next.
    for label in sorted(os.listdir(directory)):
        class_folder = os.path.join(directory, label)
        if not os.path.isdir(class_folder):
            continue
        for fname in sorted(os.listdir(class_folder)):
            file_path = os.path.join(class_folder, fname)
            # A nested folder inside a class folder cannot be open()'d as text.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                texts.append(file.read())
            labels.append(label)
    return texts, labels

# Read the train and test splits of the corpus; each split is a folder with
# one sub-folder per class (see load_data).
train_dir = 'C:\\Users\\Moham\\Downloads\\57zpx667y9-2\\SANAD_SUBSET\\khaleej\\Train'
test_dir = 'C:\\Users\\Moham\\Downloads\\57zpx667y9-2\\SANAD_SUBSET\\khaleej\\Test'

train_texts, train_labels = load_data(train_dir)
test_texts, test_labels = load_data(test_dir)

In [3]:
# Map the class-folder names to integer ids. The encoder is fitted on the
# training labels only and then applied to both splits.
# NOTE(review): this cell overwrites train_labels/test_labels with their
# encoded form, so re-running it without re-running the loading cell would
# fit the encoder on the integer ids instead of the class names.
label_encoder = LabelEncoder().fit(train_labels)
train_labels = label_encoder.transform(train_labels)
test_labels = label_encoder.transform(test_labels)


In [4]:
# Parameters

# -- text to sequence --
oov_tok = "<OOV>"       # token the Tokenizer is given as its oov_token
vocab_size = 10_000     # Tokenizer num_words and Embedding input size
max_length = 128        # length every sequence is padded/truncated to
padding_type = 'post'   # passed to pad_sequences(padding=...)
trunc_type = 'post'     # passed to pad_sequences(truncating=...)

# -- model --
embedding_dim = 16      # Embedding output size

In [5]:
# Fit the tokenizer on the training texts only, then convert both splits to
# fixed-length integer sequences with identical padding settings.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)
word_index = tokenizer.word_index

# Same padding/truncation arguments for train and test.
pad_kwargs = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

train_padded = pad_sequences(train_sequences, **pad_kwargs)
test_padded = pad_sequences(test_sequences, **pad_kwargs)


In [6]:
# Build the model: embedding -> average over the sequence -> dense head ->
# softmax with one output per encoded class.
num_classes = len(label_encoder.classes_)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Labels are integer ids, hence the sparse variant of the loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train the model.
# NOTE(review): the test split is used as validation data here and is the
# same data the evaluation cell scores afterwards, so it is not unseen data.
validation_set = (test_padded, test_labels)
model.fit(train_padded, train_labels, epochs=10, validation_data=validation_set)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1c99da8c650>

In [7]:
# Score the trained model on the padded test split; evaluate() returns the
# loss followed by the metrics given to compile() (accuracy).
loss, accuracy = model.evaluate(x=test_padded, y=test_labels)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy * 100:.2f}%")

Test Loss: 0.1507, Test Accuracy: 96.33%


In [9]:
# Persist the trained model to an HDF5 (.h5) file.
model_path = 'C:\\Users\\Moham\\OneDrive\\Desktop\\OCR\\OCR_model.h5'
model.save(model_path)

  saving_api.save_model(


In [10]:
# Reload the saved model from disk, replacing the in-memory `model`.
model_path = 'C:\\Users\\Moham\\OneDrive\\Desktop\\OCR\\OCR_model.h5'
model = tf.keras.models.load_model(model_path)

In [11]:
def classify_text_file(file_path, tokenizer, model, max_length=128, class_names=None):
    """Predict the class of the text stored in a single UTF-8 file.

    The file content is converted to one integer sequence with ``tokenizer``,
    padded/truncated at the end to ``max_length``, and passed to
    ``model.predict``. The index of the highest score is looked up in
    ``class_names``.

    Args:
        file_path: Path of the text file to classify.
        tokenizer: Object exposing ``texts_to_sequences``; it should be the
            tokenizer that was fitted for ``model``.
        model: Object exposing ``predict`` that returns one row of class
            scores per input sequence.
        max_length: Sequence length fed to the model; it should match the
            length used when the model was trained.
        class_names: Optional sequence mapping a class index to its label.
            When omitted, the notebook-level ``label_encoder.classes_`` is
            used, which requires ``label_encoder`` to exist in the kernel.

    Returns:
        The entry of ``class_names`` with the highest predicted score.
    """
    if class_names is None:
        # Backward-compatible default: the encoder fitted earlier in the notebook.
        class_names = label_encoder.classes_

    # Read the text file
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Tokenize and pad the text (a batch containing a single sample)
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    # Predict the class; row 0 holds the scores of the only sample in the batch
    prediction = model.predict(padded)
    best_index = int(np.argmax(prediction[0]))

    return class_names[best_index]

In [12]:
# Usage example: classify one text file with the tokenizer and model above
file_path = 'C:\\Users\\Moham\\OneDrive\\Desktop\\OCR\\tech_stuff.txt'
predicted_class = classify_text_file(
    file_path=file_path,
    tokenizer=tokenizer,
    model=model,
)
print("Predicted Class:", predicted_class)

Predicted Class: Tech
