<a href="https://colab.research.google.com/github/mannmoshe/text-recognition/blob/main/true_text_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
import random

In [2]:
req = requests.get("https://raw.githubusercontent.com/mannmoshe/text-recognition/main/torah_heb.txt")
req.encoding = 'ISO-8859-8'
torah_text = req.text

In [3]:
torah_words = torah_text.split()

In [4]:
phrases = {}
for i in range(0, int(len(torah_words)), 3):
  phrase = ''.join(torah_words[i: i+3])
  phrases[phrase] = 1 # true text
  phrase_letters_list = [l for l in phrase]
  random.Random(4).shuffle(phrase_letters_list) 
  # Random(4) for same result every time, see https://stackoverflow.com/questions/19306976/python-shuffling-with-a-parameter-to-get-the-same-result
  phrases[''.join(phrase_letters_list)] = 0 # random text

In [5]:
phrases_dataset = pd.DataFrame.from_dict(phrases, orient='index').reset_index()

In [6]:
phrases_dataset.columns = ['text', 'label']

In [7]:
phrases_dataset.head(10)

Unnamed: 0,text,label
0,בראשיתבראאלהים,1
1,איתאםלבהארבריש,0
2,אתהשמיםואת,1
3,אהאוםתיתמש,0
4,הארץוהארץהיתה,1
5,תיץההההרראאוץ,0
6,תהוובהווחשך,1
7,והושתחךוהבו,0
8,עלפניתהום,1
9,לפהוםתעינ,0


In [8]:
X = phrases_dataset['text']
y = phrases_dataset['label']

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4, stratify=y)

In [10]:
train_samples = x_train.tolist()
test_samples = x_test.tolist()

In [11]:
import numpy as np
from keras.preprocessing.text import Tokenizer

In [12]:
# We create a tokenizer, configured to only take
# into account the top-10000 most common words
tokenizer = Tokenizer(num_words=10000, char_level=True)
# This builds the word index
tokenizer.fit_on_texts(train_samples)

# This turns strings into lists of integer indices.
train_sequences = tokenizer.texts_to_sequences(train_samples)
test_sequences = tokenizer.texts_to_sequences(test_samples)

# You could also directly get the one-hot binary representations.
# Note that other vectorization modes than one-hot encoding are supported!
x_train = tokenizer.texts_to_matrix(train_samples, mode='binary')
x_test = tokenizer.texts_to_matrix(test_samples, mode='binary')

# This is how you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 27 unique tokens.


In [13]:
word_index

{'י': 1,
 'ו': 2,
 'ה': 3,
 'א': 4,
 'ל': 5,
 'ת': 6,
 'ר': 7,
 'ב': 8,
 'ש': 9,
 'מ': 10,
 'ע': 11,
 'ם': 12,
 'נ': 13,
 'כ': 14,
 'ח': 15,
 'ד': 16,
 'ק': 17,
 'ן': 18,
 'פ': 19,
 'ך': 20,
 'צ': 21,
 'ז': 22,
 'ג': 23,
 'ס': 24,
 'ט': 25,
 'ץ': 26,
 'ף': 27}

In [14]:
lens = [len(s) for s in train_sequences]
lens[:20]

[13, 9, 11, 9, 10, 11, 15, 13, 13, 14, 12, 10, 14, 14, 10, 13, 11, 8, 11, 10]

In [15]:
sum(lens)/len(lens)

11.49652566134341

In [16]:
max_len = 12
x_train_embedding = []

for s in train_sequences:
  if len(s) >= 12:
    x_train_embedding.append(s[:12])
  else:
    x_train_embedding.append(s + [0]*(max_len-len(s)))

x_train_embedding = np.array(x_train_embedding)

In [17]:
x_test_embedding = []

for s in test_sequences:
  if len(s) >= 12:
    x_test_embedding.append(s[:12])
  else:
    x_test_embedding.append(s + [0]*(max_len-len(s)))

x_test_embedding = np.array(x_test_embedding)

In [18]:
y_train = y_train.to_numpy()

In [19]:
y_train

array([0, 1, 0, ..., 0, 0, 0])

In [20]:
x_train_embedding

array([[ 1,  9, 15, ...,  6,  1, 15],
       [ 1,  3,  1, ...,  0,  0,  0],
       [ 7,  3,  9, ...,  7,  9,  0],
       ...,
       [ 1,  8,  5, ..., 14,  0,  0],
       [10,  1,  9, ..., 13,  8,  0],
       [ 7,  3,  4, ..., 13,  0,  0]])

In [21]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import SimpleRNN, Dense
from keras.layers import LSTM
from keras.layers import Dropout

In [22]:
model = Sequential()
model.add(Embedding(10000, 8, input_length=max_len))
model.add(LSTM(64))
model.compile(optimizer='rmsprop',
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.add(Dense(1, activation="sigmoid"))
model.summary()

history = model.fit(x_train_embedding, 
                    y_train,
                    epochs=50,
                    batch_size=64,
                    validation_split=0.33)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 12, 8)             80000     
                                                                 
 lstm (LSTM)                 (None, 64)                18688     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 98,753
Trainable params: 98,753
Non-trainable params: 0
_________________________________________________________________
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
E

In [23]:
x_train[0]

array([0., 1., 1., ..., 0., 0., 0.])

In [24]:
predicted_labels = model.predict(x_test_embedding)



In [25]:
print(type(predicted_labels),type(y_test))

<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>


In [26]:
predicted_labels

array([[0.9547584 ],
       [0.18810605],
       [0.8858482 ],
       ...,
       [0.90227455],
       [0.01199283],
       [0.942062  ]], dtype=float32)

In [27]:
my_list = map(lambda x: x[0], predicted_labels)
predictions = pd.Series(my_list)

In [28]:
pd.concat([predictions, y_test], axis=1)

Unnamed: 0,0,label
0,0.954758,
1,0.188106,
2,0.885848,
3,0.980954,
4,0.726101,
...,...,...
48951,,0.0
48952,,1.0
48953,,0.0
48954,,1.0
