<a href="https://colab.research.google.com/github/mannmoshe/text-recognition/blob/main/true_text_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import pandas as pd
import random

In [2]:
req = requests.get("https://raw.githubusercontent.com/mannmoshe/text-recognition/main/torah_heb.txt")
req.encoding = 'ISO-8859-8'
torah_text = req.text

In [3]:
torah_words = torah_text.split()

In [4]:
phrases = {}
for i in range(0, int(len(torah_words)), 3):
  phrase = ''.join(torah_words[i: i+3])
  phrases[phrase] = 1 # true text
  phrase_letters_list = [l for l in phrase]
  random.Random(4).shuffle(phrase_letters_list) 
  # Random(4) for same result every time, see https://stackoverflow.com/questions/19306976/python-shuffling-with-a-parameter-to-get-the-same-result
  phrases[''.join(phrase_letters_list)] = 0 # random text

In [5]:
phrases_dataset = pd.DataFrame.from_dict(phrases, orient='index').reset_index()

In [6]:
phrases_dataset.columns = ['text', 'label']

In [27]:
phrases_dataset.shape

(48974, 2)

In [7]:
phrases_dataset.head(10)

Unnamed: 0,text,label
0,בראשיתבראאלהים,1
1,איתאםלבהארבריש,0
2,אתהשמיםואת,1
3,אהאוםתיתמש,0
4,הארץוהארץהיתה,1
5,תיץההההרראאוץ,0
6,תהוובהווחשך,1
7,והושתחךוהבו,0
8,עלפניתהום,1
9,לפהוםתעינ,0


In [8]:
X = phrases_dataset['text']
y = phrases_dataset['label']

In [9]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=4, stratify=y)

In [26]:
len(x_train)

32812

In [28]:
len(x_test)

16162

In [29]:
len(x_train) + len(x_test)

48974

In [10]:
train_samples = x_train.tolist()
test_samples = x_test.tolist()

In [11]:
import numpy as np
from keras.preprocessing.text import Tokenizer

In [17]:
# We create a tokenizer, configured to only take
# into account the top-10000 most common words
#tokenizer = Tokenizer(num_words=10000, char_level=True) # char_level = True --> token by letters
tokenizer = Tokenizer(char_level=True) # char_level = True --> token by letters
# This builds the word index
tokenizer.fit_on_texts(train_samples)

# This turns strings into lists of integer indices.
train_sequences = tokenizer.texts_to_sequences(train_samples)
test_sequences = tokenizer.texts_to_sequences(test_samples)

# You could also directly get the one-hot binary representations.
# Note that other vectorization modes than one-hot encoding are supported!
x_train = tokenizer.texts_to_matrix(train_samples, mode='binary')
x_test = tokenizer.texts_to_matrix(test_samples, mode='binary')

# This is how you can recover the word index that was computed
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 27 unique tokens.


In [18]:
x_train

array([[0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [25]:
x_train.shape

(32812, 28)

In [19]:
len(x_train[0])

28

In [22]:
train_samples[0]

'ישחםתמאשותיחמ'

In [21]:
x_train[0]

array([0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [20]:
sum(x_train[0])

8.0

In [32]:
word_index

{'י': 1,
 'ו': 2,
 'ה': 3,
 'א': 4,
 'ל': 5,
 'ת': 6,
 'ר': 7,
 'ב': 8,
 'ש': 9,
 'מ': 10,
 'ע': 11,
 'ם': 12,
 'נ': 13,
 'כ': 14,
 'ח': 15,
 'ד': 16,
 'ק': 17,
 'ן': 18,
 'פ': 19,
 'ך': 20,
 'צ': 21,
 'ז': 22,
 'ג': 23,
 'ס': 24,
 'ט': 25,
 'ץ': 26,
 'ף': 27}

In [33]:
lens = [len(s) for s in train_sequences]
lens[:20]

[13, 9, 11, 9, 10, 11, 15, 13, 13, 14, 12, 10, 14, 14, 10, 13, 11, 8, 11, 10]

In [34]:
sum(lens)/len(lens)

11.49652566134341

In [35]:
train_sequences[:10]

[[1, 9, 15, 12, 6, 10, 4, 9, 2, 6, 1, 15, 10],
 [1, 3, 1, 3, 8, 2, 10, 2, 12],
 [7, 3, 9, 1, 22, 17, 12, 10, 14, 7, 9],
 [12, 14, 5, 2, 3, 6, 4, 27, 24],
 [17, 8, 7, 2, 4, 6, 4, 8, 1, 2],
 [8, 10, 24, 19, 7, 9, 10, 6, 10, 8, 18],
 [6, 9, 10, 11, 2, 18, 4, 6, 3, 10, 9, 19, 25, 1, 12],
 [4, 7, 3, 2, 15, 2, 10, 7, 17, 2, 4, 7, 26],
 [9, 5, 1, 7, 20, 14, 4, 1, 19, 2, 8, 5, 10],
 [3, 8, 10, 10, 2, 2, 8, 9, 4, 15, 5, 7, 12, 6]]

In [36]:
max_len = 12
x_train_embedding = []

for s in train_sequences:
  if len(s) >= 12:
    x_train_embedding.append(s[:12])
  else:
    x_train_embedding.append(s + [0]*(max_len-len(s)))

x_train_embedding = np.array(x_train_embedding)

In [37]:
x_test_embedding = []

for s in test_sequences:
  if len(s) >= 12:
    x_test_embedding.append(s[:12])
  else:
    x_test_embedding.append(s + [0]*(max_len-len(s)))

x_test_embedding = np.array(x_test_embedding)

In [38]:
y_train = y_train.to_numpy()

In [39]:
y_train

array([0, 1, 0, ..., 0, 0, 0])

In [40]:
x_train_embedding

array([[ 1,  9, 15, ...,  6,  1, 15],
       [ 1,  3,  1, ...,  0,  0,  0],
       [ 7,  3,  9, ...,  7,  9,  0],
       ...,
       [ 1,  8,  5, ..., 14,  0,  0],
       [10,  1,  9, ..., 13,  8,  0],
       [ 7,  3,  4, ..., 13,  0,  0]])

In [41]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import SimpleRNN, Dense
from keras.layers import LSTM
from keras.layers import Dropout

In [42]:
model = Sequential()
model.add(Embedding(10000, 8, input_length=max_len))
model.add(LSTM(64))
model.compile(optimizer='rmsprop',
              loss="binary_crossentropy",
              metrics=["accuracy"])
model.add(Dense(1, activation="sigmoid"))
model.summary()

history = model.fit(x_train_embedding, 
                    y_train,
                    epochs=50,
                    batch_size=64,
                    validation_split=0.33)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 12, 8)             80000     
                                                                 
 lstm (LSTM)                 (None, 64)                18688     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 98,753
Trainable params: 98,753
Non-trainable params: 0
_________________________________________________________________
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
E

In [43]:
x_train[0]

array([0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [44]:
predicted_labels = model.predict(x_test_embedding)



In [45]:
print(type(predicted_labels),type(y_test))

<class 'numpy.ndarray'> <class 'pandas.core.series.Series'>


In [46]:
predicted_labels

array([[0.999386  ],
       [0.8073914 ],
       [0.9961702 ],
       ...,
       [0.9842679 ],
       [0.02593355],
       [0.9952114 ]], dtype=float32)

In [47]:
my_list = map(lambda x: x[0], predicted_labels)
predictions = pd.Series(my_list)

In [48]:
len(predictions)

16162

In [49]:
len(y_test)

16162

In [50]:
predictions

0        0.999386
1        0.807391
2        0.996170
3        0.997528
4        0.957968
           ...   
16157    0.479061
16158    0.048900
16159    0.984268
16160    0.025934
16161    0.995211
Length: 16162, dtype: float32

In [51]:
y_test_ri = y_test.reset_index() 

In [52]:
y_test_ri

Unnamed: 0,index,label
0,15542,1
1,14953,0
2,20832,1
3,20498,1
4,1782,1
...,...,...
16157,26075,0
16158,27721,0
16159,36666,1
16160,41767,0


In [53]:
pd.concat([predictions, y_test_ri], axis=1)

Unnamed: 0,0,index,label
0,0.999386,15542,1
1,0.807391,14953,0
2,0.996170,20832,1
3,0.997528,20498,1
4,0.957968,1782,1
...,...,...,...
16157,0.479061,26075,0
16158,0.048900,27721,0
16159,0.984268,36666,1
16160,0.025934,41767,0


In [54]:
x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.33, random_state=4, stratify=y)

In [55]:
x_test_orig

15542      מאדםועדבהמה
14953       דךמיהךטטנו
20832      ואכלואתםאשר
20498    ומשחתאתםומלאת
1782        עשבנתתילכם
             ...      
26075    הנמוגיותלבאחה
27721      ישחתעאולללו
36666      הארץכיידעתי
41767     רתושוואיתאצמ
12782       מזקןלאיוכל
Name: text, Length: 16162, dtype: object

In [56]:
pd.concat([x_test_orig, y_test], axis=1)

Unnamed: 0,text,label
15542,מאדםועדבהמה,1
14953,דךמיהךטטנו,0
20832,ואכלואתםאשר,1
20498,ומשחתאתםומלאת,1
1782,עשבנתתילכם,1
...,...,...
26075,הנמוגיותלבאחה,0
27721,ישחתעאולללו,0
36666,הארץכיידעתי,1
41767,רתושוואיתאצמ,0


In [58]:
[l for l in 'ויאמריהוהאלמשה']

['ו', 'י', 'א', 'מ', 'ר', 'י', 'ה', 'ו', 'ה', 'א', 'ל', 'מ', 'ש', 'ה']

In [59]:
[word_index[l] for l in 'ויאמריהוהאלמשה']

[2, 1, 4, 10, 7, 1, 3, 2, 3, 4, 5, 10, 9, 3]

In [66]:
string = 'ויאמריהוהאלמשה'

In [67]:
predict_me = [[word_index[l] for l in string]]

In [68]:
model.predict(predict_me)



array([[0.9947397]], dtype=float32)