<a href="https://colab.research.google.com/github/mannmoshe/text-recognition/blob/main/text_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# based on:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python section 7 cipher
# https://www.udemy.com/data-science-natural-language-processing-in-python section 7 cipher

# Author: Moshe Mann

import numpy as np
np.seterr(divide = 'ignore') 

import string
import random
import requests

In [23]:
# for l in range(1488, 1515):
#   print(chr(l))

In [3]:
### the language model

# Letter-to-letter transition counts: one row and one column for each of the
# 27 code points starting at ord('א') = 1488. Every cell starts at 1
# (add-one smoothing), so no transition ends up with zero probability.
M = np.full((27, 27), 1.0)

# Counts of word-initial letters; these start at 0 (no smoothing).
pi = np.full(27, 0.0)

# a function to update the Markov matrix
def update_transition(ch1, ch2):
  """Count one observed transition from letter `ch1` to letter `ch2`.

  Adds 1 to the cell of the global matrix `M` whose row is `ch1` and whose
  column is `ch2`. Letters are indexed by their code point offset from
  ord('א') = 1488, so 'א' -> 0, 'ב' -> 1, ...

  Raises:
    IndexError: if either character maps outside the rows/columns of `M`.
      Code points just below 'א' (some Hebrew vowel points, for example)
      give small negative offsets; without this check NumPy would wrap them
      around to the end of the axis and count them under the wrong letter.
  """
  i = ord(ch1) - 1488
  j = ord(ch2) - 1488
  n_rows, n_cols = M.shape
  if not (0 <= i < n_rows and 0 <= j < n_cols):
    raise IndexError(
        f"characters {ch1!r}, {ch2!r} are outside the model alphabet")
  M[i, j] += 1

# a function to update the initial state distribution
def update_pi(ch):
  """Count one word that starts with the letter `ch`.

  Adds 1 to the entry of the global vector `pi` for `ch`, indexed by the
  code point offset from ord('א') = 1488.

  Raises:
    IndexError: if `ch` maps outside `pi`. Code points just below 'א' give
      small negative offsets; without this check NumPy would wrap them
      around and count them under the wrong letter.
  """
  i = ord(ch) - 1488
  if not 0 <= i < len(pi):
    raise IndexError(f"character {ch!r} is outside the model alphabet")
  pi[i] += 1

# get the log-probability of a word / token
def get_word_prob(word):
  """Return the log-probability of a single word / token.

  The score is log(pi[first letter]) plus log(M[previous, current]) for each
  following letter, where `pi` and `M` are the global initial-state and
  transition tables. Letters are indexed by their code point offset from
  ord('א') = 1488. A zero probability contributes -inf.

  Raises:
    IndexError: if `word` is empty, or if it contains a character that maps
      outside the tables. Code points just below 'א' give small negative
      offsets; without this check NumPy would wrap them around and score
      them as a different letter.
  """
  n_states = len(pi)
  indices = []
  for ch in word:
    idx = ord(ch) - 1488
    if not 0 <= idx < n_states:
      raise IndexError(
          f"character {ch!r} in {word!r} is outside the model alphabet")
    indices.append(idx)
  if not indices:
    raise IndexError("cannot score an empty word")

  i = indices[0]
  logp = np.log(pi[i])

  for j in indices[1:]:
    logp += np.log(M[i, j])  # add the log-probability of the step i -> j
    i = j  # the current letter becomes the previous one

  return logp

# get the probability of a sequence of words
def get_sequence_prob(words):
  """Return the log-probability of a sequence of words.

  Args:
    words: either a string, which is split on whitespace into tokens, or an
      iterable of already-split tokens.

  Returns:
    The sum of `get_word_prob` over all tokens; 0 when there are no tokens.
  """
  # isinstance rather than an exact type comparison, so that str subclasses
  # (e.g. numpy.str_) are tokenized too instead of being iterated one
  # character at a time, spaces included
  if isinstance(words, str):
    words = words.split()

  return sum(get_word_prob(word) for word in words)

In [4]:
# Download the training corpus. (Alternative: read a local copy with
# open('torah_text.txt', encoding='ISO-8859-8').)
req = requests.get(
    "https://raw.githubusercontent.com/mannmoshe/text-recognition/main/torah_text.txt",
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Fail here on an HTTP error instead of training the model on an error page.
req.raise_for_status()
# Decode the body as ISO-8859-8 rather than whatever requests would guess.
req.encoding = 'ISO-8859-8'
torah_text = req.text

In [5]:
# preview the first 50 characters of the decoded text
torah_text[:50]

'  בראשית ברא אלהים את השמים ואת הארץ והארץ היתה תה'

In [6]:
# Reset the counts in place so that this cell can be re-run safely: without
# this, a second run would add raw counts on top of the already-normalized
# probabilities. On a fresh kernel this is a no-op (M is all ones, pi all zeros).
M.fill(1)
pi.fill(0)

tokens = torah_text.split()

for token in tokens:
  # the first letter of each token feeds the initial-state counts
  ch0 = token[0]
  update_pi(ch0)

  # every following letter is a transition from the letter before it
  for ch1 in token[1:]:
    update_transition(ch0, ch1)
    ch0 = ch1

# normalize the counts into probabilities (pi sums to 1, each row of M sums to 1)
pi /= pi.sum()
M /= M.sum(axis=1, keepdims=True)

In [7]:
# normalized initial-state distribution: share of tokens starting with each letter
pi

array([0.16489595, 0.07711079, 0.00665741, 0.00687014, 0.10917145,
       0.17239179, 0.00793383, 0.01342744, 0.00314099, 0.06989025,
       0.        , 0.04297281, 0.08583299, 0.        , 0.06736244,
       0.        , 0.01804508, 0.00364155, 0.04984295, 0.        ,
       0.01278923, 0.        , 0.00644467, 0.00992354, 0.01072443,
       0.03423809, 0.0266922 ])

In [8]:
# row 0 of the normalized transition matrix: probabilities of the letter following 'א'
M[0]

array([8.50014875e-05, 4.13532237e-02, 6.80011900e-04, 2.27803987e-02,
       4.08432148e-02, 4.72183263e-02, 4.37757661e-03, 4.66658167e-02,
       1.70002975e-04, 4.56032981e-02, 4.29257512e-03, 2.08253644e-02,
       1.73828042e-01, 1.64902886e-02, 7.17412555e-02, 4.50507884e-03,
       2.51604403e-02, 4.84508479e-03, 2.33754091e-03, 2.25253942e-03,
       6.16260785e-03, 8.50014875e-05, 2.38004165e-03, 1.36002380e-03,
       6.20510859e-02, 1.16324536e-01, 2.35581623e-01])

In [14]:
# log-probability of a five-word sentence (word lengths 5, 4, 2, 3, 4)
get_sequence_prob('וידבר יהוה אל משה לאמר')

-38.291733346990895

In [16]:
# length of the same sentence: 18 letters plus 4 spaces
len('וידבר יהוה אל משה לאמר')

22

In [20]:
# Baseline: score random letter strings with the same word lengths as the
# five-word sentence scored above (5, 4, 2, 3, 4).
letters = [chr(code) for code in range(1488, 1515)]
word_lengths = [5, 4, 2, 3, 4]
n_samples = 1000

# dedicated, seeded generator so the experiment gives the same numbers on re-run
rng = random.Random(0)

random_texts = []
random_scores = []
for _ in range(n_samples):
  random_text = ' '.join(
      ''.join(rng.choice(letters) for _ in range(n)) for n in word_lengths)
  random_scores.append(get_sequence_prob(random_text))
  random_texts.append(random_text)

print(len(random_scores))
print(random_texts[:10])
print(random_scores[:10])
# any -inf score (a word starting with a letter whose pi entry is 0)
# drags this plain average down to -inf
print('average_score:', sum(random_scores) / len(random_scores))
print("\n")
# average again over the finite scores only
fixed_random_scores = [value for value in random_scores if value != float('-inf')]
print(len(fixed_random_scores))
if fixed_random_scores:
  print('average_score:', sum(fixed_random_scores) / len(fixed_random_scores))
else:
  # every sample scored -inf; avoid dividing by zero
  print('average_score:', float('nan'))

1000
['תךרטט שחסט וח שכי יוףר', 'שהטבף בשלא תן יספ ואטן', 'נגץלת טקככ ףר האל וףקצ', 'יוךבנ סראא חץ צלמ כקנך', 'בחדכה ףםרכ זס םהץ נטמכ', 'אפףופ הץםע חכ נמצ הקגח', 'פהזשה פכטמ ךה זגל ודכת', 'ךשתהא כךיד דץ אסע קקגע', 'צהןבצ חןאה קצ חץל רגןץ', 'ץןכפז קתנד גנ גדפ עאפד']
[-72.32711115224632, -65.9709281637593, -inf, -73.62320329853983, -inf, -82.00289436177977, -inf, -inf, -72.65773746183035, -inf]
average_score: -inf


370
average_score: -73.00430344971377


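The score of the real sentence, `get_sequence_prob('וידבר יהוה אל משה לאמר')` ≈ -38.3, is much higher than the average score of random strings of the same shape (18 letters in 5 words), which is about -73 over the random samples that have a finite score.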

In [21]:
# log-probability of a three-word phrase
get_sequence_prob('ואהבתם את הגר')

-27.056798620938007

In [22]:
# log-probability of another three-word phrase
get_sequence_prob('ואהבת לרעך כמוך')

-42.81038222930017

In [27]:
# log-probability of an 11-word sentence (word lengths 4, 3, 5, 2, 3, 3, 4, 4, 4, 3, 4)
get_sequence_prob('וילך משה ויאמר אל עמו הנה פרעה משלח אתכם ואת טפכם')

-101.2397427430866

In [28]:
# Baseline: score random letter strings with the same word lengths as the
# 11-word sentence scored above (4, 3, 5, 2, 3, 3, 4, 4, 4, 3, 4).
letters = [chr(code) for code in range(1488, 1515)]
word_lengths = [4, 3, 5, 2, 3, 3, 4, 4, 4, 3, 4]
n_samples = 1000

# dedicated, seeded generator so the experiment gives the same numbers on re-run
rng = random.Random(0)

random_texts = []
random_scores = []
for _ in range(n_samples):
  random_text = ' '.join(
      ''.join(rng.choice(letters) for _ in range(n)) for n in word_lengths)
  random_scores.append(get_sequence_prob(random_text))
  random_texts.append(random_text)

print(len(random_scores))
print(random_texts[:10])
print(random_scores[:10])
# any -inf score (a word starting with a letter whose pi entry is 0)
# drags this plain average down to -inf
print('average_score:', sum(random_scores) / len(random_scores))
print("\n")
# average again over the finite scores only
fixed_random_scores = [value for value in random_scores if value != float('-inf')]
print(len(fixed_random_scores))
if fixed_random_scores:
  print('average_score:', sum(fixed_random_scores) / len(fixed_random_scores))
else:
  # every sample scored -inf; avoid dividing by zero
  print('average_score:', float('nan'))

1000
['כתאת לנג סרזגל הש שםע לרה וגץה רמלק צסםח דעי דרףך', 'רנאג ץוו קןךקז תם חפנ טוץ הןםר כםךך יזךט ןצס יהסק', 'ףאוא ןעה הפנכם ךן שסמ יעט ירקכ נמפק מרנפ יךנ ראהט', 'עזךה חפל משדצק םנ ללו ןטנ פנזג ועץך לצתך ךםס טצשא', 'ךשעע לףס דלגטם שך בץץ חום םךנב לטכד בקפח צףי טםםת', 'פץהי חםמ ןסןרא תע ףאח זבג לוצו זתזפ ץףבמ גסף ןזקת', 'תליז פוו חנגאד הם גמץ וכפ בגהש בגתף שגזג יטה רכצט', 'ךהםח מתץ עהךאז קץ פהח בףמ דסתצ ךסתש קהרז שףר שהסי', 'פתמג נטא פעמצה שע כלצ טכף וזרן ףפיכ ךמרס וטל זמדי', 'דרנח וץצ ןעאץו ןה תשר ךןב תבקן תרקג םפשג פוק הסגק']
[-154.12234787372574, -inf, -inf, -inf, -inf, -inf, -162.50427635252774, -inf, -inf, -inf]
average_score: -inf


108
average_score: -159.6885760222407
