<a href="https://colab.research.google.com/github/mannmoshe/text-recognition/blob/main/text_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# based on:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python
# https://www.udemy.com/data-science-natural-language-processing-in-python

# Author: Moshe Mann

import numpy as np
import string
import random

In [3]:
for l in range(1488, 1515):
  print(chr(l))

א
ב
ג
ד
ה
ו
ז
ח
ט
י
ך
כ
ל
ם
מ
ן
נ
ס
ע
ף
פ
ץ
צ
ק
ר
ש
ת


In [4]:
### the language model

# initialize Markov matrix
M = np.ones((27, 27))

# initial state distribution
pi = np.zeros(27)

# a function to update the Markov matrix
def update_transition(ch1, ch2):
  # ord('א') = 1488, ord('ב') = 1489, ...
  i = ord(ch1) - 1488
  j = ord(ch2) - 1488
  M[i,j] += 1

# a function to update the initial state distribution
def update_pi(ch):
  i = ord(ch) - 1488
  pi[i] += 1

# get the log-probability of a word / token
def get_word_prob(word):
  # print("word:", word)
  i = ord(word[0]) - 1488
  logp = np.log(pi[i])

  for ch in word[1:]:
    j = ord(ch) - 1488
    logp += np.log(M[i, j]) # update prob
    i = j # update j

  return logp

# get the probability of a sequence of words
def get_sequence_prob(words):
  # if input is a string, split into an array of tokens
  if type(words) == str:
    words = words.split()

  logp = 0
  for word in words:
    logp += get_word_prob(word)
  return logp

In [6]:
with open('torah_text.txt', encoding='ISO-8859-8') as f:
    torah_text = f.read()

In [7]:
torah_text[:50]

'  בראשית ברא אלהים את השמים ואת הארץ והארץ היתה תה'

In [8]:
tokens = torah_text.split()

for token in tokens:
  # update the model

  # first letter
  ch0 = token[0]
  update_pi(ch0)

  # other letters
  for ch1 in token[1:]:
    update_transition(ch0, ch1)
    ch0 = ch1

# normalize the probabilities
pi /= pi.sum()
M /= M.sum(axis=1, keepdims=True)

In [9]:
pi

array([0.16489595, 0.07711079, 0.00665741, 0.00687014, 0.10917145,
       0.17239179, 0.00793383, 0.01342744, 0.00314099, 0.06989025,
       0.        , 0.04297281, 0.08583299, 0.        , 0.06736244,
       0.        , 0.01804508, 0.00364155, 0.04984295, 0.        ,
       0.01278923, 0.        , 0.00644467, 0.00992354, 0.01072443,
       0.03423809, 0.0266922 ])

In [10]:
M[26]

array([0.02937976, 0.02448313, 0.00685528, 0.00816104, 0.10010881,
       0.13949946, 0.00369967, 0.04722524, 0.00261153, 0.16441785,
       0.02840044, 0.03014146, 0.0171926 , 0.11675734, 0.02415669,
       0.04069641, 0.04047878, 0.00402612, 0.03166485, 0.00097933,
       0.00914037, 0.00032644, 0.00402612, 0.02143634, 0.03405876,
       0.0336235 , 0.03645267])

In [91]:
get_sequence_prob('וידבריהוהאלמשהלאמר')

-38.291733346990895

In [25]:
len('וידבריהוהאלמשהלאמר')

18

In [90]:
import random
import string
np.seterr(divide = 'ignore') 

# printing lowercase
letters = [chr(l) for l in range(1488, 1515)]

random_texts = []
random_scores = []
for i in range(1000):
  random_text = ''.join(random.choice(letters) for i in range(18)) # len('וידבריהוהאלמשהלאמר') equals 18
  #print (random_text)
  random_scores.append(get_sequence_prob(random_text))
  random_texts.append(random_text)

print(len(random_scores))
print(random_texts)
print(random_scores)
print('average_score:', sum(random_scores) / len(random_scores))
print("\n")
fixed_random_scores = [value for value in random_scores if value != float('-inf')]
print(len(fixed_random_scores))
print(fixed_random_scores)
print('average_score:', sum(fixed_random_scores) / len(fixed_random_scores))

1000
['דצאתגתלסלשקמהפץךקז', 'הסשאגחעגפלגשץלשצץש', 'תןהעיאהזץאסרצחזעטי', 'תןםחבךרךכנחקכנדכבק', 'לנזדחאףטקתץלצץטתץך', 'חלחקשןףןאהאסשףנןדף', 'שץיחחצמשמברהבטזעךף', 'רשןקצךההךצבכדןדאדנ', 'פולחךנףאאזהרגףדבגכ', 'שןגקקולואדיםךץעךחט', 'הרדשםבגץדחודץיטגרע', 'מףקישטזטןגצפדזםזטץ', 'ןנעדםקטתמדקתקבדדצה', 'פץפבתץפףאוףמתככבאז', 'זנרכדסטםעוךלנהעצקת', 'תגפוהטכזנדץרששחגסע', 'עסהמעףנלםבלאברחףשש', 'סזרילולגנמסחאדתעחצ', 'בימלקגטחספחרתגסדכמ', 'שחלנדטארהיחןמצגצאט', 'מוהשמוומאפטבאאללופ', 'שלןדחזטסךתציגיפזשה', 'הנפהקבגמחסספבםאןפמ', 'תםדאטןטםברםבףךדשכט', 'גסתץדץמימוצהחםצויכ', 'חתנזךלחךהלכמןדיםגש', 'צךמןאתחגגסןזףרהצסץ', 'יחקןתםברלשהאאחץצזע', 'וההץקטכחםפמכתאוחתכ', 'ינכשסנאשהףתשנכדגךא', 'טיץעץצמףאפףבמףזאךר', 'שצאיגעכנלפכריצריךן', 'רךקאאגףודכףסביץהדל', 'גכותםעזיבקרלרךרשבג', 'פלץלבצהםנףץדשנסחנס', 'רנפזזסוםקסעהךתלסזה', 'תבכיהזכעףםןבפםהפבפ', 'צבןפאעמרעקבהפבעץךק', 'כןפכשגטקמעסשנגהנתס', 'הגוזאץטטץפהאוץעטהן', 'געתףהפעספמיפכרךאטע', 'דרצםגפוגטךעסץץפאתך', 'פבפץםברטףסךתוקרבגה', 'ואזנףךהשלוינזכשסצב', 'ווםתאמוכאףיןרשיאבה', 'יךו

get_sequence_prob('וידבריהוהאלמשהלאמר') is bigger than average score of a random 18 characters-long string.

In [79]:
get_sequence_prob('ואהבתם את הגר')

-27.056798620938007

In [80]:
get_sequence_prob('ואהבת לרעך כמוך')

-42.81038222930017