<a href="https://colab.research.google.com/github/mannmoshe/text-recognition/blob/main/text_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# based on:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python 
# https://www.udemy.com/data-science-natural-language-processing-in-python Section 7 Cipher Decryption

# Author: Moshe Mann

import numpy as np
np.seterr(divide = 'ignore') 

import string
import random
import requests
import pandas as pd

In [13]:
for l in range(1488, 1515):
  print(l, l - 1488, chr(l))

1488 0 א
1489 1 ב
1490 2 ג
1491 3 ד
1492 4 ה
1493 5 ו
1494 6 ז
1495 7 ח
1496 8 ט
1497 9 י
1498 10 ך
1499 11 כ
1500 12 ל
1501 13 ם
1502 14 מ
1503 15 ן
1504 16 נ
1505 17 ס
1506 18 ע
1507 19 ף
1508 20 פ
1509 21 ץ
1510 22 צ
1511 23 ק
1512 24 ר
1513 25 ש
1514 26 ת


In [25]:
### the language model

# initialize Markov matrix
# MM: matrix for the bi-gram probabilites
M = np.ones((22, 22))

# initial state distribution
# MM: vector for the uni-gram probabilites
# MM: uni-gram probabilites calculated only by first letters of the words
pi = np.zeros(22)

# a function to update the Markov matrix
def update_transition(ch1, ch2):
  # ord('א') = 1488, ord('ב') = 1489, ...
  if ord(ch1) <= 1497:
    i = ord(ch1) - 1488 # 'א' will mapped to 0, 'ב' will mapped to 1 and so on
  elif ord(ch1) <= 1500:
    i = ord(ch1) - 1489
  elif ord(ch1) <= 1502:
    i = ord(ch1) - 1490
  elif ord(ch1) <= 1506:
    i = ord(ch1) - 1491
  elif ord(ch1) <= 1508:
    i = ord(ch1) - 1492
  else:
    i = ord(ch1) - 1493

  if ord(ch2) <= 1497:
    j = ord(ch2) - 1488 
  elif ord(ch2) <= 1500:
    j = ord(ch2) - 1489
  elif ord(ch2) <= 1502:
    j = ord(ch2) - 1490
  elif ord(ch2) <= 1506:
    j = ord(ch2) - 1491
  elif ord(ch2) <= 1508:
    j = ord(ch2) - 1492
  else:
    j = ord(ch2) - 1493

  M[i,j] += 1

# a function to update the initial state distribution
def update_pi(ch):
  if ord(ch) <= 1497:
    i = ord(ch) - 1488 
  elif ord(ch) <= 1500:
    i = ord(ch) - 1489
  elif ord(ch) <= 1502:
    i = ord(ch) - 1490
  elif ord(ch) <= 1506:
    i = ord(ch) - 1491
  elif ord(ch) <= 1508:
    i = ord(ch) - 1492
  else:
    i = ord(ch) - 1493

  pi[i] += 1

# get the log-probability of a word / token
def get_word_prob(word):
  # print("word:", word)
  if ord(word[0]) <= 1497:
    i = ord(word[0]) - 1488 
  elif ord(word[0]) <= 1500:
    i = ord(word[0]) - 1489
  elif ord(word[0]) <= 1502:
    i = ord(word[0]) - 1490
  elif ord(word[0]) <= 1506:
    i = ord(word[0]) - 1491
  elif ord(word[0]) <= 1508:
    i = ord(word[0]) - 1492
  else:
    i = ord(word[0]) - 1493

  logp = np.log(pi[i])

  for ch in word[1:]:
    if ord(ch) <= 1497:
      j = ord(ch) - 1488 
    elif ord(ch) <= 1500:
      j = ord(ch) - 1489
    elif ord(ch) <= 1502:
      j = ord(ch) - 1490
    elif ord(ch) <= 1506:
      j = ord(ch) - 1491
    elif ord(ch) <= 1508:
      j = ord(ch) - 1492
    else:
      j = ord(ch) - 1493
    logp += np.log(M[i, j]) # update prob
    i = j # update i to j

  return logp

# get the probability of a sequence of words
def get_sequence_prob(words):
  # if input is a string, split into an array of tokens
  if type(words) == str:
    words = words.split()

  logp = 0
  for word in words:
    logp += get_word_prob(word)
  return logp

In [26]:
# with open('torah_text.txt', encoding='ISO-8859-8') as f:
#     torah_text = f.read()
req = requests.get("https://raw.githubusercontent.com/mannmoshe/text-recognition/main/torah_heb.txt")
req.encoding = 'ISO-8859-8'
torah_text = req.text

In [27]:
torah_text[:50]

'בראשית ברא אלהים את השמים ואת הארץ והארץ היתה תהו '

In [28]:
torah_text = torah_text.replace('ם','מ').replace('ן','נ').replace('ץ','צ').replace('ף','פ').replace('ך','כ')

In [29]:
torah_text[:50]

'בראשית ברא אלהימ את השמימ ואת הארצ והארצ היתה תהו '

In [30]:
torah_text_no_spaces = torah_text.replace(' ', '')

In [31]:
torah_text_no_spaces[:50]

'בראשיתבראאלהימאתהשמימואתהארצוהארצהיתהתהוובהווחשכעל'

In [32]:
len(torah_text_no_spaces)

304805

מספר הפעמים שהאות א מופיעה שלא בסוף המילה

In [33]:
torah_text.count('א') - torah_text.count('א ')

23506

In [34]:
torah_text.count('אב')

973

In [35]:
tokens = torah_text.split()

for token in tokens:
  # update the model

  # first letter
  ch0 = token[0]
  update_pi(ch0)

  # other letters
  for ch1 in token[1:]:
    update_transition(ch0, ch1)
    ch0 = ch1

pi_initial_count = pi.copy()
M_initial_count = M.copy()

# normalize the probabilities
pi /= pi.sum()
M /= M.sum(axis=1, keepdims=True)

first letter distribution:

In [36]:
pi_initial_count

array([13180.,  6167.,   532.,   549.,  8730., 13784.,   634.,  1073.,
         251.,  5588.,  3434.,  6868.,  5387.,  1443.,   292.,  3988.,
        1022.,   515.,   793.,   857.,  2739.,  2138.])

The probabilties vector of first letters in word:

In [37]:
pi

array([0.16482417, 0.0771222 , 0.00665299, 0.00686559, 0.10917413,
       0.17237757, 0.00792857, 0.01341854, 0.00313891, 0.06988145,
       0.04294432, 0.08588865, 0.06736782, 0.01804562, 0.00365164,
       0.04987244, 0.01278075, 0.0064404 , 0.00991696, 0.01071732,
       0.03425291, 0.02673703])

In [38]:
pi.sum()

1.0000000000000002

In [39]:
M_initial_count[0]

array([2.000e+00, 9.740e+02, 1.600e+01, 5.360e+02, 9.610e+02, 1.111e+03,
       1.030e+02, 1.098e+03, 4.000e+00, 1.073e+03, 5.900e+02, 4.090e+03,
       2.075e+03, 6.970e+02, 1.140e+02, 5.500e+01, 1.970e+02, 5.700e+01,
       3.200e+01, 1.460e+03, 2.740e+03, 5.543e+03])

In [40]:
M_initial_count[0].sum()

23528.0

The probabilties matrix of bi-gram where the first letter is: א

In [41]:
M[0]

array([8.50051003e-05, 4.13974838e-02, 6.80040802e-04, 2.27813669e-02,
       4.08449507e-02, 4.72203332e-02, 4.37776267e-03, 4.66678001e-02,
       1.70010201e-04, 4.56052363e-02, 2.50765046e-02, 1.73835430e-01,
       8.81927916e-02, 2.96242775e-02, 4.84529072e-03, 2.33764026e-03,
       8.37300238e-03, 2.42264536e-03, 1.36008160e-03, 6.20537232e-02,
       1.16456987e-01, 2.35591635e-01])

In [42]:
M.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.])

In [43]:
get_sequence_prob('וידבר יהוה אל משה לאמר')

-38.08075613561019

In [44]:
letters = [chr(l) for l in range(1488, 1515)]

random_texts = []
random_scores = []
for i in range(1000):
  random_text = ''.join(random.choice(letters) for i in range(5)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(2)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) 
  #print (random_text)
  random_scores.append(get_sequence_prob(random_text))
  random_texts.append(random_text)

print(len(random_scores))
print(random_texts[:10])
print(random_scores[:10])
print('average_score:', sum(random_scores) / len(random_scores))
print("\n")
fixed_random_scores = [value for value in random_scores if value != float('-inf')]
print(len(fixed_random_scores))
#print(fixed_random_scores)
print('average_score:', sum(fixed_random_scores) / len(fixed_random_scores))

1000
['עקזזת חחסת פט וצך גסלל', 'קסץםע ברבס כץ ךגט חמכז', 'זתגמך בזאד רם אככ חכבך', 'נבוטע תםתח ךב מגר ךטדף', 'סחהםו דחבץ ץר ךמק ךפןמ', 'שףכיצ רהאס זח חכח הןדת', 'ץסנאב מפננ יש גרכ נחדל', 'דדףףץ גרעס וס ךהף לךקר', 'עהבחי תשטש בח פאו תןאפ', 'נךאםק נךזב ןמ נעי רכנכ']
[-78.76061497113783, -70.15947862869092, -66.74211357771917, -67.38412695332403, -69.7908248305291, -68.95308167531728, -61.6216349017445, -69.05754787578073, -63.51701105288446, -56.79361307542721]
average_score: -68.79265595790893


1000
average_score: -68.79265595790893


get_sequence_prob('וידבר יהוה אל משה לאמר') is bigger than average score of a sentence with a same structure.

In [45]:
get_sequence_prob('ואהבתם את הגר')

-28.972525732208393

In [46]:
get_sequence_prob('ואהבת לרעך כמוך')

-35.08222597210341

In [47]:
get_sequence_prob('וילך משה ויאמר אל עמו הנה פרעה משלח אתכם ואת טפכם')

-95.7389730547296

In [48]:
letters = [chr(l) for l in range(1488, 1515)]

random_texts = []
random_scores = []
for i in range(1000):
  random_text = ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(5)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(2)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) 
  #print (random_text)
  random_scores.append(get_sequence_prob(random_text))
  random_texts.append(random_text)

print(len(random_scores))
print(random_texts[:10])
print(random_scores[:10])
print('average_score:', sum(random_scores) / len(random_scores))
print("\n")
fixed_random_scores = [value for value in random_scores if value != float('-inf')]
print(len(fixed_random_scores))
#print(fixed_random_scores)
print('average_score:', sum(fixed_random_scores) / len(fixed_random_scores))

1000
['ענףע גבק סדאעכ גי נדי םםך וקמץ חךהנ פטקד ותש וחךץ', 'ץהרב הסץ אתםצט זט וסע ףףם תתטפ עסוך פםעמ םזר טטגפ', 'טמדז העם טעשטח גק דףס כדך בבהר ןתשי יךאת פהא חןקך', 'טיצצ ץזך םקיעמ שד מרה גוט פזףז תצבג לצגו לסב השץח', 'טןסץ בהך דטזהכ פך ףסז חיא טאץך םףפצ חםץד ןלפ ףכצפ', 'וךטפ סלל ךטננמ ףצ הץח שדך ההץכ העתט נדגד םךג ףאזף', 'יחףז גוד עתפיח ןח הןף רפכ זסקר נואט ההףת געכ אאגץ', 'כםתי שץמ יעץצג וף מעץ הממ טםןי בפדח לפגץ עהן ולפע', 'ךצצמ צבע לךרספ טח נמש ףעך עחםה האהק מןשי משה רסןפ', 'וחףג ףוס יםםדר הת פטי עאת דזץא וצתז דפקש ףדר םסףג']
[-148.19915780556218, -155.14555719747287, -147.2225963840797, -149.58377229067332, -152.76880080217387, -144.20180712077808, -157.02750409373337, -136.59485842419855, -141.25372996807195, -157.84873910473107]
average_score: -148.69888593184297


1000
average_score: -148.69888593184297


In [49]:
torah_text[:50]

'בראשית ברא אלהימ את השמימ ואת הארצ והארצ היתה תהו '

In [50]:
torah_text_no_spaces = torah_text.strip().replace(" ", "").replace("  ", "")

In [51]:
print(torah_text_no_spaces[:50]+'\n')
print(torah_text_no_spaces[-50:]+'\n')
print(len(torah_text_no_spaces))

בראשיתבראאלהימאתהשמימואתהארצוהארצהיתהתהוובהווחשכעל

רצוולכלהידהחזקהולכלהמוראהגדולאשרעשהמשהלעיניכלישראל

304805


Check if spaces affect the probability:

In [52]:
get_sequence_prob('אבגד הוז חטיכ')

-32.62795532970536

In [53]:
get_sequence_prob('אבג דהו זחטיכ')

-37.72760701347405

In [54]:
torah_skips_2_s0 = ''
for i in range(0, len(torah_text_no_spaces), 2):
  torah_skips_2_s0 += torah_text_no_spaces[i]

In [55]:
torah_skips_2_s1 = ''
for i in range(1, len(torah_text_no_spaces), 2):
  torah_skips_2_s1 += torah_text_no_spaces[i]

In [56]:
torah_skips_2_s0[:100]

'באיבאליאהממאהרואציההוהושעפיהמרחלימחתלנהיוארליייווהאריאליאהוכטבידאהמיהוויהשוקאלילוימלשקאיהיירוהברואדי'

In [57]:
torah_skips_2_s1[:100]

'רשתראהמתשיותאצהרהתתובוחכלנתווואהמרפעפיממימאהמהארייווראהמתאריוובלליבנארבנחכיראהמארווחכרללוהעבייקימחוא'

Trying recognize true text 20,15,10 characters long in skips of 2.

In [58]:
words_probs_20 = {}

for i in range(len(torah_skips_2_s0) - 20):
  word = torah_skips_2_s0[i:i + 20]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_20[word] = [get_sequence_prob(word), 'torah_skips_2_s0', i*2, i*2+20]

words_probs_20 = dict(sorted(words_probs_20.items(), key=lambda item: item[1], reverse=True))
print(len(words_probs_20))

151007


In [59]:
for i in range(len(torah_skips_2_s1) - 20):
  word = torah_skips_2_s1[i:i + 20]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_20[word] = [get_sequence_prob(word), 'torah_skips_2_s1', i*2+1, i*2+21]

words_probs_20 = dict(sorted(words_probs_20.items(), key=lambda item: item[1], reverse=True))
words_probs_20

{'ליהמויההויהיהאלויהיא': [-43.39100076161104,
  'torah_skips_2_s0',
  157534,
  157554],
 'המויההויהיהאלויהיארא': [-43.91352412422244,
  'torah_skips_2_s0',
  157538,
  157558],
 'יהמויההויהיהאלויהיאר': [-44.332191175606525,
  'torah_skips_2_s0',
  157536,
  157556],
 'אתויוהולקדבישאליאתמנ': [-44.444345360472404,
  'torah_skips_2_s0',
  188306,
  188326],
 'ותוימולמראשמנכיאשריו': [-44.652950197508325,
  'torah_skips_2_s0',
  50324,
  50344],
 'ואיריבכלהאלכלהוארוני': [-44.994050665761634,
  'torah_skips_2_s0',
  54740,
  54760],
 'ויהעואלארהלכימויהאמש': [-45.016618628157396,
  'torah_skips_2_s0',
  97962,
  97982],
 'השרימלאלכימרתיימאימש': [-45.35247503649432,
  'torah_skips_2_s0',
  47868,
  47888],
 'ליאתויוהולקדבישאליאת': [-45.4057574869526,
  'torah_skips_2_s0',
  188302,
  188322],
 'הימתומיואימתותוהנימת': [-45.46338002224253,
  'torah_skips_2_s0',
  216196,
  216216],
 'הוימיואמהוימהירואואנ': [-45.49838266042691,
  'torah_skips_2_s1',
  214879,
  214899],
 'וימולמראשמנכיאשריויל': 

In [60]:
print(len(words_probs_20))

300891


In [61]:
words_probs_15 = {}

for i in range(len(torah_skips_2_s0) - 15):
  word = torah_skips_2_s0[i:i + 15]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_15[word] = [get_sequence_prob(word), 'torah_skips_2_s0', i*2, i*2+15]

words_probs_15 = dict(sorted(words_probs_15.items(), key=lambda item: item[1], reverse=True))
print(len(words_probs_15))

150323


In [62]:
for i in range(len(torah_skips_2_s1) - 15):
  word = torah_skips_2_s1[i:i + 15]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_15[word] = [get_sequence_prob(word), 'torah_skips_2_s1', i*2+1, i*2+16]

words_probs_15 = dict(sorted(words_probs_15.items(), key=lambda item: item[1], reverse=True))
words_probs_15

{'ויההויהיהאלויהי': [-31.039627496516065, 'torah_skips_2_s0', 157542, 157557],
 'הויהיהאלויהיארא': [-31.36285747888547, 'torah_skips_2_s0', 157548, 157563],
 'הוימיואמהוימהיר': [-32.25722839460868, 'torah_skips_2_s1', 214879, 214894],
 'ליהמויההויהיהאל': [-32.30828286116755, 'torah_skips_2_s0', 157534, 157549],
 'בכלהאלכלהוארוני': [-32.31889433956828, 'torah_skips_2_s0', 54750, 54765],
 'ואימתותוהנימתוי': [-32.3962929842486, 'torah_skips_2_s0', 216210, 216225],
 'המויההויהיהאלוי': [-32.45078375645626, 'torah_skips_2_s0', 157538, 157553],
 'ארויואימימהנימש': [-32.528149355371134, 'torah_skips_2_s0', 214242, 214257],
 'ויוישיוהשישהנימ': [-32.71777806385703, 'torah_skips_2_s0', 45528, 45543],
 'וימיואמהוימהירו': [-32.71834615866266, 'torah_skips_2_s1', 214881, 214896],
 'מויההויהיהאלויה': [-32.72469921074394, 'torah_skips_2_s0', 157540, 157555],
 'ואלאהמימיואמהאל': [-33.010277214699975, 'torah_skips_2_s0', 89580, 89595],
 'עהוימיואמהוימהי': [-33.040337634351495, 'torah_skips_2_s1', 214877

In [63]:
words_probs_10 = {}

for i in range(len(torah_skips_2_s0) - 10):
  word = torah_skips_2_s0[i:i + 10]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_10[word] = [get_sequence_prob(word), 'torah_skips_2_s0', i*2, i*2+10]

words_probs_10 = dict(sorted(words_probs_10.items(), key=lambda item: item[1], reverse=True))
print(len(words_probs_10))

148003


In [64]:
for i in range(len(torah_skips_2_s1) - 10):
  word = torah_skips_2_s1[i:i + 10]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_10[word] = [get_sequence_prob(word), 'torah_skips_2_s1', i*2+1, i*2+11]

words_probs_10 = dict(sorted(words_probs_10.items(), key=lambda item: item[1], reverse=True))
words_probs_10

{'וימהוהותימ': [-19.35016073586796, 'torah_skips_2_s1', 86927, 86937],
 'ויהיהאלויה': [-19.61463533113735, 'torah_skips_2_s0', 157550, 157560],
 'ויהוהמלהוא': [-19.795910867160075, 'torah_skips_2_s1', 173289, 173299],
 'הויהיהאלוי': [-19.90011711111929, 'torah_skips_2_s0', 157548, 157558],
 'כלהאלכלהוא': [-20.108641137730658, 'torah_skips_2_s0', 54752, 54762],
 'אתמהוהאשרב': [-20.346834035777487, 'torah_skips_2_s1', 288207, 288217],
 'וימיואמהוי': [-20.508117577987633, 'torah_skips_2_s1', 214881, 214891],
 'ותוהנימתוי': [-20.53264518744649, 'torah_skips_2_s0', 216220, 216230],
 'והניהעימרא': [-20.53998821176989, 'torah_skips_2_s1', 216297, 216307],
 'וימתהוהימה': [-20.57601406996584, 'torah_skips_2_s0', 23188, 23198],
 'ויאתוישיני': [-20.63997889810845, 'torah_skips_2_s1', 23129, 23139],
 'הואהאשרימר': [-20.708308266785515, 'torah_skips_2_s0', 250970, 250980],
 'ברביוהוימש': [-20.76127436468196, 'torah_skips_2_s1', 161741, 161751],
 'יהיהאלויהי': [-20.798552066793178, 'torah_skips_2_s0

In [65]:
print(len(words_probs_10))

291390


In [66]:
df_10 = pd.DataFrame.from_dict(words_probs_10, orient='index').reset_index()

In [67]:
df_10.head()

Unnamed: 0,index,0,1,2,3
0,וימהוהותימ,-19.350161,torah_skips_2_s1,86927,86937
1,ויהיהאלויה,-19.614635,torah_skips_2_s0,157550,157560
2,ויהוהמלהוא,-19.795911,torah_skips_2_s1,173289,173299
3,הויהיהאלוי,-19.900117,torah_skips_2_s0,157548,157558
4,כלהאלכלהוא,-20.108641,torah_skips_2_s0,54752,54762


In [68]:
df_10.columns = ['string', 'log_prob', 'skip_start_from', 'i_start', 'i_end']
df_10['skip_start_from'] = df_10['skip_start_from'].map({'torah_skips_2_s0': 's0', 'torah_skips_2_s1': 's1'})
df_10

Unnamed: 0,string,log_prob,skip_start_from,i_start,i_end
0,וימהוהותימ,-19.350161,s1,86927,86937
1,ויהיהאלויה,-19.614635,s0,157550,157560
2,ויהוהמלהוא,-19.795911,s1,173289,173299
3,הויהיהאלוי,-19.900117,s0,157548,157558
4,כלהאלכלהוא,-20.108641,s0,54752,54762
...,...,...,...,...,...
291385,שחאצשעעיעג,-55.623760,s1,48341,48351
291386,סאאאאשאיוט,-55.778467,s0,77534,77544
291387,דאאצשעבזיז,-56.181992,s1,457,467
291388,ההעחאושחאא,-56.285469,s1,147061,147071


In [69]:
from google.colab import files

df_10.to_csv('df_10.csv', index=False ,encoding = 'utf-8-sig')
files.download('df_10.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [70]:
seq_probs_4_3_4 = {}

for i in range(len(torah_skips_2_s0) - 11):
  seq = torah_skips_2_s0[i:i + 4] + ' ' + torah_skips_2_s0[i + 4:i + 7] + ' ' + torah_skips_2_s0[i + 7:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_4_3_4[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2, i*2+11]

seq_probs_4_3_4 = dict(sorted(seq_probs_4_3_4.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_4_3_4))
#seq_probs_4_3_4

148798


In [71]:
for i in range(len(torah_skips_2_s1) - 11):
  seq = torah_skips_2_s1[i:i + 4] + ' ' + torah_skips_2_s1[i + 4:i + 7] + ' ' + torah_skips_2_s1[i + 7:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_4_3_4[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2+1, i*2+12]

seq_probs_4_3_4 = dict(sorted(seq_probs_4_3_4.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_4_3_4))
seq_probs_4_3_4

293823


{'וימי ואמ הוימ': [-20.9157127483038, 'torah_skips_2_s0', 214881, 214892],
 'ויהי האל ויהי': [-21.007206557510173, 'torah_skips_2_s0', 157550, 157561],
 'ליהמ ויה הויה': [-21.26777718906711, 'torah_skips_2_s0', 157534, 157545],
 'וימר וני לוימ': [-21.325388615844457, 'torah_skips_2_s0', 55127, 55138],
 'יוימ הוה ותימ': [-22.05955149040956, 'torah_skips_2_s0', 86925, 86936],
 'לכימ ויה אמשמ': [-22.219309008751463, 'torah_skips_2_s0', 97982, 97993],
 'אתוי והו לקדב': [-22.23787219598838, 'torah_skips_2_s0', 188306, 188317],
 'אויה והמ להוא': [-22.285387012860557, 'torah_skips_2_s0', 173287, 173298],
 'והמר הימ אמהו': [-22.371823534781914, 'torah_skips_2_s0', 126705, 126716],
 'המשר וימ ולעל': [-22.386039344551772, 'torah_skips_2_s0', 84070, 84081],
 'ויהו המל הואש': [-22.536678497679958, 'torah_skips_2_s0', 173289, 173300],
 'בימי ואמ הנימ': [-22.60001319567042, 'torah_skips_2_s0', 101355, 101366],
 'אלתמ אתע ואתו': [-22.673520023170468, 'torah_skips_2_s0', 260354, 260365],
 'והוי ויא אר

In [72]:
df_4_3_4 = pd.DataFrame.from_dict(seq_probs_4_3_4, orient='index').reset_index()

In [73]:
df_4_3_4.columns = ['string', 'log_prob', 'skip_start_from', 'i_start', 'i_end']
df_4_3_4['skip_start_from'] = df_4_3_4['skip_start_from'].map({'torah_skips_2_s0': 's0', 'torah_skips_2_s1': 's1'})
df_4_3_4

Unnamed: 0,string,log_prob,skip_start_from,i_start,i_end
0,וימי ואמ הוימ,-20.915713,s0,214881,214892
1,ויהי האל ויהי,-21.007207,s0,157550,157561
2,ליהמ ויה הויה,-21.267777,s0,157534,157545
3,וימר וני לוימ,-21.325389,s0,55127,55138
4,יוימ הוה ותימ,-22.059551,s0,86925,86936
...,...,...,...,...,...
293818,דאפמ שענ פעחא,-58.471181,s0,129369,129380
293819,חאצש עעי עגתע,-58.635491,s0,48343,48354
293820,אאלו דאה זחאא,-59.281813,s0,122904,122915
293821,שצשצ לאצ קהאא,-60.352882,s0,157197,157208


In [74]:
from google.colab import files

df_4_3_4.to_csv('df_4_3_4.csv', index=False ,encoding = 'utf-8-sig')
files.download('df_4_3_4.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [75]:
seq_probs_3_3_5 = {}

for i in range(len(torah_skips_2_s0) - 11):
  seq = torah_skips_2_s0[i:i + 3] + ' ' + torah_skips_2_s0[i + 3:i + 6] + ' ' + torah_skips_2_s0[i + 6:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_3_3_5[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2, i*2+11]

seq_probs_3_3_5 = dict(sorted(seq_probs_3_3_5.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_3_3_5))
#seq_probs_3_3_5

148798


In [76]:
for i in range(len(torah_skips_2_s1) - 11):
  seq = torah_skips_2_s1[i:i + 3] + ' ' + torah_skips_2_s1[i + 3:i + 6] + ' ' + torah_skips_2_s1[i + 6:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_3_3_5[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2+1, i*2+12]

seq_probs_3_3_5 = dict(sorted(seq_probs_3_3_5.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_3_3_5))
seq_probs_3_3_5

293823


{'וימ הוה ותימי': [-21.15380859213986, 'torah_skips_2_s0', 86927, 86938],
 'ויה הוי היהאל': [-21.228722896607422, 'torah_skips_2_s0', 157542, 157553],
 'ויה והמ להואש': [-21.337881632966806, 'torah_skips_2_s0', 173289, 173300],
 'יהמ ויה הויהי': [-21.377736435448632, 'torah_skips_2_s0', 157536, 157547],
 'הוי היה אלויה': [-21.392342534630735, 'torah_skips_2_s0', 157548, 157559],
 'והו ואר הניהמ': [-21.96231900676115, 'torah_skips_2_s0', 204108, 204119],
 'אתמ הוה אשרבה': [-22.133804691050727, 'torah_skips_2_s0', 288207, 288218],
 'אלא וימ הימהש': [-22.2420804997386, 'torah_skips_2_s0', 96543, 96554],
 'וימ יוא מהוימ': [-22.71239400395598, 'torah_skips_2_s0', 214881, 214892],
 'אלו לימ והנוא': [-22.779331552376988, 'torah_skips_2_s0', 80511, 80522],
 'יהי ויה להיאל': [-22.824933398101038, 'torah_skips_2_s0', 31546, 31557],
 'ותמ הוי אשמהי': [-22.825321271582773, 'torah_skips_2_s0', 186562, 186573],
 'ישו ואת הוהימ': [-22.82860523748467, 'torah_skips_2_s0', 17534, 17545],
 'ארא איה יהוימ

In [77]:
df_3_3_5 = pd.DataFrame.from_dict(seq_probs_3_3_5, orient='index').reset_index()

In [78]:
df_3_3_5.columns = ['string', 'log_prob', 'skip_start_from', 'i_start', 'i_end']
df_3_3_5['skip_start_from'] = df_3_3_5['skip_start_from'].map({'torah_skips_2_s0': 's0', 'torah_skips_2_s1': 's1'})
df_3_3_5

Unnamed: 0,string,log_prob,skip_start_from,i_start,i_end
0,וימ הוה ותימי,-21.153809,s0,86927,86938
1,ויה הוי היהאל,-21.228723,s0,157542,157553
2,ויה והמ להואש,-21.337882,s0,173289,173300
3,יהמ ויה הויהי,-21.377736,s0,157536,157547
4,הוי היה אלויה,-21.392343,s0,157548,157559
...,...,...,...,...,...
293818,כשצ אאא רמארל,-57.656298,s0,26392,26403
293819,דאא רשג דאהחח,-58.153371,s0,162999,163010
293820,זחי חאע לחאאר,-58.851950,s0,153325,153336
293821,דאא צשע בזיזע,-60.408593,s0,457,468


In [79]:
from google.colab import files

df_3_3_5.to_csv('df_3_3_5.csv', index=False ,encoding = 'utf-8-sig')
files.download('df_3_3_5.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>