<a href="https://colab.research.google.com/github/mannmoshe/text-recognition/blob/main/text_recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# based on:
# https://deeplearningcourses.com/c/data-science-natural-language-processing-in-python 
# https://www.udemy.com/data-science-natural-language-processing-in-python Section 7 Cipher Decryption

# Author: Moshe Mann

import numpy as np
np.seterr(divide = 'ignore') 

import string
import random
import requests
import pandas as pd

In [2]:
for l in range(1488, 1515):
  print(l - 1488, chr(l))

0 א
1 ב
2 ג
3 ד
4 ה
5 ו
6 ז
7 ח
8 ט
9 י
10 ך
11 כ
12 ל
13 ם
14 מ
15 ן
16 נ
17 ס
18 ע
19 ף
20 פ
21 ץ
22 צ
23 ק
24 ר
25 ש
26 ת


In [3]:
### the language model

# initialize Markov matrix
# MM: matrix for the bi-gram probabilites
M = np.ones((27, 27))

# initial state distribution
# MM: vector for the uni-gram probabilites
# MM: uni-gram probabilites calculated only by first letters of the words
pi = np.zeros(27)

# a function to update the Markov matrix
def update_transition(ch1, ch2):
  # ord('א') = 1488, ord('ב') = 1489, ...
  i = ord(ch1) - 1488 # 'א' will mapped to 0, 'ב' will mapped to 1 and so on
  j = ord(ch2) - 1488
  M[i,j] += 1

# a function to update the initial state distribution
def update_pi(ch):
  i = ord(ch) - 1488
  pi[i] += 1

# get the log-probability of a word / token
def get_word_prob(word):
  # print("word:", word)
  i = ord(word[0]) - 1488
  logp = np.log(pi[i])

  for ch in word[1:]:
    j = ord(ch) - 1488
    logp += np.log(M[i, j]) # update prob
    i = j # update i to j

  return logp

# get the probability of a sequence of words
def get_sequence_prob(words):
  # if input is a string, split into an array of tokens
  if type(words) == str:
    words = words.split()

  logp = 0
  for word in words:
    logp += get_word_prob(word)
  return logp

In [4]:
# with open('torah_text.txt', encoding='ISO-8859-8') as f:
#     torah_text = f.read()
req = requests.get("https://raw.githubusercontent.com/mannmoshe/text-recognition/main/torah_heb.txt")
req.encoding = 'ISO-8859-8'
torah_text = req.text

In [5]:
torah_text[:50]

'בראשית ברא אלהים את השמים ואת הארץ והארץ היתה תהו '

מספר הפעמים שהאות א מופיעה שלא בסוף המילה

In [6]:
torah_text.count('א') - torah_text.count('א ')

23506

In [7]:
torah_text.count('אב')

973

In [8]:
tokens = torah_text.split()

for token in tokens:
  # update the model

  # first letter
  ch0 = token[0]
  update_pi(ch0)

  # other letters
  for ch1 in token[1:]:
    update_transition(ch0, ch1)
    ch0 = ch1

pi_initial_count = pi.copy()
M_initial_count = M.copy()

# normalize the probabilities
pi /= pi.sum()
M /= M.sum(axis=1, keepdims=True)

first letter distribution:

In [9]:
pi_initial_count

array([13180.,  6167.,   532.,   549.,  8730., 13784.,   634.,  1073.,
         251.,  5588.,     0.,  3434.,  6868.,     0.,  5387.,     0.,
        1443.,   292.,  3988.,     0.,  1022.,     0.,   515.,   793.,
         857.,  2739.,  2138.])

The probabilties vector of first letters in word:

In [10]:
pi

array([0.16482417, 0.0771222 , 0.00665299, 0.00686559, 0.10917413,
       0.17237757, 0.00792857, 0.01341854, 0.00313891, 0.06988145,
       0.        , 0.04294432, 0.08588865, 0.        , 0.06736782,
       0.        , 0.01804562, 0.00365164, 0.04987244, 0.        ,
       0.01278075, 0.        , 0.0064404 , 0.00991696, 0.01071732,
       0.03425291, 0.02673703])

In [11]:
pi.sum()

1.0

In [12]:
M_initial_count[0]

array([2.000e+00, 9.740e+02, 1.600e+01, 5.360e+02, 9.610e+02, 1.111e+03,
       1.030e+02, 1.098e+03, 4.000e+00, 1.073e+03, 1.010e+02, 4.900e+02,
       4.090e+03, 3.880e+02, 1.688e+03, 1.060e+02, 5.920e+02, 1.140e+02,
       5.500e+01, 5.300e+01, 1.450e+02, 2.000e+00, 5.600e+01, 3.200e+01,
       1.460e+03, 2.740e+03, 5.543e+03])

In [13]:
M_initial_count[0].sum()

23533.0

The probabilties matrix of bi-gram where the first letter is: א

In [14]:
M[0]

array([8.49870395e-05, 4.13886882e-02, 6.79896316e-04, 2.27765266e-02,
       4.08362725e-02, 4.72103004e-02, 4.37683253e-03, 4.66578847e-02,
       1.69974079e-04, 4.55955467e-02, 4.29184549e-03, 2.08218247e-02,
       1.73798496e-01, 1.64874857e-02, 7.17290613e-02, 4.50431309e-03,
       2.51561637e-02, 4.84426125e-03, 2.33714359e-03, 2.25215655e-03,
       6.16156036e-03, 8.49870395e-05, 2.37963711e-03, 1.35979263e-03,
       6.20405388e-02, 1.16432244e-01, 2.35541580e-01])

In [15]:
M.sum(axis=1)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [16]:
get_sequence_prob('וידבר יהוה אל משה לאמר')

-38.29143887391395

In [17]:
letters = [chr(l) for l in range(1488, 1515)]

random_texts = []
random_scores = []
for i in range(1000):
  random_text = ''.join(random.choice(letters) for i in range(5)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(2)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) 
  #print (random_text)
  random_scores.append(get_sequence_prob(random_text))
  random_texts.append(random_text)

print(len(random_scores))
print(random_texts[:10])
print(random_scores[:10])
print('average_score:', sum(random_scores) / len(random_scores))
print("\n")
fixed_random_scores = [value for value in random_scores if value != float('-inf')]
print(len(fixed_random_scores))
#print(fixed_random_scores)
print('average_score:', sum(fixed_random_scores) / len(fixed_random_scores))

1000
['סיפזף בקכי נג עןן ךשןו', 'מוןיע וףףל בן מףח משכז', 'ףשחדכ סינצ חט שיב חיהך', 'זרימש נוךח םי פכע תהיו', 'תגוטה טזיא זח גרצ ץםצט', 'צטחץט פרשר כת רןכ בחפג', 'וגזץל רךפי תף שפי יןבס', 'דצזףח כקלח קך רםד נטשנ', 'טץךלה טססט תח זנג תתבא', 'ךנפםר יףקפ ץכ כבז תךאנ']
[-inf, -66.29778985479442, -inf, -inf, -inf, -69.56070119495054, -71.86383833615629, -89.23945691763922, -80.2140267685829, -inf]
average_score: -inf


371
average_score: -73.31671746188324


get_sequence_prob('וידבר יהוה אל משה לאמר') is bigger than average score of a sentence with a same structure.

In [18]:
get_sequence_prob('ואהבתם את הגר')

-27.0614837117811

In [19]:
get_sequence_prob('ואהבת לרעך כמוך')

-42.744228980058665

In [20]:
get_sequence_prob('וילך משה ויאמר אל עמו הנה פרעה משלח אתכם ואת טפכם')

-101.24130268026462

In [21]:
letters = [chr(l) for l in range(1488, 1515)]

random_texts = []
random_scores = []
for i in range(1000):
  random_text = ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(5)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(2)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(3)) + ' ' \
                 + ''.join(random.choice(letters) for i in range(4)) 
  #print (random_text)
  random_scores.append(get_sequence_prob(random_text))
  random_texts.append(random_text)

print(len(random_scores))
print(random_texts[:10])
print(random_scores[:10])
print('average_score:', sum(random_scores) / len(random_scores))
print("\n")
fixed_random_scores = [value for value in random_scores if value != float('-inf')]
print(len(fixed_random_scores))
#print(fixed_random_scores)
print('average_score:', sum(fixed_random_scores) / len(fixed_random_scores))

1000
['תעןר עלך תפןחם שך דחמ עצח קהטי צחגח בץןת חטל יקטם', 'טרםפ עץצ קרץנז בא סךה פוי חחזע כבדף חההצ החס ץזןך', 'גצץן צגב תשסלן אפ כדס הפע זונכ חנלנ הירק ערצ חשאע', 'ןסקם תץו עןנקס גפ הםד לץץ סנרי מקבס ץתבף בום שךךכ', 'קעףפ אפד שזוזנ ץז זוכ ךןג הסץי טכמף יעןת למו כךקב', 'קושו יהו למכתר ןן נץג כלס ץדמץ מלשי וםתר גרם סאדז', 'ועדט בחנ פןלככ נת זךי גלח לוצג ייגן צנןו לשט יוכר', 'חטחץ זחה חשךיה כע קגם דהץ אחםם אולש ןיגת ךשע ולפן', 'שמזצ םשפ יפץדז הם לבף אזף צסבץ ךרבט וףקא סלש ץקוי', 'םראס ןוט סניסס תז עיו יכש שךלן וששל ץםנט םחצ צעסק']
[-151.09940887255425, -inf, -166.3051711840879, -inf, -inf, -inf, -148.82848298307516, -inf, -inf, -inf]
average_score: -inf


106
average_score: -160.63707152843685


In [22]:
torah_text[:50]

'בראשית ברא אלהים את השמים ואת הארץ והארץ היתה תהו '

In [23]:
torah_text_no_spaces = torah_text.strip().replace(" ", "").replace("  ", "")

In [24]:
print(torah_text_no_spaces[:50]+'\n')
print(torah_text_no_spaces[-50:]+'\n')
print(len(torah_text_no_spaces))

בראשיתבראאלהיםאתהשמיםואתהארץוהארץהיתהתהוובהווחשךעל

רצוולכלהידהחזקהולכלהמוראהגדולאשרעשהמשהלעיניכלישראל

304805


Check if spaces affect the probability:

In [25]:
get_sequence_prob('אבגד הוז חטיכ')

-33.55682673600577

In [26]:
get_sequence_prob('אבג דהו זחטיכ')

-38.65712258210688

In [27]:
torah_skips_2_s0 = ''
for i in range(0, len(torah_text_no_spaces), 2):
  torah_skips_2_s0 += torah_text_no_spaces[i]

In [28]:
torah_skips_2_s1 = ''
for i in range(1, len(torah_text_no_spaces), 2):
  torah_skips_2_s1 += torah_text_no_spaces[i]

In [29]:
torah_skips_2_s0[:100]

'באיבאליאהמםאהרואץיההוהושעפיהםרחלימחתלנהיוארליייווהאריאליאהוכטבידאהםיהוויהשוקאלילויםלשקאיהיירוהברואדי'

In [30]:
torah_skips_2_s1[:100]

'רשתראהםתשיותאץהרהתתובוחךלנתווואהםרפעפימםימאהםהארייווראהםתאריוובלליבןארבןחךיראהםארווחךרללוהעבייקיםחוא'

Trying recognize true text 20,15,10 characters long in skips of 2.

In [31]:
words_probs_20 = {}

for i in range(len(torah_skips_2_s0) - 20):
  word = torah_skips_2_s0[i:i + 20]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_20[word] = [get_sequence_prob(word), 'torah_skips_2_s0', i*2, i*2+20]

words_probs_20 = dict(sorted(words_probs_20.items(), key=lambda item: item[1], reverse=True))
print(len(words_probs_20))

141166


In [32]:
for i in range(len(torah_skips_2_s1) - 20):
  word = torah_skips_2_s1[i:i + 20]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_20[word] = [get_sequence_prob(word), 'torah_skips_2_s1', i*2+1, i*2+21]

words_probs_20 = dict(sorted(words_probs_20.items(), key=lambda item: item[1], reverse=True))
words_probs_20

{'ליהמויההויהיהאלויהיא': [-44.080390478392566,
  'torah_skips_2_s0',
  157534,
  157554],
 'המויההויהיהאלויהיארא': [-44.60303419247533,
  'torah_skips_2_s0',
  157538,
  157558],
 'יהמויההויהיהאלויהיאר': [-45.021464164451196,
  'torah_skips_2_s0',
  157536,
  157556],
 'ליאתויוהולקדבישאליאת': [-45.41255873375112,
  'torah_skips_2_s0',
  188302,
  188322],
 'אהויותתיויעלארבאלניו': [-46.76022612394564,
  'torah_skips_2_s1',
  119783,
  119803],
 'הדליהמויההויהיהאלויה': [-46.88423293791052,
  'torah_skips_2_s0',
  157530,
  157550],
 'אתויוהולקדבישאליאתמן': [-46.960840419463366,
  'torah_skips_2_s0',
  188306,
  188326],
 'יאתויוהולקדבישאליאתמ': [-47.297511668115966,
  'torah_skips_2_s0',
  188304,
  188324],
 'דליהמויההויהיהאלויהי': [-47.31333551426101,
  'torah_skips_2_s0',
  157532,
  157552],
 'אתעואתובתעשהובואתשרו': [-47.383628769784714,
  'torah_skips_2_s0',
  260362,
  260382],
 'אשחליאתויוהולקדבישאל': [-47.40586724870683,
  'torah_skips_2_s0',
  188296,
  188316],
 'ויהעואלארהלכימ

In [33]:
print(len(words_probs_20))

281088


In [34]:
words_probs_15 = {}

for i in range(len(torah_skips_2_s0) - 15):
  word = torah_skips_2_s0[i:i + 15]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_15[word] = [get_sequence_prob(word), 'torah_skips_2_s0', i*2, i*2+15]

words_probs_15 = dict(sorted(words_probs_15.items(), key=lambda item: item[1], reverse=True))
print(len(words_probs_15))

140526


In [35]:
for i in range(len(torah_skips_2_s1) - 15):
  word = torah_skips_2_s1[i:i + 15]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_15[word] = [get_sequence_prob(word), 'torah_skips_2_s1', i*2+1, i*2+16]

words_probs_15 = dict(sorted(words_probs_15.items(), key=lambda item: item[1], reverse=True))
words_probs_15

{'ויההויהיהאלויהי': [-31.04311741437636, 'torah_skips_2_s0', 157542, 157557],
 'הויהיהאלויהיארא': [-31.366486600140906, 'torah_skips_2_s0', 157548, 157563],
 'מויההויהיהאלויה': [-32.72823744075662, 'torah_skips_2_s0', 157540, 157555],
 'ליהמויההויהיהאל': [-32.99642270417015, 'torah_skips_2_s0', 157534, 157549],
 'המויההויהיהאלוי': [-33.13893741453231, 'torah_skips_2_s0', 157538, 157553],
 'יההויהיהאלויהיא': [-33.21636555022013, 'torah_skips_2_s0', 157544, 157559],
 'אתויוהולקדבישאל': [-33.34842869813899, 'torah_skips_2_s0', 188306, 188321],
 'ליאתויוהולקדביש': [-33.424862221158286, 'torah_skips_2_s0', 188302, 188317],
 'ההויהיהאלויהיאר': [-33.66542729060752, 'torah_skips_2_s0', 157546, 157561],
 'יהמויההויהיהאלו': [-33.802131764855886, 'torah_skips_2_s0', 157536, 157551],
 'ויותתיויעלארבאל': [-34.157021400903716, 'torah_skips_2_s1', 119787, 119802],
 'יואשרוהיאשושהוי': [-34.16608163030187, 'torah_skips_2_s0', 109278, 109293],
 'אתויאשאשהישביות': [-34.30572019775212, 'torah_skips_2_s1',

In [36]:
words_probs_10 = {}

for i in range(len(torah_skips_2_s0) - 10):
  word = torah_skips_2_s0[i:i + 10]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_10[word] = [get_sequence_prob(word), 'torah_skips_2_s0', i*2, i*2+10]

words_probs_10 = dict(sorted(words_probs_10.items(), key=lambda item: item[1], reverse=True))
print(len(words_probs_10))

138343


In [37]:
for i in range(len(torah_skips_2_s1) - 10):
  word = torah_skips_2_s1[i:i + 10]
  if get_sequence_prob(word) != float('-inf'):
    words_probs_10[word] = [get_sequence_prob(word), 'torah_skips_2_s1', i*2+1, i*2+11]

words_probs_10 = dict(sorted(words_probs_10.items(), key=lambda item: item[1], reverse=True))
words_probs_10

{'ויהיהאלויה': [-19.61681492093077, 'torah_skips_2_s0', 157550, 157560],
 'הויהיהאלוי': [-19.902389822197883, 'torah_skips_2_s0', 157548, 157558],
 'ויהוהמלהוא': [-20.482852073182308, 'torah_skips_2_s1', 173289, 173299],
 'יהיהאלויהי': [-20.800810962798323, 'torah_skips_2_s0', 157552, 157562],
 'היויאלהיוי': [-20.83320101181823, 'torah_skips_2_s0', 31956, 31966],
 'להואהאשרים': [-21.041452091205194, 'torah_skips_2_s0', 250968, 250978],
 'ויאתוישיני': [-21.08352422131772, 'torah_skips_2_s1', 23129, 23139],
 'ויההויהיהא': [-21.1473631617903, 'torah_skips_2_s0', 157542, 157552],
 'אברביוהוים': [-21.217753953515817, 'torah_skips_2_s1', 161739, 161749],
 'ליאתויוהול': [-21.27449144316211, 'torah_skips_2_s0', 188302, 188312],
 'אלאתותבנהו': [-21.27774931501384, 'torah_skips_2_s1', 180207, 180217],
 'ויויהחתואש': [-21.28037359550517, 'torah_skips_2_s0', 111968, 111978],
 'וימהוהותים': [-21.362448182518847, 'torah_skips_2_s1', 86927, 86937],
 'אתויוהולקד': [-21.38212408462773, 'torah_skips_2_s

In [38]:
print(len(words_probs_10))

272160


In [39]:
df_10 = pd.DataFrame.from_dict(words_probs_10, orient='index').reset_index()

In [40]:
df_10.head()

Unnamed: 0,index,0,1,2,3
0,ויהיהאלויה,-19.616815,torah_skips_2_s0,157550,157560
1,הויהיהאלוי,-19.90239,torah_skips_2_s0,157548,157558
2,ויהוהמלהוא,-20.482852,torah_skips_2_s1,173289,173299
3,יהיהאלויהי,-20.800811,torah_skips_2_s0,157552,157562
4,היויאלהיוי,-20.833201,torah_skips_2_s0,31956,31966


In [41]:
df_10.columns = ['string', 'log_prob', 'skip_start_from', 'i_start', 'i_end']
df_10['skip_start_from'] = df_10['skip_start_from'].map({'torah_skips_2_s0': 's0', 'torah_skips_2_s1': 's1'})
df_10

Unnamed: 0,string,log_prob,skip_start_from,i_start,i_end
0,ויהיהאלויה,-19.616815,s0,157550,157560
1,הויהיהאלוי,-19.902390,s0,157548,157558
2,ויהוהמלהוא,-20.482852,s1,173289,173299
3,יהיהאלויהי,-20.800811,s0,157552,157562
4,היויאלהיוי,-20.833201,s0,31956,31966
...,...,...,...,...,...
272155,אאצכעוחאאי,-56.380136,s1,69671,69681
272156,חאצשעעיעגת,-56.653045,s1,48343,48353
272157,אאהזחאגכמם,-57.753385,s1,174457,174467
272158,אאאץעראאץל,-59.978978,s0,242020,242030


In [42]:
from google.colab import files

df_10.to_csv('df_10.csv', index=False ,encoding = 'utf-8-sig')
files.download('df_10.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [43]:
seq_probs_4_3_4 = {}

for i in range(len(torah_skips_2_s0) - 11):
  seq = torah_skips_2_s0[i:i + 4] + ' ' + torah_skips_2_s0[i + 4:i + 7] + ' ' + torah_skips_2_s0[i + 7:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_4_3_4[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2, i*2+11]

seq_probs_4_3_4 = dict(sorted(seq_probs_4_3_4.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_4_3_4))
#seq_probs_4_3_4

121745


In [44]:
for i in range(len(torah_skips_2_s1) - 11):
  seq = torah_skips_2_s1[i:i + 4] + ' ' + torah_skips_2_s1[i + 4:i + 7] + ' ' + torah_skips_2_s1[i + 7:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_4_3_4[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2+1, i*2+12]

seq_probs_4_3_4 = dict(sorted(seq_probs_4_3_4.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_4_3_4))
seq_probs_4_3_4

239869


{'ויהי האל ויהי': [-21.009150050478798, 'torah_skips_2_s0', 157550, 157561],
 'ליהמ ויה הויה': [-21.95426659546559, 'torah_skips_2_s0', 157534, 157545],
 'אתוי והו לקדב': [-22.241857901050455, 'torah_skips_2_s0', 188306, 188317],
 'והוי ויא אראר': [-22.68778107426223, 'torah_skips_2_s0', 27833, 27844],
 'ויהי ארא אליב': [-22.827655243306175, 'torah_skips_2_s0', 157564, 157575],
 'ליהי ויה להיא': [-22.91593059126926, 'torah_skips_2_s0', 31544, 31555],
 'אויה והמ להוא': [-22.971899294678032, 'torah_skips_2_s0', 173287, 173298],
 'מויה הוי היהא': [-23.11474784361645, 'torah_skips_2_s0', 157540, 157551],
 'אתעי ואי ואתי': [-23.185675645003936, 'torah_skips_2_s0', 294281, 294292],
 'הויה יהא לויה': [-23.20413122101496, 'torah_skips_2_s0', 157548, 157559],
 'ויהו המל הואש': [-23.223285730864323, 'torah_skips_2_s0', 173289, 173300],
 'ואשר והי אשוש': [-23.316607616925758, 'torah_skips_2_s0', 109280, 109291],
 'היוי אלה יויא': [-23.348211959105885, 'torah_skips_2_s0', 31956, 31967],
 'היני אתו

In [45]:
df_4_3_4 = pd.DataFrame.from_dict(seq_probs_4_3_4, orient='index').reset_index()

In [46]:
df_4_3_4.columns = ['string', 'log_prob', 'skip_start_from', 'i_start', 'i_end']
df_4_3_4['skip_start_from'] = df_4_3_4['skip_start_from'].map({'torah_skips_2_s0': 's0', 'torah_skips_2_s1': 's1'})
df_4_3_4

Unnamed: 0,string,log_prob,skip_start_from,i_start,i_end
0,ויהי האל ויהי,-21.009150,s0,157550,157561
1,ליהמ ויה הויה,-21.954267,s0,157534,157545
2,אתוי והו לקדב,-22.241858,s0,188306,188317
3,והוי ויא אראר,-22.687781,s0,27833,27844
4,ויהי ארא אליב,-22.827655,s0,157564,157575
...,...,...,...,...,...
239864,אארל מכת אאאץ,-59.636435,s0,163115,163126
239865,דאפם שען פעחא,-59.833211,s0,129369,129380
239866,ללשץ שץל אץאא,-59.866799,s0,157260,157271
239867,רלטב דיי קאאץ,-60.014834,s0,304037,304048


In [47]:
from google.colab import files

df_4_3_4.to_csv('df_4_3_4.csv', index=False ,encoding = 'utf-8-sig')
files.download('df_4_3_4.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [48]:
seq_probs_3_3_5 = {}

for i in range(len(torah_skips_2_s0) - 11):
  seq = torah_skips_2_s0[i:i + 3] + ' ' + torah_skips_2_s0[i + 3:i + 6] + ' ' + torah_skips_2_s0[i + 6:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_3_3_5[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2, i*2+11]

seq_probs_3_3_5 = dict(sorted(seq_probs_3_3_5.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_3_3_5))
#seq_probs_3_3_5

121731


In [49]:
for i in range(len(torah_skips_2_s1) - 11):
  seq = torah_skips_2_s1[i:i + 3] + ' ' + torah_skips_2_s1[i + 3:i + 6] + ' ' + torah_skips_2_s1[i + 6:i + 11]
  if get_sequence_prob(seq) != float('-inf'):
    seq_probs_3_3_5[seq] = [get_sequence_prob(seq), 'torah_skips_2_s0', i*2+1, i*2+12]

seq_probs_3_3_5 = dict(sorted(seq_probs_3_3_5.items(), key=lambda item: item[1], reverse=True))
print(len(seq_probs_3_3_5))
seq_probs_3_3_5

239958


{'ויה הוי היהאל': [-21.23066638957605, 'torah_skips_2_s0', 157542, 157553],
 'הוי היה אלויה': [-21.394318694596574, 'torah_skips_2_s0', 157548, 157559],
 'ויה והמ להואש': [-22.02439391478428, 'torah_skips_2_s0', 173289, 173300],
 'יהמ ויה הויהי': [-22.0641931748499, 'torah_skips_2_s0', 157536, 157547],
 'יהי ויה להיאל': [-22.826895742993432, 'torah_skips_2_s0', 31546, 31557],
 'ויה יהא לויהי': [-22.982614881917705, 'torah_skips_2_s0', 157550, 157561],
 'הוי ויא ארארי': [-22.993833643096327, 'torah_skips_2_s0', 27835, 27846],
 'ארא איה יהוים': [-23.025927900290608, 'torah_skips_2_s0', 22468, 22479],
 'והו ואר הניהמ': [-23.105695327416395, 'torah_skips_2_s0', 204108, 204119],
 'יהי האל ויהיא': [-23.182398186322565, 'torah_skips_2_s0', 157552, 157563],
 'לאש יהא אשרוי': [-23.22712164642465, 'torah_skips_2_s0', 110378, 110389],
 'ותם הוי אשמהי': [-23.31934182534627, 'torah_skips_2_s0', 186562, 186573],
 'אלא ביש אמרוי': [-23.331390028520637, 'torah_skips_2_s0', 93939, 93950],
 'ואר ויו אמה

In [50]:
df_3_3_5 = pd.DataFrame.from_dict(seq_probs_3_3_5, orient='index').reset_index()

In [51]:
df_3_3_5.columns = ['string', 'log_prob', 'skip_start_from', 'i_start', 'i_end']
df_3_3_5['skip_start_from'] = df_3_3_5['skip_start_from'].map({'torah_skips_2_s0': 's0', 'torah_skips_2_s1': 's1'})
df_3_3_5

Unnamed: 0,string,log_prob,skip_start_from,i_start,i_end
0,ויה הוי היהאל,-21.230666,s0,157542,157553
1,הוי היה אלויה,-21.394319,s0,157548,157559
2,ויה והמ להואש,-22.024394,s0,173289,173300
3,יהמ ויה הויהי,-22.064193,s0,157536,157547
4,יהי ויה להיאל,-22.826896,s0,31546,31557
...,...,...,...,...,...
239953,כשצ אאא רםארל,-58.436474,s0,26392,26403
239954,אאר למכ תאאאץ,-58.595107,s0,163115,163126
239955,זחי חאע לחאאר,-58.857738,s0,153325,153336
239956,אאה זחא גכמםו,-59.761623,s0,174457,174468


In [52]:
from google.colab import files

df_3_3_5.to_csv('df_3_3_5.csv', index=False ,encoding = 'utf-8-sig')
files.download('df_3_3_5.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>