# Goal: generate a dataset for GPT (BHSA)


In [1]:
from tfob import TFOb, BHSA, DSS

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


In [2]:
import pandas as pd
from itertools import chain

### 1. Create a list of the occurrences from BHSA

#### 1.1 List of wanted motion verbs

In [3]:
# Lexeme identifiers of the motion verbs to extract; they are matched
# against the `lex` feature when the occurrences are selected.
motion_verbs = [
    "BW>[", "HLK[", "CWB[", "<LH[",
    "CLX[", "JY>[", "JRD[", "<BR[",
    "NGC[", "QRB[", "NWS[", "CLK[",
    ">SP[", "NPL[", "QBY[", "LQX[",
    "SWR[", "GLH[", "QHL[",
]

#### 1.2 List of occurrences from MT Isaiah containing a motion verb

In [4]:
# All BHSA word nodes, narrowed to the book of Isaiah and then to the words
# whose `lex` value is one of the wanted motion verbs.
verbs_bhsa = TFOb.all("word", BHSA).filter(book="Isaiah").filter_in(lex=motion_verbs)

In [5]:
# TESTING AREA
# Scratch cell for spot-checking individual nodes by id.
# 212418 is the verb_id listed for Isaiah 1:23 in the dataset displayed further down.

#verbs_bhsa
#TFOb(215297, BHSA).to_verses.text
TFOb(212418, BHSA).to_verses.pretty()

#### 1.3 Generate a list of verses from the list of verbs

In [6]:
# One entry per motion-verb occurrence: the words of the verse containing it.
# A verse that holds several motion verbs is therefore appended several times.
verses_bhsa = []

# Iterate the collection directly: wrapping a single iterable in
# itertools.chain() yields exactly the same elements, and the later loops in
# this notebook already iterate `verbs_bhsa` this way.
for verb in verbs_bhsa:
    verse = verb.to_verses.to_words
    verses_bhsa.append(verse)

In [7]:
# Sanity check: the list holds one entry per verb occurrence, so this is the
# number of motion-verb occurrences rather than the number of distinct verses.
print(len(verses_bhsa))

571


### Create the dataset: all occurrences of the verbs we are interested in, with or without complements

We need to have both occurrences with and without complements so the model does not believe that there is **always** a complement. This comes in handy when we want the model to analyse the verses from the Dead Sea Scrolls.

In [8]:
# Create a dataset with the occurrences

def find_complements(verb):
    """Return the complement phrases of the clause that contains ``verb``.

    The phrases of the verb's clause are filtered on ``function="Cmpl"``.
    The filtered collection is returned as-is, never ``None``; callers in
    this notebook detect "no complement" by testing the result for
    truthiness.
    """
    return verb.to_clauses.to_phrases.filter(function="Cmpl")


items = []  # one record per (verb occurrence, complement) pair

for verb in verbs_bhsa:
    # Texts of the complement phrases found in the verb's clause. A single
    # empty string stands in when there are none, so that every verb still
    # produces at least one row.
    complements = find_complements(verb)
    if complements:
        complement_texts = [complement.text for complement in complements]
    else:
        complement_texts = [""]

    # These values are the same for every row of a given verb, so they are
    # looked up once per verb instead of once per complement.
    verb_info = {
        "verb_id": verb.ids[0],
        "book": verb.book[0],
        "chapter": verb.chapter[0],
        "verse_num": verb.verse[0],
        "verse_heb": verb.to_verses.text,
        "verb_heb": verb.text,
        "clause_heb": verb.to_clauses.text,
    }

    for complement_text in complement_texts:
        # "cmpl_heb" is added last to keep the original column order.
        items.append({**verb_info, "cmpl_heb": complement_text})

In [9]:
# One row per record in `items`. Missing complements are already stored as ""
# when the records are built; fillna("") additionally blanks any other
# missing value (presumably a safeguard — verify whether any can occur).
df = pd.DataFrame(items).fillna("")

In [10]:
df

Unnamed: 0,verb_id,book,chapter,verse_num,verse_heb,verb_heb,clause_heb,cmpl_heb
0,212256,Isaiah,1,12,כִּ֣י תָבֹ֔אוּ לֵרָאֹ֖ות פָּנָ֑י מִי־בִקֵּ֥שׁ ...,תָבֹ֔אוּ,כִּ֣י תָבֹ֔אוּ,
1,212269,Isaiah,1,13,לֹ֣א תֹוסִ֗יפוּ הָבִיא֙ מִנְחַת־שָׁ֔וְא קְטֹ֧ר...,הָבִיא֙,הָבִיא֙ מִנְחַת־שָׁ֔וְא,
2,212315,Isaiah,1,16,רַחֲצוּ֙ הִזַּכּ֔וּ הָסִ֛ירוּ רֹ֥עַ מַעַלְלֵיכ...,הָסִ֛ירוּ,הָסִ֛ירוּ רֹ֥עַ מַעַלְלֵיכֶ֖ם מִנֶּ֣גֶד עֵינָ֑י,מִנֶּ֣גֶד עֵינָ֑י
3,212333,Isaiah,1,18,לְכוּ־נָ֛א וְנִוָּֽכְחָ֖ה יֹאמַ֣ר יְהוָ֑ה אִם־...,לְכוּ־,לְכוּ־נָ֛א,
4,212418,Isaiah,1,23,שָׂרַ֣יִךְ סֹורְרִ֗ים וְחַבְרֵי֙ גַּנָּבִ֔ים כ...,יָבֹ֥וא,וְרִ֥יב אַלְמָנָ֖ה לֹֽא־יָבֹ֥וא אֲלֵיהֶֽם׃ פ,אֲלֵיהֶֽם׃ פ
...,...,...,...,...,...,...,...,...
573,234909,Isaiah,66,20,וְהֵבִ֣יאוּ אֶת־כָּל־אֲחֵיכֶ֣ם מִכָּל־הַגֹּויִ...,יָבִיאוּ֩,כַּאֲשֶׁ֣ר יָבִיאוּ֩ בְנֵ֨י יִשְׂרָאֵ֧ל אֶת־הַ...,בֵּ֥ית יְהוָֽה׃
574,234923,Isaiah,66,21,וְגַם־מֵהֶ֥ם אֶקַּ֛ח לַכֹּהֲנִ֥ים לַלְוִיִּ֖ם ...,אֶקַּ֛ח,וְגַם־מֵהֶ֥ם אֶקַּ֛ח לַכֹּהֲנִ֥ים לַלְוִיִּ֖ם,גַם־מֵהֶ֥ם
575,234923,Isaiah,66,21,וְגַם־מֵהֶ֥ם אֶקַּ֛ח לַכֹּהֲנִ֥ים לַלְוִיִּ֖ם ...,אֶקַּ֛ח,וְגַם־מֵהֶ֥ם אֶקַּ֛ח לַכֹּהֲנִ֥ים לַלְוִיִּ֖ם,לַכֹּהֲנִ֥ים לַלְוִיִּ֖ם
576,234970,Isaiah,66,23,וְהָיָ֗ה מִֽדֵּי־חֹ֨דֶשׁ֙ בְּחָדְשֹׁ֔ו וּמִדֵּ...,יָבֹ֧וא,מִֽדֵּי־חֹ֨דֶשׁ֙ בְּחָדְשֹׁ֔ו וּמִדֵּ֥י שַׁבָּ...,


In [11]:
df[(df.cmpl_heb != "")]

Unnamed: 0,verb_id,book,chapter,verse_num,verse_heb,verb_heb,clause_heb,cmpl_heb
2,212315,Isaiah,1,16,רַחֲצוּ֙ הִזַּכּ֔וּ הָסִ֛ירוּ רֹ֥עַ מַעַלְלֵיכ...,הָסִ֛ירוּ,הָסִ֛ירוּ רֹ֥עַ מַעַלְלֵיכֶ֖ם מִנֶּ֣גֶד עֵינָ֑י,מִנֶּ֣גֶד עֵינָ֑י
4,212418,Isaiah,1,23,שָׂרַ֣יִךְ סֹורְרִ֗ים וְחַבְרֵי֙ גַּנָּבִ֔ים כ...,יָבֹ֥וא,וְרִ֥יב אַלְמָנָ֖ה לֹֽא־יָבֹ֥וא אֲלֵיהֶֽם׃ פ,אֲלֵיהֶֽם׃ פ
5,212437,Isaiah,1,25,וְאָשִׁ֤יבָה יָדִי֙ עָלַ֔יִךְ וְאֶצְרֹ֥ף כַּבּ...,אָשִׁ֤יבָה,וְאָשִׁ֤יבָה יָדִי֙ עָלַ֔יִךְ,עָלַ֔יִךְ
11,212577,Isaiah,2,3,וְֽהָלְכ֞וּ עַמִּ֣ים רַבִּ֗ים וְאָמְרוּ֙ לְכ֣ו...,נַעֲלֶ֣ה,וְנַעֲלֶ֣ה אֶל־הַר־יְהוָ֗ה אֶל־בֵּית֙ אֱלֹהֵ֣י...,אֶל־הַר־יְהוָ֗ה אֶל־בֵּית֙ אֱלֹהֵ֣י יַעֲקֹ֔ב
12,212590,Isaiah,2,3,וְֽהָלְכ֞וּ עַמִּ֣ים רַבִּ֗ים וְאָמְרוּ֙ לְכ֣ו...,נֵלְכָ֖ה,וְנֵלְכָ֖ה בְּאֹרְחֹתָ֑יו,בְּאֹרְחֹתָ֑יו
...,...,...,...,...,...,...,...,...
571,234834,Isaiah,66,19,וְשַׂמְתִּ֨י בָהֶ֜ם אֹ֗ות וְשִׁלַּחְתִּ֣י מֵהֶ...,שִׁלַּחְתִּ֣י,וְשִׁלַּחְתִּ֣י מֵהֶ֣ם׀ פְּ֠לֵיטִים אֶֽל־הַגֹּ...,אֶֽל־הַגֹּויִ֞ם תַּרְשִׁ֨ישׁ פּ֥וּל וְל֛וּד מֹ...
572,234871,Isaiah,66,20,וְהֵבִ֣יאוּ אֶת־כָּל־אֲחֵיכֶ֣ם מִכָּל־הַגֹּויִ...,הֵבִ֣יאוּ,וְהֵבִ֣יאוּ אֶת־כָּל־אֲחֵיכֶ֣ם מִכָּל־הַגֹּויִ...,עַ֣ל הַ֥ר קָדְשִׁ֛י יְרוּשָׁלִַ֖ם
573,234909,Isaiah,66,20,וְהֵבִ֣יאוּ אֶת־כָּל־אֲחֵיכֶ֣ם מִכָּל־הַגֹּויִ...,יָבִיאוּ֩,כַּאֲשֶׁ֣ר יָבִיאוּ֩ בְנֵ֨י יִשְׂרָאֵ֧ל אֶת־הַ...,בֵּ֥ית יְהוָֽה׃
574,234923,Isaiah,66,21,וְגַם־מֵהֶ֥ם אֶקַּ֛ח לַכֹּהֲנִ֥ים לַלְוִיִּ֖ם ...,אֶקַּ֛ח,וְגַם־מֵהֶ֥ם אֶקַּ֛ח לַכֹּהֲנִ֥ים לַלְוִיִּ֖ם,גַם־מֵהֶ֥ם


In [12]:
# NOTE(review): written with pandas' default index column, whereas the
# isaiah_gpt_*.csv exports below pass index=False — confirm this is intended.
df.to_csv("gpt_bhsa_df.csv")

### Handle the verses where the same verb occurs several times

In [13]:
# Function which takes a verb and returns a number indicating what occ it is in the verse and None if it is the only occ

def verb_occurrence(verb):
    """Rank ``verb`` among the identical forms of its lexeme in its verse.

    The words of the verb's verse are filtered down to those sharing the
    verb's lexeme; among them, the words whose text equals the verb's text
    are counted in iteration order.

    Returns:
        The 1-based position of ``verb`` among those identical forms when
        the form occurs more than once in the verse. ``None`` when the form
        occurs only once, or when ``verb`` itself does not compare equal to
        any of the counted words.
    """
    verse = verb.to_verses
    verb_text = verb.text
    same_lexeme = verse.to_words.filter(lex=verb.lex[0])

    # Bound up front: previously this name was only assigned inside the loop,
    # so a count above 1 without an equality match raised UnboundLocalError.
    occurrence = None
    number_of_occ = 0
    for vrb in same_lexeme:
        if vrb.to_verses == verse and vrb.text == verb_text:
            number_of_occ += 1
            if vrb == verb:
                occurrence = number_of_occ

    if number_of_occ > 1:
        return occurrence
    return None

In [14]:
#for verb in verbs_bhsa:
    #occurrence = verb_occurrence(verb)
    #if occurrence > 1:
        #break

In [15]:
TFOb(212269, BHSA).text

'הָבִיא֙ '

In [54]:
TFOb(212333, BHSA).to_verses.pretty()

In [17]:
# NOTE(review): `verb` is not assigned in this cell; it is whatever an earlier
# cell left bound (e.g. the last item of a `for verb in ...` loop), so the
# output depends on execution order.
print(verb.book[0], verb.chapter[0], verb.verse[0])
print(verb.to_verses.text, verb.to_clauses.text, verb.text, sep="\n")

Isaiah 66 24
וְיָצְא֣וּ וְרָא֔וּ בְּפִגְרֵי֙ הָאֲנָשִׁ֔ים הַפֹּשְׁעִ֖ים בִּ֑י כִּ֣י תֹולַעְתָּ֞ם לֹ֣א תָמ֗וּת וְאִשָּׁם֙ לֹ֣א תִכְבֶּ֔ה וְהָי֥וּ דֵרָאֹ֖ון לְכָל־בָּשָֽׂר׃ 
וְיָצְא֣וּ 
יָצְא֣וּ 


In [18]:
# Inspect one occurrence by node id: reference, verse, clause and verb text.
verb = TFOb(223763, BHSA)
print(verb.book[0], verb.chapter[0], verb.verse[0])
print(verb.to_verses.text, verb.to_clauses.text, verb.text, sep="\n")

Isaiah 36 2
וַיִּשְׁלַ֣ח מֶֽלֶךְ־אַשּׁ֣וּר׀ אֶת־רַב־שָׁקֵ֨ה מִלָּכִ֧ישׁ יְרוּשָׁלְַ֛מָה אֶל־הַמֶּ֥לֶךְ חִזְקִיָּ֖הוּ בְּחֵ֣יל כָּבֵ֑ד וַֽיַּעֲמֹ֗ד בִּתְעָלַת֙ הַבְּרֵכָ֣ה הָעֶלְיֹונָ֔ה בִּמְסִלַּ֖ת שְׂדֵ֥ה כֹובֵֽס׃ 
וַיִּשְׁלַ֣ח מֶֽלֶךְ־אַשּׁ֣וּר׀ אֶת־רַב־שָׁקֵ֨ה מִלָּכִ֧ישׁ יְרוּשָׁלְַ֛מָה אֶל־הַמֶּ֥לֶךְ חִזְקִיָּ֖הוּ בְּחֵ֣יל כָּבֵ֑ד 
יִּשְׁלַ֣ח 


In [19]:
# Inspect a second occurrence by node id; the last line is its
# `g_cons_utf8` value.
verb2 = TFOb(231302, BHSA)
print(verb2.book[0], verb2.chapter[0], verb2.verse[0])
print(
    verb2.to_verses.text,
    verb2.to_clauses.text,
    verb2.text,
    verb2.g_cons_utf8[0],
    sep="\n",
)

Isaiah 55 13
תַּ֤חַת הַֽנַּעֲצוּץ֙ יַעֲלֶ֣ה בְרֹ֔ושׁ וְתַ֥חַת הַסִּרְפַּ֖ד יַעֲלֶ֣ה הֲדַ֑ס וְהָיָ֤ה לַֽיהוָה֙ לְשֵׁ֔ם לְאֹ֥ות עֹולָ֖ם לֹ֥א יִכָּרֵֽת׃ ס 
וְתַ֥חַת הַסִּרְפַּ֖ד יַעֲלֶ֣ה הֲדַ֑ס 
יַעֲלֶ֣ה 
יעלה


In [20]:
# Parallel lists: questions[i] is the prompt for the verb with id verb_ids[i].
questions, verb_ids = [], []

for verb in verbs_bhsa:
    occurrence = verb_occurrence(verb)
    # The occurrence number is only mentioned when verb_occurrence() reports
    # that the same form appears more than once in the verse.
    occ_indication = f" (occurrence number {occurrence})" if occurrence else ""

    questions.append(
        f"The verse is: {verb.to_verses.text}. The motion verb is: {verb.text}{occ_indication}."
    )
    verb_ids.append(verb.ids[0])

In [46]:
# Examples generator


def _phrase_sentence(phrases, function, found_template, missing_sentence):
    """Build the sentence describing one phrase function of a clause.

    ``phrases`` is the phrase collection of the clause and ``function`` the
    phrase-function label to filter on (e.g. ``"Cmpl"``). When the matching
    phrases have a non-empty text, it is substituted into ``found_template``
    (which must contain one ``{}`` placeholder); otherwise
    ``missing_sentence`` is returned unchanged.
    """
    text = phrases.filter(function=function).text
    return found_template.format(text) if text else missing_sentence


# Parallel lists, one entry per verb occurrence.
verbs_ex = []
verb_id_ex = []
clauses_ex = []
cmpls_ex = []
objs_ex = []
subjs_ex = []
questions_ex = []

for verb in verbs_bhsa:
    verb_id = verb.ids[0]
    clause = verb.to_clauses
    phrases = clause.to_phrases

    # Every sentence ends with ". " so the concatenated prompt keeps a space
    # between consecutive sentences (the first two used to end with a bare
    # ".", which glued them to the sentence that followed).
    verse_heb = f"The verse is {verb.to_verses.text}. "
    verb_heb = f"The verb I am interested in is {verb.text}. "
    clause_heb = f"The clause of this verb is {clause.text}. "

    cmpl = _phrase_sentence(
        phrases,
        "Cmpl",
        "The complement of the verb is {}. ",
        "There is no complement to the verb in this clause. ",
    )
    subj = _phrase_sentence(
        phrases,
        "Subj",
        "The subject of the verb is {}. ",
        "There is no explicit subject to the verb in this clause. ",
    )
    objc = _phrase_sentence(
        phrases,
        "Objc",
        "The object to the verb is {}. ",
        "There is no object to the verb in this clause. ",
    )

    question_ex = verse_heb + verb_heb + clause_heb + subj + objc + cmpl

    verbs_ex.append(verb)
    verb_id_ex.append(verb_id)
    clauses_ex.append(clause_heb)
    cmpls_ex.append(cmpl)
    objs_ex.append(objc)
    subjs_ex.append(subj)
    questions_ex.append(question_ex)

In [48]:
# Assemble the per-verb lists built by the examples generator into one table.
questions_dataset = pd.DataFrame(
    {
        "Verb_ID": verb_id_ex,
        "Questions": questions_ex,
        "Complements": cmpls_ex,
        "Objects": objs_ex,
        "Subjects": subjs_ex,
    }
)

In [49]:
questions_dataset

Unnamed: 0,Verb_ID,Questions,Complements,Objects,Subjects
0,212256,The verse is כִּ֣י תָבֹ֔אוּ לֵרָאֹ֖ות פָּנָ֑י ...,There is no complement to the verb in this cla...,There is no object to the verb in this clause.,There is no explicit subject to the verb in th...
1,212269,The verse is לֹ֣א תֹוסִ֗יפוּ הָבִיא֙ מִנְחַת־ש...,There is no complement to the verb in this cla...,The object to the verb is מִנְחַת־שָׁ֔וְא .,There is no explicit subject to the verb in th...
2,212315,The verse is רַחֲצוּ֙ הִזַּכּ֔וּ הָסִ֛ירוּ רֹ֥...,The complement of the verb is מִנֶּ֣גֶד עֵינָ֑...,The object to the verb is רֹ֥עַ מַעַלְלֵיכֶ֖ם .,There is no explicit subject to the verb in th...
3,212333,The verse is לְכוּ־נָ֛א וְנִוָּֽכְחָ֖ה יֹאמַ֣ר...,There is no complement to the verb in this cla...,There is no object to the verb in this clause.,There is no explicit subject to the verb in th...
4,212418,The verse is שָׂרַ֣יִךְ סֹורְרִ֗ים וְחַבְרֵי֙ ...,The complement of the verb is אֲלֵיהֶֽם׃ פ .,There is no object to the verb in this clause.,The subject of the verb is רִ֥יב אַלְמָנָ֖ה .
...,...,...,...,...,...
566,234871,The verse is וְהֵבִ֣יאוּ אֶת־כָּל־אֲחֵיכֶ֣ם מִ...,The complement of the verb is עַ֣ל הַ֥ר קָדְשׁ...,The object to the verb is אֶת־כָּל־אֲחֵיכֶ֣ם .,There is no explicit subject to the verb in th...
567,234909,The verse is וְהֵבִ֣יאוּ אֶת־כָּל־אֲחֵיכֶ֣ם מִ...,The complement of the verb is בֵּ֥ית יְהוָֽה׃ .,The object to the verb is אֶת־הַמִּנְחָ֛ה .,The subject of the verb is בְנֵ֨י יִשְׂרָאֵ֧ל .
568,234923,The verse is וְגַם־מֵהֶ֥ם אֶקַּ֛ח לַכֹּהֲנִ֥ים...,The complement of the verb is גַם־מֵהֶ֥ם לַכֹּ...,There is no object to the verb in this clause.,There is no explicit subject to the verb in th...
569,234970,The verse is וְהָיָ֗ה מִֽדֵּי־חֹ֨דֶשׁ֙ בְּחָדְ...,There is no complement to the verb in this cla...,There is no object to the verb in this clause.,The subject of the verb is כָל־בָּשָׂ֛ר .


In [50]:
# NOTE(review): exported with the default index column, unlike the
# isaiah_gpt_*.csv files below (index=False) — confirm which is wanted.
questions_dataset.to_csv("questions_dataset.csv")

In [30]:
# Small sample export: only the first 20 rows of the examples table.
questions_dataset.head(20).to_csv("questions_20.csv")

In [None]:
len(questions)

In [None]:
#questions

In [None]:
# Pair every prompt from the `questions` list with the id of its verb.
df_questions = pd.DataFrame(
    {
        "Verb_ID": verb_ids,
        "Questions": questions,
    }
)

In [None]:
df_questions

In [None]:
# index=False leaves the DataFrame's row index out of the file, so the CSV
# holds only the Verb_ID and Questions columns.
df_questions.to_csv("isaiah_gpt_questions.csv", index=False, sep=",")

In [None]:
# Trial subset: the first 10 questions only.
df_trial_quest = df_questions.head(10)

In [None]:
df_trial_quest

In [None]:
# Same export settings as the full question file, applied to the trial subset.
df_trial_quest.to_csv("isaiah_gpt_trial_ques.csv", index=False, sep=",")

In [None]:
# Creating a dataframe from the database to compare with GPT's results

verb

In [None]:
# Reference values read from the database, one entry per id in `verb_ids`:
# the clause text plus the text of its complement and object phrases.
clauses, cmpls, objects = [], [], []

for verb_id in verb_ids:
    verb_clause = TFOb(verb_id, BHSA).to_clauses
    clause_phrases = verb_clause.to_phrases

    clauses.append(verb_clause.text)
    cmpls.append(clause_phrases.filter(function="Cmpl").text)
    objects.append(clause_phrases.filter(function="Objc").text)

In [None]:
# Table of database values, keyed by verb id.
df_checking = pd.DataFrame(
    dict(
        Verb_ID=verb_ids,
        Clause=clauses,
        Complement=cmpls,
        Object=objects,
    )
)

In [None]:
# First 10 rows only, the same size as the 10-question trial subset.
df_checking_trial = df_checking.head(10)

In [None]:
df_checking_trial

###
### TESTING SPACE
###


In [None]:
# Hand-written stand-in for a model reply, used to try out the parsing steps.
response = (
    "The motion verb is: toto\n"
    "Complement number 1: tata\n"
    "Complement number 2: tutu\n"
)
lines = response.splitlines()
lines

In [None]:
# The verb is whatever follows the last ":" on the first line of the reply.
# NOTE(review): this rebinds `verb`, which other cells use for a corpus node,
# to a plain string.
verb = lines[0].rpartition(":")[-1].strip()
verb

In [None]:
# Each remaining line carries one complement after its label.
# NOTE(review): this reuses the name `cmpls`, which another cell fills with
# the database complements that feed df_checking.
cmpls = []
for answer_line in lines[1:]:
    # Exactly one ":" is expected, separating the label from the value.
    assert answer_line.count(":") == 1
    cmpls.append(answer_line.rpartition(":")[-1].strip())
cmpls

In [None]:
response_gpt = """In the verse שָׂרַ֣יִךְ סֹורְרִ֗ים וְחַבְרֵי֙ גַּנָּבִ֔ים כֻּלֹּו֙ אֹהֵ֣ב שֹׁ֔חַד וְרֹדֵ֖ף שַׁלְמֹנִ֑ים יָתֹום֙ לֹ֣א יִשְׁפֹּ֔טוּ וְרִ֥יב אַלְמָנָ֖ה לֹֽא־יָבֹ֥וא אֲלֵיהֶֽם׃ פ: 
    1) The clause of the motion verb הָסִ֛ירוּ is: הָסִ֛ירוּ רֹ֥עַ מַעַלְלֵיכֶ֖ם מִנֶּ֣גֶד עֵינָ֑י.
    2) The complement of the motion verb הָסִ֛ירוּ is: מִנֶּ֣גֶד עֵינָ֑י."""

# Split the sample reply into lines: index 0 is the verse header, index 1 the
# clause answer and index 2 the complement answer.
# NOTE(review): the header quotes the verse listed above as Isaiah 1:23, while
# the two answers concern the verb listed for Isaiah 1:16 — fine for testing
# the parsing, but not a coherent example.
lines_gpt = response_gpt.splitlines()

In [None]:
def _answer_after_colon(line):
    """Return the text after the last ":" of ``line``, trimmed, with every "." removed."""
    return line.rpartition(":")[-1].strip().replace(".", "")


clause_gpt = _answer_after_colon(lines_gpt[1])
cmpl_gpt = _answer_after_colon(lines_gpt[2])

In [None]:
clause_gpt

In [None]:
cmpl_gpt