In [4]:
from pathlib import Path

import pandas as pd

from tfob import TFOb, BHSA, DSS

### Introduction

The goal of the notebook is to create an input and an output dataset to train a transformer model for parsing clause atoms in Biblical Hebrew based on POS-only input.
Each dataset contains four tab-separated columns, written without a header row: book, chapter, verse, text

#### What is needed in the text column:
- input file: the POS of the verse (POS represented by 1 letter)
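- output file: the same POS letters, with the end of each clause atom marked by a `#` appended to its last POS letter (not a capital `A`, which already stands for adverbs in the POS dictionary)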

In [4]:
# Create a DataFrame by columns
#pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# Create a DataFrame line by line
#pd.DataFrame([{"A":1, "B":4}, {"A":2, "B":5}, {"A":3, "B":6}, ])

In [100]:
# Gather every verse object of the BHSA corpus; this collection drives both the
# input dataset (Part 1) and the output dataset (Part 2).
verses_bhsa = TFOb.all("verse", BHSA)

### PART 1. Creating the INPUT DATASET

In [101]:
# One-letter codes for the part-of-speech values read from a word's `sp` feature.
# The codes are case-sensitive: upper- and lower-case variants of a letter denote
# different categories (e.g. "A" = advb, "a" = adjv, "S" = subs, "D" = art, "d" = prde).
# "#" must never be used as a code: parse_cl_atom appends it as the clause-atom end marker.
pos_dict = dict(
    art="D",
    verb="V",
    subs="S",
    noun="N",
    nmpr="n",
    advb="A",
    prep="P",
    conj="c",
    prps="p",
    prde="d",
    prin="i",
    intj="I",
    nega="g",
    inrg="?",
    adjv="a",
)

In [102]:
# Define a function for getting POS

def get_pos(verse):
    """Return the one-letter POS codes of an object's words as one space-separated string.

    Despite the parameter name, any TFOb object exposing ``to_words`` is accepted;
    ``parse_cl_atom`` calls this function with clause atoms.

    Args:
        verse: A TFOb object whose ``to_words`` yields word objects with an ``sp`` feature.

    Returns:
        str: The codes in word order, e.g. "P S V S P D S c P D S".

    Raises:
        KeyError: If a word's part of speech has no entry in ``pos_dict``.
    """
    letters = []
    for word in verse.to_words:
        # `sp` is indexed with [0] to obtain the single value of this word
        part_of_speech = word.sp[0]
        letters.append(pos_dict[part_of_speech])
    return " ".join(letters)

In [103]:
### TEST AREA ###

In [81]:
# Take the first verse of the collection as a small sample for the checks below
verse = verses_bhsa[0]

In [82]:
# The printed letters should line up one-to-one with the words of the sample verse
print(get_pos(verse))

P S V S P D S c P D S


In [83]:
# Same lookup as inside get_pos, but kept as a list (not joined) for inspection
poss = [pos_dict[word.sp[0]] for word in verse.to_words]

In [85]:
# Show the verse object itself to compare its words with the POS letters
verse

<verse_1 "B R>CJT BR> >LHJM >T H CMJM W >T H >RY">

In [84]:
# Show the unjoined POS letters of the sample verse
poss

['P', 'S', 'V', 'S', 'P', 'D', 'S', 'c', 'P', 'D', 'S']

In [None]:
### END OF TEST AREA ###

In [112]:
# Collect the columns of the input dataset (nothing is written to disk in this cell)

# Reference columns, read from the verse collection
books = verses_bhsa.book
chapters = verses_bhsa.chapter
verses_num = verses_bhsa.verse

# Text column: the space-separated POS letters of every verse
verse_pos = list(map(get_pos, verses_bhsa))

In [113]:
# Assemble the input dataset: three reference columns plus the POS text
input_df = pd.DataFrame(
    {
        "book": books,
        "chapter": chapters,
        "verse": verses_num,
        "text": verse_pos,
    }
)

In [114]:
# Preview the input dataset
input_df

Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,P S V S P D S c P D S
1,Genesis,1,2,c D S V S c S c S P S S c S S V P S D S
2,Genesis,1,3,c V S V S c V S
3,Genesis,1,4,c V S P D S c V c V S S D S c S D S
4,Genesis,1,5,c V S P D S S c P D S V S c V S c V S S S
...,...,...,...,...
23208,2_Chronicles,36,19,c V P S D S c V P S n c S S V P D S c S S S P V
23209,2_Chronicles,36,20,c V D S P D S P n c V P c P S P S P V S n
23210,2_Chronicles,36,21,P V S n P S n P V D S P S S S D S V P V S S
23211,2_Chronicles,36,22,c P S S P n S n P V S n P S n V n P S n S n c ...


In [115]:
# Save the input dataset (POS only) as tab-separated text, without header row or index.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here with OSError.
input_path = Path("data/cl_pos_in.csv")
input_path.parent.mkdir(parents=True, exist_ok=True)
input_df.to_csv(input_path, sep="\t", header=False, index=False)

### PART 2: Creating the OUTPUT DATASET

In [106]:
# Define a function to parse the clause atom

def parse_cl_atom(verse):
    """Return the POS string of a verse with the end of every clause atom marked by "#".

    Each clause atom of the verse is rendered with ``get_pos`` and a "#" is
    appended directly after its last POS letter; the marked clause atoms are
    then joined with single spaces.

    Args:
        verse: A TFOb object exposing ``to_clause_atoms`` (a verse in this notebook).

    Returns:
        str: The marked POS string, e.g. "c V S# V S# c V S#" for a verse
        made of three clause atoms.
    """
    # get_pos already returns a string, so the marker is appended without a cast,
    # and the space-joined result is returned as is (no second join needed).
    marked_atoms = [get_pos(clause_atom) + "#" for clause_atom in verse.to_clause_atoms]
    return " ".join(marked_atoms)

In [107]:
# Testing the function on the fifth verse of the collection: the parsed line
# should equal the plain POS line except for the "#" clause-atom end markers
verse = verses_bhsa[4]

print(f"The verse is: \n\n{get_pos(verse)}\n\n")
print(f"The parsed version of the verse is: \n\n{parse_cl_atom(verse)}.")

The verse is: 

c V S P D S S c P D S V S c V S c V S S S


The parsed version of the verse is: 

c V S P D S S# c P D S V S# c V S# c V S# S S#.


In [108]:
# Collect the columns of the output dataset

# Reference columns (same expressions as in Part 1)
books = verses_bhsa.book
chapters = verses_bhsa.chapter
verses_num = verses_bhsa.verse

# Text column: POS letters of every verse with "#" closing each clause atom
parsed_verses = list(map(parse_cl_atom, verses_bhsa))

In [109]:
# Assemble the output dataset: three reference columns plus the marked POS text
output_df = pd.DataFrame(
    {
        "book": books,
        "chapter": chapters,
        "verse": verses_num,
        "text": parsed_verses,
    }
)

In [110]:
# Preview the output dataset
output_df

Unnamed: 0,book,chapter,verse,text
0,Genesis,1,1,P S V S P D S c P D S#
1,Genesis,1,2,c D S V S c S# c S P S S# c S S V P S D S#
2,Genesis,1,3,c V S# V S# c V S#
3,Genesis,1,4,c V S P D S# c V# c V S S D S c S D S#
4,Genesis,1,5,c V S P D S S# c P D S V S# c V S# c V S# S S#
...,...,...,...,...
23208,2_Chronicles,36,19,c V P S D S# c V P S n# c S S V P D S# c S S S...
23209,2_Chronicles,36,20,c V D S P D S P n# c V P c P S P S# P V S n#
23210,2_Chronicles,36,21,P V S n P S n# P V D S P S# S S D S V# P V S S#
23211,2_Chronicles,36,22,c P S S P n S n# P V S n P S n# V n P S n S n#...


In [111]:
# Save the output dataset (POS letters with "#" markers) as tab-separated text,
# without header row or index.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here with OSError.
output_path = Path("data/cl_pos_out.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
output_df.to_csv(output_path, sep="\t", header=False, index=False)