In [1]:
import pandas as pd
from tfob import TFOb, get_bhsa, get_dss

BHSA = get_bhsa()
DSS = get_dss()

**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
book,39,10938.21,100
chapter,929,459.19,100
lex,9230,46.22,100
verse,23213,18.38,100
half_verse,45179,9.44,100
sentence,63717,6.7,100
sentence_atom,64514,6.61,100
clause,88131,4.84,100
clause_atom,90704,4.7,100
phrase,253203,1.68,100


**Locating corpus resources ...**

Name,# of nodes,# slots/node,% coverage
scroll,1001,1428.81,100
lex,10450,129.14,94
fragment,11182,127.91,100
line,52895,27.04,100
clause,125,12.85,0
cluster,101099,6.68,47
phrase,315,5.1,0
word,500995,2.81,99
sign,1430241,1.0,100


In [2]:
from collections import Counter
from itertools import chain

### 1. Create two lists with the verses from MT Isaiah and 1QIsaa

#### 1.1 List of wanted motion verbs

In [4]:
# Load the previously exported dataset; empty cells become "" instead of NaN.
df = pd.read_csv("data/isaiah_dataset.csv", sep=";").fillna("")

# Chapter and verse numbers are converted to numbers before sorting.
for ref_col in ("chapter", "verse_num"):
    df[ref_col] = pd.to_numeric(df[ref_col])

# Directive-he flags: cast to str and rewrite the float spellings "1.0"/"0.0" as "1"/"0".
for flag_col in ("dir_he", "dir_he_dss"):
    df[flag_col] = df[flag_col].astype(str).replace("1.0", "1").replace("0.0", "0")

# Canonical order; `scroll` is sorted descending within each verse.
df.sort_values(
    by=["book", "chapter", "verse_num", "scroll"],
    ascending=[True, True, True, False],
    ignore_index=True,
    inplace=True,
)

df.drop_duplicates(inplace=True)

# Last expression of the cell: displays the frame.
df.fillna("")

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
0,212256,BW>[,MT,Isaiah,1,12,TB>W,KJ TB>W L R>WT PNJ MJ BQC Z>T M JDKM RMS XYRJ,KJ TB>W,,no complement,,,,qal,impf,,,,
1,1895059,BW>[,1Qisaa,Isaiah,1,12,TB>W,KJ> TB>W L R>WT PNJ MJ BQC ZW>T M JDKM L RMWS ...,,,no complement,0,0,0000000000000000000000000000000000000000000,qal,impf,,,,
2,212269,BW>[,MT,Isaiah,1,13,HBJ>,L> TWSJPW HBJ> MNXT CW> QVRT TW<BH HJ> LJ XDC ...,HBJ> MNXT CW>,,no complement,,,,hif,infc,,,,
3,1895075,BW>[,1Qisaa,Isaiah,1,13,HBJ>,LW> TWSJPW L HBJ> MNXT CW> QVRT TW<BH HJ> LJ X...,,,no complement,0,0,0000000000000000000000000000000000000000000000...,hifil,infc,,,,
4,212315,SWR[,MT,Isaiah,1,16,HSJRW,RXYW HZKW HSJRW R< M<LLJKM M NGD <JNJ XDLW HR<,HSJRW R< M<LLJKM M NGD <JNJ,,M NGD <JNJ,0,,,hif,impv,MN,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1309,1918901,BW>[,1Qisaa,Isaiah,66,23,JBW>,W HJH M DJ XWDC B XWDCW W M DJ CBT B CBTH JBW>...,,,no complement,,0,0000000000000000000000000000000000000000000000...,qal,impf,,,,
1310,1927678,BW>[,1Q8,Isaiah,66,23,JBW>,W HJH M DJ XDC B XDCW W M DJ CBT B CBTW JBW> K...,,,no complement,,0,0000000000000000000000000000000000000000000000...,qal,impf,,,,
1311,234980,JY>[,MT,Isaiah,66,24,JY>W,W JY>W W R>W B PGRJ H >NCJM H PC<JM BJ KJ TWL<...,W JY>W,,no complement,,,,qal,perf,,,,
1312,1918913,JY>[,1Qisaa,Isaiah,66,24,JY>W,W JY>W W R>W B PGRJ H >NCJM H PWC<JM BJ> KJ> T...,,,no complement,,0,0000000000000000000000000000000000000000000000...,qal,perf,,,,


In [49]:
#df.to_csv("data/df_isaiah.csv", index=False)

In [40]:
len(df[(df.scroll == "MT") & (df.complement == "no complement")])

345

In [41]:
len(df2[(df2.scroll == "MT") & (df2.complement == "no complement")])

343

In [50]:
df[df.verb_id == 1915493]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,dir_he,dir_he_dss,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
1126,1915493,CWR=[,1Qisaa,Isaiah,57,9,TCRJ,W TCRJ L MLK B CMN W TRBJ RWQXJK W TCLXJ YJRJK...,W TCRJ L MLK B CMN,,L MLK,0,0,0000000000000000000000000000000000000000000000...,qal,wayy,L,,,
1127,1915493,CWR=[,1Qisaa,Isaiah,57,9,TCRJ,W TCRJ L MLK B CMN W TRBJ RWQXJK W TCLXJ YJRJK...,W TCRJ L MLK B CMN,,L MLK,0,0,0000000000000000000000000000000000000000000000...,qal,wayy,L,,,


In [114]:
df2[df2.verb_id == 1895645]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
36,1895645,XLP[,1Qisaa,Isaiah,2,18,JXLWPW,W H >LJLJM KLJL JXLWPW,W H >LJLJM KLJL JXLWPW,H >LJLJM,no complement,no complement,,,,,,,0 0 000000 0000 000000 00,qal,impf,,,,


In [17]:
len(df[df.scroll == "1Qisaa"])

521

In [16]:
len(df2[df2.scroll == "1Qisaa"])

595

In [44]:
# Identifying key (verb, lexeme, scroll, reference) of every row of the CSV-based frame.
set_df = {
    (row.verb_id, row.lex, row.scroll, row.book, row.chapter, row.verse_num)
    for _, row in df.iterrows()
}
len(set_df)

1302

In [45]:
# Identifying key (verb, lexeme, scroll, reference) of every row of the regenerated frame.
set_df2 = {
    (row.verb_id, row.lex, row.scroll, row.book, row.chapter, row.verse_num)
    for _, row in df2.iterrows()
}
len(set_df2)

1395

In [46]:
import pprint 
# Keys present in the regenerated frame (set_df2) but absent from the CSV-based frame (set_df).
set_missing = set_df2 - set_df
print(len(set_missing))
pprint.pprint(set_missing)

93
{(1895296, 'CWB[', '1Qisaa', 'Isaiah', 1, 27),
 (1895645, 'XLP[', '1Qisaa', 'Isaiah', 2, 18),
 (1895927, 'T<H[', '1Qisaa', 'Isaiah', 3, 12),
 (1896578, 'MHR[', '1Qisaa', 'Isaiah', 5, 19),
 (1896579, 'XWC[', '1Qisaa', 'Isaiah', 5, 19),
 (1896584, 'QRB[', '1Qisaa', 'Isaiah', 5, 19),
 (1896586, 'BW>[', '1Qisaa', 'Isaiah', 5, 19),
 (1896648, 'SWR[', '1Qisaa', 'Isaiah', 5, 23),
 (1896954, 'SWR[', '1Qisaa', 'Isaiah', 6, 7),
 (1896971, 'HLK[', '1Qisaa', 'Isaiah', 6, 8),
 (1897444, 'BW>[', '1Qisaa', 'Isaiah', 7, 19),
 (1897751, 'CVP[', '1Qisaa', 'Isaiah', 8, 8),
 (1897753, '<BR[', '1Qisaa', 'Isaiah', 8, 8),
 (1897877, 'NPL[', '1Qisaa', 'Isaiah', 8, 15),
 (1898116, 'GLL[', '1Qisaa', 'Isaiah', 9, 4),
 (1898218, 'NPL[', '1Qisaa', 'Isaiah', 9, 9),
 (1898226, 'XLP[', '1Qisaa', 'Isaiah', 9, 9),
 (1898318, 'T<H[', '1Qisaa', 'Isaiah', 9, 15),
 (1898848, 'CVP[', '1Qisaa', 'Isaiah', 10, 22),
 (1898986, '<WZ[', '1Qisaa', 'Isaiah', 10, 31),
 (1899280, 'SWR[', '1Qisaa', 'Isaiah', 11, 13),
 (1899541, 'BW

In [27]:
complement = TFOb(214702, BHSA).to_clauses.to_phrases.filter(function="Cmpl")
complement.to_words.uvf

['H']

In [35]:
# Inspect the `uvf_etcbc` values of the DSS section selected by the reference of verb 1897562.
# NOTE(review): `find_verb_ref` is defined in a later cell (section 2); on a fresh kernel
# this cell fails unless that cell has been run first.
verb_dss = TFOb(1897562, DSS)
ref = find_verb_ref(TFOb(1897562, DSS))
scroll = verb_dss.to_scrolls.scroll[0]
TFOb.section(ref, DSS, scroll).uvf_etcbc

['', '', '', '', '', '', 'H', '', '', '', '', '', '', '', '', '']

#### 1.2 List of verb_ids (only for BHSA)

In [25]:
# Lexemes of the verbs selected for this study; used below as the `lex` filter.
motion_verbs = ['BW>[','HLK[','JY>[','JRD[','<BR[',
                '<LH[','CWB[','>TH[','BRX[','GJX[',
                'GLH[','GLL[','DXP[','DLG[','HWH=[',
                'XWC[','XLP[','XSH[','VB<[','VWF[',
                'MHR[','MWC[','NGC[','NHR[','NWX[',
                'NWS[','NXT[','NVP[','NS<[','NPL[',
                'NTK[','SBB[','SWR[','SLQ[','<WZ[',
                '<WP[','PNH[','PF<[','YWP[','Y<D[',
                'QHL[','QPY[','QRB[','RWY[','FVH[',
                'CWX[','CWR=[','CVP[','CQQ[','T>R[',
                'T<H[']

# BHSA words of the book of Isaiah whose lexeme is in `motion_verbs`.
verbs_bhsa = TFOb.all("word", BHSA).filter(book="Isaiah").filter_in(lex=motion_verbs)
# Words of the DSS scroll "1Qisaa" whose lexeme is in `motion_verbs`.
verbs_1qisaa = TFOb.all("scroll", DSS).filter(scroll="1Qisaa").to_words.filter_in(lex=motion_verbs)
# Words of the DSS scroll "1Q8" whose lexeme is in `motion_verbs`.
verbs_1q8 = TFOb.all("scroll", DSS).filter(scroll="1Q8").to_words.filter_in(lex=motion_verbs)

### 2. Important functions

In [30]:
def clean(g_cons):
    """Normalise a DSS string so it can be compared with BHSA content.

    Underscores become spaces; the Hebrew geresh (׳) and the ASCII apostrophe
    are removed.
    """
    cleaned = g_cons
    for old, new in (("_", " "), ("׳", ""), ("'", "")):
        cleaned = cleaned.replace(old, new)
    return cleaned


def find_verb_ref(verb):
    """Return ``[book, chapter, verse]`` for a given verb (DSS or BHSA).

    Each value is the first element of the verb's ``book``, ``chapter`` and
    ``verse`` attributes. The lookup is the same for both corpora, so no
    distinction on the verb's source is needed.
    """
    return [verb.book[0], verb.chapter[0], verb.verse[0]]


def is_lex_identical(verb_dss):
    """
    Check whether the DSS verse of a verb and the BHSA verse with the same
    reference are identical on the lexeme level.

    A single trailing empty lexeme in the DSS verse is ignored. Both lexeme
    lists are normalised with `clean` before comparison.

    TODO: handle empty strings inside the verse as well, not only at the end
    (this requires changes in several other functions).

    Returns True when the cleaned lexeme lists are equal, False otherwise.
    """
    ref_verb = find_verb_ref(verb_dss)

    scroll = verb_dss.to_scrolls.scroll[0]

    dss_lex = TFOb.section(ref_verb, DSS, scroll=scroll).to_words.lex
    bhsa_lex = TFOb.section(ref_verb, BHSA).to_words.lex

    # Guard against an empty DSS verse (indexing [-1] would raise IndexError) and
    # slice instead of pop() so the list returned by `.lex` is not mutated.
    if dss_lex and dss_lex[-1] == "":
        dss_lex = dss_lex[:-1]

    return [clean(lex) for lex in bhsa_lex] == [clean(lex) for lex in dss_lex]


def find_bhsa_verb(verb_dss):
    """
    Look up the BHSA counterpart of a DSS verb (same book, chapter, verse, lexeme).

    Returns the BHSA word(s) found when the lexeme occurs exactly once in the
    BHSA verse. Returns None when:
    - the DSS chapter is not a plain number (e.g. "f14"),
    - the lexeme occurs more than once in the BHSA verse (TODO: disambiguate),
    - the lexeme does not occur in the BHSA verse.
    """
    # Get book, chapter, verse info from the DSS verb
    ref_dss = find_verb_ref(verb_dss)

    # str() so that an integer chapter is accepted as well as a string one.
    if not str(ref_dss[1]).isnumeric():
        return None

    # Get the corresponding BHSA verse and the words sharing the verb's lexeme
    verse_bhsa = TFOb.section(ref_dss, BHSA).to_words
    verb_bhsa = verse_bhsa.filter(lex=verb_dss.lex[0])

    # TODO: repetition of the verb in the same verse is not handled yet
    if len(verb_bhsa) > 1:
        return None

    if verb_bhsa:
        return verb_bhsa
    return None
    

def find_clause(verb):
    """Find the clause containing a verb.

    For a BHSA verb the clause is taken directly from the corpus.
    For a DSS verb, the clause boundaries of the matching BHSA verb are
    projected onto the DSS verse by word position; this is only attempted when
    a BHSA counterpart exists and both verses are lexically identical.

    Returns:
        - the BHSA clause(s) for a BHSA verb,
        - the slice of DSS verse words at the positions of the BHSA clause,
        - "" when the BHSA clause words cannot be located in their verse,
        - None when there is no usable BHSA match.
    """
    if verb.source.name == "BHSA":
        return verb.to_clauses.to_clauses

    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)
    scroll = verb.to_scrolls.scroll[0]

    # Positions can only be projected when the verses are identical
    if not (verb_bhsa and is_lex_identical(verb)):
        return None

    verse_dss = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll).to_words
    clause_bhsa = find_clause(verb_bhsa)

    clause_word_ids = clause_bhsa.to_words.ids
    verse_ids = clause_bhsa.to_verses.to_words.ids

    try:  # TODO: investigate why to_verses sometimes does not contain the clause words
        first_word_index = verse_ids.index(clause_word_ids[0])
        last_word_index = verse_ids.index(clause_word_ids[-1])
    except ValueError:
        # .index() raises ValueError when the id is missing from the verse
        print("Case when clause has no verse (to_verses bugs)", verb_bhsa.ids[0])
        return ""

    return verse_dss[first_word_index:last_word_index + 1]

    
def find_complements(verb, return_bhsa=False):
    """Find the complement phrases (function "Cmpl") of a verb.

    For a BHSA verb the complements are the "Cmpl" phrases of its clause.
    For a DSS verb, the complements of the matching BHSA verb are projected
    onto the DSS verse by word position; this is only attempted when a BHSA
    counterpart exists and both verses are lexically identical.

    Args:
        verb: the verb (BHSA or DSS).
        return_bhsa: for a DSS verb, return the BHSA complements themselves
            instead of their DSS projection. Has no effect for a BHSA verb.

    Returns:
        - the BHSA complement phrases (BHSA verb, or DSS verb with return_bhsa),
        - a list with one slice of DSS verse words per BHSA complement,
        - "" when a BHSA complement cannot be located in its verse,
        - None when there is no usable BHSA match.
    """
    if verb.source.name == "BHSA":
        return verb.to_clauses.to_phrases.filter(function="Cmpl")

    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)
    scroll = verb.to_scrolls.scroll[0]

    # Positions can only be projected when the verses are identical (TODO)
    if not (verb_bhsa and is_lex_identical(verb)):
        return None

    verse_dss = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll).to_words
    complements_bhsa = find_complements(verb_bhsa)
    if return_bhsa:
        return complements_bhsa

    complements_dss = []

    for complement_bhsa in complements_bhsa:
        phrase_word_ids = complement_bhsa.to_words.ids
        verse_ids = complement_bhsa.to_verses.to_words.ids

        try:  # TODO: investigate why to_verses sometimes does not contain the phrase words
            first_word_index = verse_ids.index(phrase_word_ids[0])
            last_word_index = verse_ids.index(phrase_word_ids[-1])
        except ValueError:
            # .index() raises ValueError when the id is missing from the verse
            print("Case when phrase has no verse (to_verses bugs)", verb_bhsa.ids[0])
            return ""

        complements_dss.append(verse_dss[first_word_index:last_word_index + 1])

    return complements_dss

    
def find_subject(verb):
    """Return the subject (phrase with function "Subj") of a verb.

    BHSA verbs: the "Subj" phrases of the verb's clause (at most one is expected).
    DSS verbs: the DSS verse words at the positions of the BHSA subject, provided
    a BHSA counterpart exists and the verses are lexically identical.
    Returns "" when the BHSA verb has no subject and None when there is no match.
    """
    if verb.source.name == "BHSA":
        subj_phrases = verb.to_clauses.to_phrases.filter(function="Subj")
        assert len(subj_phrases) <= 1
        return subj_phrases

    # Anything that is not BHSA is treated as DSS.
    bhsa_counterpart = find_bhsa_verb(verb)
    scroll_name = verb.to_scrolls.scroll[0]

    if not (bhsa_counterpart and is_lex_identical(verb)):  # TODO
        return None

    dss_words = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll_name).to_words
    bhsa_subject = find_subject(bhsa_counterpart)
    if not bhsa_subject:
        return ""

    start_id = bhsa_subject.to_words.ids[0]
    end_id = bhsa_subject.to_words.ids[-1]

    bhsa_verse_ids = bhsa_subject.to_verses.to_words.ids

    start = bhsa_verse_ids.index(start_id)
    stop = bhsa_verse_ids.index(end_id) + 1

    return dss_words[start:stop]
    

def find_prepositions(verb):
    """Find the prepositions inside the complement phrases of a verb.

    For a BHSA verb these are the words with sp="prep" inside the "Cmpl"
    phrases of its clause. For a DSS verb, the prepositions of the matching
    BHSA verb are projected onto the DSS verse by word position; this is only
    attempted when a BHSA counterpart exists and both verses are lexically
    identical.

    Returns:
        - the BHSA preposition words for a BHSA verb,
        - a list with one slice of DSS verse words per BHSA preposition,
        - "" when a BHSA preposition cannot be located in its verse,
        - None when there is no usable BHSA match.
    """
    if verb.source.name == "BHSA":
        complements = verb.to_clauses.to_phrases.filter(function="Cmpl")
        return complements.to_words.filter(sp="prep")

    # if the verb is not BHSA, it's DSS
    verb_bhsa = find_bhsa_verb(verb)
    scroll = verb.to_scrolls.scroll[0]

    # Positions can only be projected when the verses are identical
    if not (verb_bhsa and is_lex_identical(verb)):
        return None

    verse_dss = TFOb.section(find_verb_ref(verb), DSS, scroll=scroll).to_words
    prepositions_bhsa = find_prepositions(verb_bhsa)

    prepositions_dss = []

    for preposition_bhsa in prepositions_bhsa:
        prep_word_ids = preposition_bhsa.to_words.ids
        verse_ids = preposition_bhsa.to_verses.to_words.ids

        try:  # TODO: investigate why to_verses sometimes does not contain the word
            first_word_index = verse_ids.index(prep_word_ids[0])
            last_word_index = verse_ids.index(prep_word_ids[-1])
        except ValueError:
            # .index() raises ValueError when the id is missing from the verse
            print("Case when phrase has no verse (to_verses bugs)", verb_bhsa.ids[0])
            return ""

        prepositions_dss.append(verse_dss[first_word_index:last_word_index + 1])

    return prepositions_dss
    
def is_sign_unc(verse):
    """Build a per-sign uncertainty mask for a verse.

    Every sign of every word is encoded as "1" when its type is 'missing' or
    'unc', and as "0" otherwise. The digits of one word are concatenated and
    the words are separated by a single space, e.g. "000 0100 0".

    Returns the mask as a string ("" for a verse without words); it does not
    return a boolean.
    """
    unc_types = ("missing", "unc")
    return " ".join(
        "".join("1" if sign.type[0] in unc_types else "0" for sign in word.to_signs)
        for word in verse.to_words
    )

    
def find_cmpl_anim(cmpl):
    """Classify the nouns of one complement as animate or inanimate.

    Only words with sp "subs" or "nmpr" are considered. Based on the nametype
    value of each noun:
    - "pers", "gens", "god" -> "anim"
    - "mens", "topo"        -> "inanim"
    Nouns with any other (or no) nametype are skipped.

    Returns the labels joined by a space, in word order; "" when no noun could
    be classified.
    """
    animacy_by_nametype = {
        "pers": "anim",
        "gens": "anim",
        "god": "anim",
        "mens": "inanim",
        "topo": "inanim",
    }

    nouns = cmpl.to_words.filter_in(sp=["subs", "nmpr"])
    nouns_animacy = []
    for noun in nouns:
        label = animacy_by_nametype.get(noun.nametype[0])
        if label is not None:
            nouns_animacy.append(label)
    return " ".join(nouns_animacy)


def find_cmpl_nametype(cmpl):
    """Return the nametype values of the words of a complement, joined by spaces.

    Words whose first nametype value is empty or None are left out.
    """
    return " ".join(
        token.nametype[0] for token in cmpl.to_words if token.nametype[0]
    )


def find_cmpl_individuation(cmpl):
    """Return the part of speech ("subs" or "nmpr") of each noun of a complement,
    joined by spaces, i.e. whether its nouns are substantives or proper nouns."""
    nouns = cmpl.to_words.filter_in(sp=["subs","nmpr"])
    return " ".join(noun.sp[0] for noun in nouns)


def find_cmpl_construction(cmpl):
    """Describe how a complement is constructed.

    Each word contributes "prep" when it is a preposition, otherwise "dir_he"
    when it carries a directive-he ("H" among its uvf values). The markers are
    joined with " + "; a complement without any marker is a verbal complement
    and yields "vc".
    """
    markers = []

    for token in cmpl.to_words:
        if token.filter(sp="prep"):
            marker = "prep"
        elif "H" in token.uvf:
            marker = "dir_he"
        else:
            continue
        markers.append(marker)

    return " + ".join(markers) if markers else "vc"

In [28]:
pd.set_option('display.max_columns', None)
#df[(df["complement"] != "no complement") & (df["scroll"] == "MT")]

In [29]:
# NOTE(review): `find_complements_lexemes` is not defined in any visible cell of this
# notebook; this cell presumably relies on a definition from a deleted cell and will
# fail on a fresh kernel — confirm and either restore the function or remove the cell.
verb = TFOb(1895945, DSS)
find_complements_lexemes(verb)

[]

### 3. Generate dataset

In [36]:
# Create a dataset with the occurrences:
# one row (dict) per (verb, complement) pair, or a single "no complement" row per verb.


items = [] # create an empty list to store all the information for each occ.

       
for verb in chain(verbs_bhsa, verbs_1qisaa, verbs_1q8):
    
    
    # Add MT as "scroll" for the BHSA
    if verb.source.name == "BHSA":
        scroll = "MT"
        verse = verb.to_verses
        sign_info = ""  # the sign mask is only computed for DSS verses

    else:
        scroll = verb.to_scrolls.scroll[0]
        verse = TFOb.section([verb.book[0], verb.chapter[0], verb.verse[0]], DSS, scroll)
        sign_info = is_sign_unc(verse)  # per-sign 0/1 mask string, not a boolean
        
        
    subject = find_subject(verb)
    complements = find_complements(verb)
    # For a BHSA verb this is the same query as `complements`; for a DSS verb it is
    # the BHSA complements that the DSS ones were projected from.
    complements_bhsa = find_complements(verb, return_bhsa=True)
    
    
    # `complements` is falsy when the verb has no complement, when the DSS verse could
    # not be matched with the BHSA verse (None), or when the projection failed ("").
    # All these cases are recorded as "no complement" ==> find complement manually.
    if not complements:
        complements = ["no complement"]
        # NOTE(review): this assignment looks redundant — `dir_he` is reassigned
        # in both branches of the loop body below.
        dir_he = ""
        complements_lexemes = ["no complement"]
    else:
        complements_lexemes = [" ".join(cmpl.to_words.lex) for cmpl in complements]

    if not complements_bhsa:
        complements_bhsa = ["no complement"]
        
    # zip stops at the shortest of the three lists, so a row is only produced for
    # positions present in all of them.
    for complement, complement_bhsa, complement_lexeme in zip(complements, complements_bhsa, complements_lexemes):        
        if complement == "no complement":
            dir_he = cmpl_anim = cmpl_nt = cmpl_det = cmpl_indiv = cmpl_constr = ""
            cmpl_lex = "no complement"
        else: 
            cmpl_lex = complement_lexeme
            
            # Directive-he is read from the complement of the verb's own corpus;
            # the feature is called `uvf` in BHSA and `uvf_etcbc` in DSS.
            if verb.source.name == "BHSA":
                dir_he = int("H" in complement.to_words.uvf)
            else:
                dir_he = int("H" in complement.to_words.uvf_etcbc)
                
            # The remaining complement features are always taken from the BHSA complement.
            cmpl_anim = find_cmpl_anim(complement_bhsa)
            cmpl_nt = find_cmpl_nametype(complement_bhsa)
            cmpl_det = complement_bhsa.det[0]
            cmpl_indiv = find_cmpl_individuation(complement_bhsa)
            cmpl_constr = find_cmpl_construction(complement_bhsa)
               
        if verb.g_cons[0] is None:
            g_cons = "no_g_cons"
            #print("Absent G_CONS", verb.ids[0])
        else:
            g_cons = clean(verb.g_cons[0])    
            
        # NOTE(review): this tests the verse-level `g_cons`, while the else-branch joins
        # the word-level values — confirm that `verse.g_cons` can actually be None.
        if verse.g_cons is None:
            g_cons_verse = f"LEX: {' '.join(verse.to_words.lex)}"
            #print("Absent G_CONS", verb.ids[0])
        else:
            # words without a g_cons value are skipped
            g_cons_verse = clean(" ".join([g_cons for g_cons in verse.to_words.g_cons if g_cons]))
            #print(g_cons_verse)
            
            
        
        # Collect information about the following variables:    
        item = {
            "verb_id": verb.ids[0], 
            "lex": verb.lex[0], 
            "scroll": scroll,
            "book": verb.book[0], 
            "chapter": int(verb.chapter[0]), 
            "verse_num": int(verb.verse[0]),
            "gcons_verb": g_cons,
            "gcons_verse": g_cons_verse,
            "gcons_clause": clean(str(find_clause(verb))),
            "subject": clean(str(subject)),
            "complement": clean(str(complement)),
            "cmpl_lex": clean(str(cmpl_lex)),
            "dir_he": dir_he,
            "cmpl_constr": cmpl_constr,
            "cmpl_nt": cmpl_nt,
            "cmpl_anim": cmpl_anim,
            "cmpl_det": cmpl_det,
            "cmpl_indiv": cmpl_indiv,
            #"cmpl_complex": cmpl_complex,
            "sign_info": sign_info,
            "stem": verb.vs[0],
            "tense": verb.vt[0],
        }
        
        # Prepositions are looked up per verb, not per complement: every row of the
        # same verb receives the same preposition_1..n values.
        if complement != "no complement": 
            prepositions = find_prepositions(verb)
            n = 0
            for preposition in prepositions:
                n += 1
                item[f"preposition_{n}"] = str(preposition)

        items.append(item)

In [37]:
#items

In [38]:
# Regenerated dataset, in the same canonical order as the CSV-based frame
# (`scroll` sorted descending within each verse).
df2 = (
    pd.DataFrame(items)
    .fillna("")
    .sort_values(
        ["book", "chapter", "verse_num", "scroll"],
        ascending=[True, True, True, False],
        ignore_index=True,
    )
)

In [39]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

df2
#df2[(df2["scroll"] == "MT") & (df2["complement"] == "no complement")]
#df2[(df2["complement"] != "no complement")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,gcons_clause,subject,complement,cmpl_lex,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,sign_info,stem,tense,preposition_1,preposition_2,preposition_3,preposition_4
0,212256,BW>[,MT,Isaiah,1,12,TB>W,KJ TB>W L R>WT PNJ MJ BQC Z>T M JDKM RMS XYRJ,KJ TB>W,,no complement,no complement,,,,,,,,qal,impf,,,,
1,1895059,BW>[,1Qisaa,Isaiah,1,12,TB>W,KJ> TB>W L R>WT PNJ MJ BQC ZW>T M JDKM L RMWS ...,,,no complement,no complement,,,,,,,000 0000 0 0000 0000 00 000 0000 0 00000 0 000...,qal,impf,,,,
2,212269,BW>[,MT,Isaiah,1,13,HBJ>,L> TWSJPW HBJ> MNXT CW> QVRT TW<BH HJ> LJ XDC ...,HBJ> MNXT CW>,,no complement,no complement,,,,,,,,hif,infc,,,,
3,1895075,BW>[,1Qisaa,Isaiah,1,13,HBJ>,LW> TWSJPW L HBJ> MNXT CW> QVRT TW<BH HJ> LJ X...,,,no complement,no complement,,,,,,,000 000000 0 0000 0000 000 0000 00000 000 000 ...,hifil,infc,,,,
4,212315,SWR[,MT,Isaiah,1,16,HSJRW,RXYW HZKW HSJRW R< M<LLJKM M NGD <JNJ XDLW HR<,HSJRW R< M<LLJKM M NGD <JNJ,,M NGD <JNJ,MN NGD/ <JN/,0.0,prep,,,det,subs subs,,hif,impv,M,,,
5,1895129,SWR[,1Qisaa,Isaiah,1,16,HSJRW,RXYW W HZKW W HSJRW RW< M<LLJKM M NGD <JNJ XDL...,,,no complement,no complement,,,,,,,0000 0 0000 0 00000 000 00000000 0 000 00000 0...,hifil,impv,,,,
6,212333,HLK[,MT,Isaiah,1,18,LKW,LKW N> W NWKXH J>MR JHWH >M JHJW XV>JKM K CNJM...,LKW N>,,no complement,no complement,,,,,,,,qal,impv,,,,
7,1895149,HLK[,1Qisaa,Isaiah,1,18,LKW,LKW N> W NWKXH JW>MR JHWH >M JHJW XV>JKM K CNJ...,,,no complement,no complement,,,,,,,000 00 0 00000 00000 0000 00 0000 0000000 0 00...,qal,impv,,,,
8,212418,BW>[,MT,Isaiah,1,23,JBW>,FRJK SWRRJM W XBRJ GNBJM KLW >HB CXD W RDP CLM...,W RJB >LMNH L> JBW> >LJHM,RJB >LMNH,>LJHM,>L,0.0,prep,,,det,,,qal,impf,>LJHM,,,
9,1895234,BW>[,1Qisaa,Isaiah,1,23,JBW>,FRJKJ SWRRJM W XBRJ GNBJM KWLM >WHBJ CWXD RWDP...,,,no complement,no complement,,,,,,,000000 000000 0 0000 00000 00000 00000 0000 00...,qal,impf,,,,


In [48]:
#df2.to_csv("data/df2_isaiah.csv", index=False)

### 4. Create the dataframe with pandas

In [None]:
df2 = pd.DataFrame(items).fillna("")

In [None]:
df2[df2.verb_id == 213114]

In [None]:
df[df.verb_id == 213114]

In [None]:
len(df)

In [None]:
len(df2)

In [None]:
# Boolean mask of rows that duplicate an earlier row on all columns.
# NOTE(review): `duplicate_rows` is not used by the rest of this cell.
duplicate_rows = df2.duplicated()

# print duplicate rows
#print(duplicate_rows)

# Show the rows that repeat an earlier row's verb_id together with the same
# complement features and prepositions.
df2[df2[["verb_id", "cmpl_constr", "cmpl_nt", "cmpl_anim", "cmpl_det", "cmpl_indiv", "preposition_1", "preposition_2", "preposition_3", "preposition_4"]].duplicated()]

In [47]:
# Identifying key (verb, lexeme, scroll, reference) of every row of the CSV-based frame.
set_df = {
    (row.verb_id, row.lex, row.scroll, row.book, row.chapter, row.verse_num)
    for _, row in df.iterrows()
}
len(set_df)

1302