In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency

## 1. Data

In [2]:
# Load the annotated complements table, then replace every missing value
# with an empty string.
raw_annotations = pd.read_csv("data/sbl_grouped_cmpls_for_annot.csv")
df_complete = raw_annotations.fillna("")

In [3]:
# Number of rows in the full dataset
df_complete.shape[0]

660

In [4]:
# Never truncate columns when a wide frame is displayed
pd.options.display.max_columns = None

In [5]:
# Preview the first four rows
df_complete.iloc[:4]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,comments,reconstructed_verse
0,212418,BW>[,MT,Isaiah,1,23,JBW>,FRJK SWRRJM W XBRJ GNBJM KLW >HB CXD W RDP CLM...,,qal,impf,W RJB >LMNH L> JBW> >LJHM,RJB >LMNH,>LJHM,>L,to them,0.0,prep,prps,anim,det,prsf,simple,fictive,goal,>L,,,,"the cause of the widow does not come to them, ...",
1,212437,CWB[,MT,Isaiah,1,25,>CJBH,W >CJBH JDJ <LJK W >YRP K BR SJGJK W >SJRH KL ...,,hif,impf,W >CJBH JDJ <LJK,,<LJK,<L,against you,0.0,prep,prps,anim,det,prsf,simple,posture/not motion,goal/recipient,<L,,,,,
2,212564,NHR[,MT,Isaiah,2,2,NHRW,W HJH B >XRJT H JMJM NKWN JHJH HR BJT JHWH B R...,,qal,perf,W NHRW >LJW KL H GWJM,KL H GWJM,>LJW,>L,to it (the mountain),0.0,prep,prps,inanim,det,prsf,simple,factive,goal,>L,,,,,
3,212577,<LH[,MT,Isaiah,2,3,N<LH,W HLKW <MJM RBJM W >MRW LKW W N<LH >L HR JHWH ...,,qal,impf,W N<LH >L HR JHWH >L BJT >LHJ J<QB,,>L HR JHWH >L BJT >LHJ J<QB,>L HR/ JHWH/ >L BJT/ >LHJM/ J<QB/,to the mountain of Yahwe to the house of the G...,0.0,prep,topo,inanim,det,subs,complex,factive,goal,>L,>L,,,,


In [6]:
# Keep only the rows whose spatial argument type is exactly "goal"

In [7]:
# Restrict the analysis to rows annotated exactly as "goal"; copy so that
# columns added later do not touch df_complete.
is_goal = df_complete["spatial_arg_type"] == "goal"
df = df_complete.loc[is_goal].copy()

In [8]:
# Number of goal rows
df.shape[0]

412

In [9]:
# Distinct complement-construction labels among the goal rows
set(df["cmpl_constr"].unique())

{'dir-he',
 'dir_he',
 'prep',
 'prep + dir-he',
 'prep + prep',
 'prep + prep + prep',
 'vc'}

In [10]:
# Inspect the rows that combine a preposition with a directional he
df.loc[df["cmpl_constr"] == "prep + dir-he"]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,comments,reconstructed_verse
24,215112,PNH[,MT,Isaiah,8,21,PNH,W <BR BH NQCH W R<B W HJH KJ JR<B W HTQYP W QL...,,qal,perf,W PNH L M<LH,,L M<LH,L M<L/,upwards,1.0,prep + dir-he,dir,inanim,und,subs,simple,posture/not motion,goal,L,,,,,
654,32836,BW>[,MT,Exodus,8,20,JB>,W J<F JHWH KN W JB> <RB KBD BJTH PR<H W BJT <B...,,qal,wayq,W JB> <RB KBD BJTH PR<H W BJT <BDJW W B KL >RY...,<RB KBD,BJTH PR<H W BJT <BDJW W B KL >RY MYRJM,BJT/ PR<H/ W BJT/ <BD/ W B KL/ >RY/ MYRJM/,in the house of Pharaoh and the house of his s...,1.0,prep + dir-he,topo,inanim,det,subs,complex,factive,goal,B,,,,,


In [11]:
# Add a new variable: construction (prepositional vs non-prepositional, prep / non_prep)

# Map every annotated complement construction onto the binary distinction.
# Both spellings of the directional he ('dir-he' and 'dir_he') occur in the
# data, so both are listed.
mapping = {
    'dir-he': 'non_prep',
    'dir_he': 'non_prep',
    'vc': 'non_prep',
    'prep': 'prep',
    'prep + dir-he': 'prep',
    'prep + prep': 'prep',
    'prep + prep + prep': 'prep',
}

# Series.map() returns NaN for any label that is not a key of the dictionary.
# Fail loudly instead of letting such rows carry a missing construction into
# the tests that follow.
unmapped = set(df['cmpl_constr']) - set(mapping)
if unmapped:
    raise ValueError(
        f"cmpl_constr values without a construction mapping: {sorted(unmapped, key=str)}"
    )

# Create the new column 'construction' using the map function
df['construction'] = df['cmpl_constr'].map(mapping)

# Verification
print("Set of cmpl_constr for prep construction: ", set(df[df.construction == "prep"].cmpl_constr))
print("Set of cmpl_constr for non_prep construction: ", set(df[df.construction == "non_prep"].cmpl_constr))

Set of cmpl_constr for prep construction:  {'prep + prep', 'prep + dir-he', 'prep + prep + prep', 'prep'}
Set of cmpl_constr for non_prep construction:  {'vc', 'dir-he', 'dir_he'}


## 2. Chi Square Test of Independence: Cmpl Construction and Cmpl Animacy

In [12]:
# Cross-tabulate construction (rows) against complement animacy (columns)
contingency_table_anim = pd.crosstab(index=df.construction, columns=df.cmpl_anim)

# Raw counts as a plain numpy array
observed = contingency_table_anim.to_numpy()

# Chi-square test of independence on the observed counts
test_result = chi2_contingency(observed)
chi2, p, dof, expected = test_result

# Report the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate\n", expected)

Chi-Square Statistic: 48.89837235291553
P-value: 2.695743590669114e-12
Degrees of Freedom: 1
Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate
 [[ 27.78883495  79.21116505]
 [ 79.21116505 225.78883495]]


In [13]:
# Display the construction x animacy counts that were tested above
contingency_table_anim

cmpl_anim,anim,inanim
construction,Unnamed: 1_level_1,Unnamed: 2_level_1
non_prep,0,107
prep,107,198


In [14]:
# Count the prepositional complements that are inanimate
int(((df["construction"] == "prep") & (df["cmpl_anim"] == "inanim")).sum())

198

In [15]:
# Count the prepositional complements that are animate
int(((df["construction"] == "prep") & (df["cmpl_anim"] == "anim")).sum())

107

In [16]:
#df[(df.construction == "prep") & (df.cmpl_anim == "anim")].head(3)

In [17]:
# Count the non-prepositional complements that are inanimate
int(((df["construction"] == "non_prep") & (df["cmpl_anim"] == "inanim")).sum())

107

## 3. Chi Square Test of Independence: Cmpl Construction and Cmpl Definiteness

In [18]:
# Cross-tabulate construction (rows) against complement definiteness (columns)
contingency_table_det = pd.crosstab(index=df.construction, columns=df.cmpl_det)

# Raw counts as a plain numpy array
observed = contingency_table_det.to_numpy()

# Chi-square test of independence on the observed counts
test_result = chi2_contingency(observed)
chi2, p, dof, expected = test_result

# Report the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate\n", expected)

Chi-Square Statistic: 0.23746666204667752
P-value: 0.6260418421354075
Degrees of Freedom: 1
Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate
 [[101.5461165   5.4538835]
 [289.4538835  15.5461165]]


In [19]:
# Observed counts that were passed to the chi-square test in the previous cell
observed

array([[103,   4],
       [288,  17]], dtype=int64)

In [20]:
# Same counts, shown with their construction / cmpl_det labels
contingency_table_det

cmpl_det,det,und
construction,Unnamed: 1_level_1,Unnamed: 2_level_1
non_prep,103,4
prep,288,17


## 4. Different approach to constructions: prep / dir-he / vc

In [21]:
# Add a new variable: construction_2 (dir-he, vc, prep)

# Three-way classification: the two spellings of the directional he are
# merged into 'dir-he', 'vc' is kept apart, and every construction that
# involves a preposition is 'prep'.
mapping_2 = {
    'dir-he': 'dir-he',
    'dir_he': 'dir-he',
    'vc': 'vc',
    'prep': 'prep',
    'prep + dir-he': 'prep',
    'prep + prep': 'prep',
    'prep + prep + prep': 'prep',
}

# Series.map() returns NaN for any label that is not a key of the dictionary.
# Fail loudly instead of letting such rows carry a missing construction_2
# into the tests that follow.
unmapped_2 = set(df['cmpl_constr']) - set(mapping_2)
if unmapped_2:
    raise ValueError(
        f"cmpl_constr values without a construction_2 mapping: {sorted(unmapped_2, key=str)}"
    )

# Create the new column 'construction_2' using the map function
df['construction_2'] = df['cmpl_constr'].map(mapping_2)

# Verification
print("Set of cmpl_constr for prep construction: ", set(df[df.construction_2 == "prep"].cmpl_constr))
print("Set of cmpl_constr for vc construction: ", set(df[df.construction_2 == "vc"].cmpl_constr))
print("Set of cmpl_constr for dir-he construction: ", set(df[df.construction_2 == "dir-he"].cmpl_constr))

Set of cmpl_constr for prep construction:  {'prep + prep', 'prep + dir-he', 'prep + prep + prep', 'prep'}
Set of cmpl_constr for vc construction:  {'vc'}
Set of cmpl_constr for dir-he construction:  {'dir-he', 'dir_he'}


## 5. Chi Square Test of Independence: Cmpl Definiteness and Construction_2

In [22]:
# Contingency table: three-way construction (rows) x definiteness (columns)
contingency_table_det = pd.crosstab(df['construction_2'], df['cmpl_det'])

# Convert the contingency table to a numpy array
observed = contingency_table_det.values

# Perform the chi-square test
chi2, p, dof, expected = chi2_contingency(observed)

# Output the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate\n", expected)

# Check the validity condition quoted in the message above programmatically,
# so that a violation cannot be overlooked when skimming the printed matrix.
small_cells = int((expected < 5).sum())
if small_cells:
    print(
        f"WARNING: {small_cells} of {expected.size} expected frequencies are below 5 "
        f"(minimum {expected.min():.2f}); interpret the chi-square result with caution."
    )

Chi-Square Statistic: 6.164724712708357
P-value: 0.04585081264918512
Degrees of Freedom: 2
Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate
 [[ 65.48300971   3.51699029]
 [289.4538835   15.5461165 ]
 [ 36.0631068    1.9368932 ]]


In [23]:
# Observed counts that were passed to the chi-square test in the previous cell
observed

array([[ 69,   0],
       [288,  17],
       [ 34,   4]], dtype=int64)

In [24]:
# Same counts, shown with their construction_2 / cmpl_det labels
contingency_table_det

cmpl_det,det,und
construction_2,Unnamed: 1_level_1,Unnamed: 2_level_1
dir-he,69,0
prep,288,17
vc,34,4


In [25]:
# Inspect the rows where the complement is undetermined and the construction is vc
df.loc[(df["cmpl_det"] == "und") & (df["construction_2"] == "vc")]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,comments,reconstructed_verse,construction,construction_2
39,216991,JRD[,MT,Isaiah,14,11,HWRD,HWRD C>WL G>WNK HMJT NBLJK TXTJK JY< RMH W MKS...,,hof,perf,HWRD C>WL G>WNK HMJT NBLJK,G>WNK HMJT NBLJK,C>WL,C>WL/,to the underworld,0.0,vc,topo,inanim,und,subs,simple,fictive,goal,,,,,,,non_prep,vc
59,219565,<BR[,MT,Isaiah,23,12,<BRJ,W J>MR L> TWSJPJ <WD L <LWZ H M<CQH BTWLT BT Y...,,qal,impv,KTJJM <BRJ,,KTJJM,KTJ/,to Cyprus,0.0,vc,topo,inanim,und,subs,simple,factive,goal,,,,,KTJ/ not a proper noun (BH),,non_prep,vc
93,224721,<LH[,MT,Isaiah,37,24,<LJTJ,B JD <BDJK XRPT >DNJ W T>MR B RB RKBJ >NJ <LJT...,,qal,perf,B RB RKBJ >NJ <LJTJ MRWM HRJM JRKTJ LBNWN,>NJ,MRWM HRJM JRKTJ LBNWN,MRWM/ HR/ JRKH/ LBNWN/,"to the heights of the mountains, to the remote...",0.0,vc,topo,inanim,und,subs,complex,factive,goal,,,,,,,non_prep,vc
153,232056,BW>[,MT,Isaiah,58,7,TBJ>,H LW> PRS L R<B LXMK W <NJJM MRWDJM TBJ> BJT K...,,hif,impf,W <NJJM MRWDJM TBJ> BJT,,BJT,BJT/,to the house,0.0,vc,topo,inanim,und,subs,simple,factive,goal,,,,,,,non_prep,vc


## 6. Excluding sham?

In [31]:
# Drop the rows whose complement lexeme is "CM" and report how many remain
keep_row = df["cmpl_lex"] != "CM"
df_without_sham = df.loc[keep_row].copy()
df_without_sham.shape[0]

371

In [34]:
# Preview the first two remaining rows
df_without_sham.iloc[:2]

Unnamed: 0,verb_id,lex,scroll,book,chapter,verse_num,gcons_verb,gcons_verse,sign_info,stem,tense,gcons_clause,subject,complement,cmpl_lex,cmpl_translation,dir_he,cmpl_constr,cmpl_nt,cmpl_anim,cmpl_det,cmpl_indiv,cmpl_complex,motion_type,spatial_arg_type,preposition_1,preposition_2,preposition_3,preposition_4,comments,reconstructed_verse,construction,construction_2
0,212418,BW>[,MT,Isaiah,1,23,JBW>,FRJK SWRRJM W XBRJ GNBJM KLW >HB CXD W RDP CLM...,,qal,impf,W RJB >LMNH L> JBW> >LJHM,RJB >LMNH,>LJHM,>L,to them,0.0,prep,prps,anim,det,prsf,simple,fictive,goal,>L,,,,"the cause of the widow does not come to them, ...",,prep,prep
2,212564,NHR[,MT,Isaiah,2,2,NHRW,W HJH B >XRJT H JMJM NKWN JHJH HR BJT JHWH B R...,,qal,perf,W NHRW >LJW KL H GWJM,KL H GWJM,>LJW,>L,to it (the mountain),0.0,prep,prps,inanim,det,prsf,simple,factive,goal,>L,,,,,,prep,prep


In [35]:
# Contingency table: construction (rows) x definiteness (columns), "CM" rows excluded
contingency_table_det = pd.crosstab(df_without_sham['construction'], df_without_sham['cmpl_det'])

# Convert the contingency table to a numpy array
observed = contingency_table_det.values

# Perform the chi-square test.
# NOTE: with one degree of freedom chi2_contingency applies Yates' continuity
# correction by default (correction=True), moving each observed count up to
# 0.5 towards its expected value. When every observed count is already within
# 0.5 of its expectation the statistic is therefore exactly 0 and the p-value 1.
chi2, p, dof, expected = chi2_contingency(observed)

# Output the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate\n", expected)

# Check the validity condition quoted in the message above programmatically,
# so that a violation cannot be overlooked when skimming the printed matrix.
small_cells = int((expected < 5).sum())
if small_cells:
    print(
        f"WARNING: {small_cells} of {expected.size} expected frequencies are below 5 "
        f"(minimum {expected.min():.2f}); interpret the chi-square result with caution."
    )

Chi-Square Statistic: 0.0
P-value: 1.0
Degrees of Freedom: 1
Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate
 [[ 62.26415094   3.73584906]
 [287.73584906  17.26415094]]


In [36]:
# Definiteness as rows, construction as columns (sham-excluded data).
# Built directly from df_without_sham rather than by transposing the current
# value in place, so re-running this cell cannot flip the orientation back
# and change what is exported afterwards.
contingency_table_det = pd.crosstab(df_without_sham['cmpl_det'], df_without_sham['construction'])

In [37]:
# Display the definiteness x construction table for the sham-excluded data
contingency_table_det

construction,non_prep,prep
cmpl_det,Unnamed: 1_level_1,Unnamed: 2_level_1
det,62,288
und,4,17


In [38]:
# Export the table currently held in contingency_table_det
output_path = "data/sbl_perez_without_sham.csv"
contingency_table_det.to_csv(output_path)

In [39]:
# Rebuild the construction x definiteness table on the full goal subset (df);
# contingency_table_det was reassigned in the sham-excluded cells.
contingency_table_det = pd.crosstab(index=df.construction, columns=df.cmpl_det)

# Raw counts as a plain numpy array
observed = contingency_table_det.to_numpy()

# Chi-square test of independence on the observed counts
test_result = chi2_contingency(observed)
chi2, p, dof, expected = test_result

# Report the results
print("Chi-Square Statistic:", chi2)
print("P-value:", p)
print("Degrees of Freedom:", dof)
print("Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate\n", expected)

Chi-Square Statistic: 0.23746666204667752
P-value: 0.6260418421354075
Degrees of Freedom: 1
Expected Frequencies: the expected frequencies should all be >= 5 for the Chi2 test to be appropriate
 [[101.5461165   5.4538835]
 [289.4538835  15.5461165]]


In [41]:
# Definiteness as rows, construction as columns (all goal rows).
# Built directly from df rather than by transposing the current value in
# place, so re-running this cell cannot flip the orientation back and change
# what is exported afterwards.
contingency_table_det = pd.crosstab(df['cmpl_det'], df['construction'])
contingency_table_det

construction,non_prep,prep
cmpl_det,Unnamed: 1_level_1,Unnamed: 2_level_1
det,103,288
und,4,17


In [42]:
# Export the table currently held in contingency_table_det
output_path = "data/sbl_perez_with_sham.csv"
contingency_table_det.to_csv(output_path)