In [1]:
import stanza
import pandas as pd
import re
import unicodedata
from nltk.tree import Tree
from nltk.draw.tree import TreeView
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# https://stanfordnlp.github.io/stanza/constituency.html
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', package={'constituency': 'wsj_bert'})

2023-08-06 22:16:28 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 7.02MB/s]             
2023-08-06 22:16:29 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj_bert |

2023-08-06 22:16:29 INFO: Using device: cpu
2023-08-06 22:16:29 INFO: Loading: tokenize
2023-08-06 22:16:29 INFO: Loading: pos
2023-08-06 22:16:29 INFO: Loading: constituency
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS ex

In [3]:
# Prior to constituency parse norming: read in stim CSV. Drop 'locus of uncertainty ' column...
# There should be exactly four rows per 'item' type (besides fillers)
stims = pd.read_csv("../../experiments/2_falseconsensus_syntax/experiment/policy_wordings.csv").drop(['locus_of_uncertainty'], axis = 1)
stims.head()

Unnamed: 0,item,center_embedding,passive,policy_wording
0,Escape Of Oil,yes,yes,An incident that occurs with oil that is leake...
1,Escape Of Oil,yes,no,An incident that occurs with a fixed heating i...
2,Escape Of Oil,no,no,An incident occurs with a fixed heating instal...
3,Escape Of Oil,no,yes,An incident occurs with oil that is leaked by ...
4,Vehicle Glass,yes,yes,An incident that occurs with vehicle glass tha...


In [4]:
# Step 1: constituency parse the policy wording 
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', package={'constituency': 'wsj_bert'})
stims['parsed_policy'] = stims['policy_wording'].map(lambda x : 
                                            [i.constituency for i in nlp(x.replace('"', " ").replace(".","")).sentences])

2023-08-06 22:16:34 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 1.31MB/s]             
2023-08-06 22:16:35 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj_bert |

2023-08-06 22:16:35 INFO: Using device: cpu
2023-08-06 22:16:35 INFO: Loading: tokenize
2023-08-06 22:16:35 INFO: Loading: pos
2023-08-06 22:16:35 INFO: Loading: constituency
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS ex

In [5]:
stims['parsed_policy'].head()

0    [(ROOT (S (NP (NP (DT An) (NN incident)) (SBAR...
1    [(ROOT (S (NP (NP (DT An) (NN incident)) (SBAR...
2    [(ROOT (S (NP (DT An) (NN incident)) (VP (VBZ ...
3    [(ROOT (S (NP (DT An) (NN incident)) (VP (VBZ ...
4    [(ROOT (S (NP (NP (DT An) (NN incident)) (SBAR...
Name: parsed_policy, dtype: object

In [6]:
# Step 3: render and save the constituency parses 
def render_parse(stim_row): 
    for i in range(len(stim_row['parsed_policy'])): 
        filename = "constituency-parses/" + stim_row['item'].replace(" ","") + "_" + stim_row['center_embedding'] + "cembed_" + stim_row['passive'] + "passive_" + str(i + 1)       
        tree = stim_row['parsed_policy'][i]
        TreeView(Tree.fromstring(str(tree)))._cframe.print_to_file(filename+".ps")
        os.system('convert {}.ps {}.png'.format(filename,filename))
        os.system('rm {}.ps'.format(filename))
        print(filename)

stims.apply(render_parse, axis=1)

constituency-parses/EscapeOfOil_yescembed_yespassive_1
constituency-parses/EscapeOfOil_yescembed_nopassive_1
constituency-parses/EscapeOfOil_nocembed_nopassive_1
constituency-parses/EscapeOfOil_nocembed_nopassive_2
constituency-parses/EscapeOfOil_nocembed_yespassive_1
constituency-parses/EscapeOfOil_nocembed_yespassive_2
constituency-parses/VehicleGlass_yescembed_yespassive_1
constituency-parses/VehicleGlass_yescembed_nopassive_1
constituency-parses/VehicleGlass_nocembed_nopassive_1
constituency-parses/VehicleGlass_nocembed_nopassive_2
constituency-parses/VehicleGlass_nocembed_yespassive_1
constituency-parses/VehicleGlass_nocembed_yespassive_2
constituency-parses/EmergencyDamages_yescembed_yespassive_1
constituency-parses/EmergencyDamages_yescembed_nopassive_1
constituency-parses/EmergencyDamages_nocembed_nopassive_1
constituency-parses/EmergencyDamages_nocembed_nopassive_2
constituency-parses/EmergencyDamages_nocembed_yespassive_1
constituency-parses/EmergencyDamages_nocembed_yespassi

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None
44    None
45    None
46    None
47    None
48    None
49    None
dtype: object

In [7]:
# Step 4: confirm that the constituency parses conform to our norming criteria.

def check_parse(stim): 
    print("---------")
    print("Item: {} * center_embedding: {} * passive: {}".format(stim['item'], stim.center_embedding, stim.passive))
    
    if stim.center_embedding == "yes":
                
        # confirm that 'center_embedding == yes' stims are comprised of exactly one sentence;
        try:
            num_sentences = len(stim.parsed_policy)
            if len(stim.parsed_policy) == 1:
                print("✓ consists of exactly one sentence")
            else: 
                print("X consists of more or fewer than one sentence!")
        except: 
            print("ERROR: can't count # of parsed sentences.")
        
        # ... that the first sentence has a root w/ label S;
        try: 
            root_label = stim.parsed_policy[0].children[0].label
            if root_label == 'S':
                print("✓ node directly below ROOT is of type S")
            else: 
                print("X node directly below ROOT is of wrong type: {} (should be S)".format(root_label))
        except:
            print("ERROR: can't determine label of node directly below ROOT")

        # ... that the node directly below ROOT branches into NP and VP;
        try:
            root_S_daughter1 = stim.parsed_policy[0].children[0].children[0].label
            root_S_daughter2 = stim.parsed_policy[0].children[0].children[1].label
            if root_S_daughter1 == 'NP' and root_S_daughter2 == 'VP' and len(stim.parsed_policy[0].children[0].children) == 2:
                print("✓ node directly below ROOT has exactly 2 daughters: NP and VP")
            else:
                print("X node below ROOT should have exactly 2 daughters of type NP and VP but does not")
        except: 
            print("ERROR: can't determine labels for daughters of node directly below ROOT")
            
        # ... that left daughter of root S branches into NP and SBAR; 
        try: 
            matrix_NP_daughter1 = stim.parsed_policy[0].children[0].children[0].children[0].label
            matrix_NP_daughter2 = stim.parsed_policy[0].children[0].children[0].children[1].label
            if matrix_NP_daughter1 == 'NP' and matrix_NP_daughter2 == "SBAR" and len(stim.parsed_policy[0].children[0].children[0].children) == 2: 
                print('✓ left daughter of node directly below ROOT branches into NP and SBAR')
            else:
                print('X left daughter of node directly below ROOT should branch into NP and SBAR but does not')
        except: 
             print("ERROR: can't determine labels for the (daughters of the (left daughter of root S))")
            
         # that within the highest SBAR, there is a second right-embedded SBAR at expected position (5 layers down from highest SBAR)
        try: 
            daughter_5_below_SBAR = stim.parsed_policy[0].children[0].children[0].children[1].children[1].children[0].children[1].children[1].children[1].label
            if daughter_5_below_SBAR == "SBAR":
                print("✓ 2nd right-embedded SBAR at expected position (5 layers below highest SBAR)")
            else: 
                print("X no 2nd right-embedded SBAR at expected position (5 layers below highest SBAR). Instead label is: {}".format(daughter_5_below_SBAR))
        except:
            print("ERROR: can't locate a node corresponding to expected position of 2nd embedded SBAR (5 layers below highest SBAR)")
            
    else:
        # confirm that 'center_embedding == no' stims are comprised of exactly two sentences;
        try:
            num_sentences = len(stim.parsed_policy)
            if len(stim.parsed_policy) == 2:
                print("✓ consists of exactly two sentences")
            else: 
                print("X consists of more or fewer than two sentences!")
        except: 
            print("ERROR: can't count # of parsed sentences.")
    
         # ... that the first sentence has a root w/ label S;
        try: 
            root_label = stim.parsed_policy[0].children[0].label
            if root_label == 'S':
                print("✓ first sentence's node directly below ROOT is of type S")
            else: 
                print("X first sentence's node directly below ROOT is of wrong type: {} (should be S)".format(root_label))
        except:
            print("ERROR: can't determine label of node directly below ROOT for first sentence") 
    
         # ... that the second sentence has a root w/ label S;
        try: 
            root_label = stim.parsed_policy[1].children[0].label
            if root_label == 'S':
                print("✓ second sentence's node directly below ROOT is of type S")
            else: 
                print("X second sentence's node directly below ROOT is of wrong type: {} (should be S)".format(root_label))
        except:
            print("ERROR: can't determine label of node directly below ROOT for second sentence") 
            
        # ... that for the first sentence, the node directly below ROOT branches into NP and VP;
        try:
            root_S_daughter1 = stim.parsed_policy[0].children[0].children[0].label
            root_S_daughter2 = stim.parsed_policy[0].children[0].children[1].label
            if root_S_daughter1 == 'NP' and root_S_daughter2 == 'VP' and len(stim.parsed_policy[0].children[0].children) == 2:
                print("✓ first sentence's node directly below ROOT has exactly 2 daughters: NP and VP")
            else:
                print("X first sentence's node below ROOT should have exactly 2 daughters of type NP and VP but does not")
        except: 
            print("ERROR: can't determine labels for daughters of node directly below ROOT for first sentence")
        
        # ... that for the second sentence, the node directly below ROOT branches into NP and VP;
        try:
            root_S_daughter1 = stim.parsed_policy[1].children[0].children[0].label
            root_S_daughter2 = stim.parsed_policy[1].children[0].children[1].label
            if root_S_daughter1 == 'NP' and root_S_daughter2 == 'VP' and len(stim.parsed_policy[0].children[0].children) == 2:
                print("✓ second sentence's node directly below ROOT has exactly 2 daughters: NP and VP")
            else:
                print("X second sentence's node below ROOT should have exactly 2 daughters of type NP and VP but does not")
        except: 
            print("ERROR: can't determine labels for daughters of node directly below ROOT for second sentence")
            
         # ... that for the first sentence, the left daughter of root S branches into DT and NN; 
        try: 
            matrix_NP_daughter1 = stim.parsed_policy[0].children[0].children[0].children[0].label
            matrix_NP_daughter2 = stim.parsed_policy[0].children[0].children[0].children[1].label
            if matrix_NP_daughter1 == 'DT' and matrix_NP_daughter2 == "NN" and len(stim.parsed_policy[0].children[0].children[0].children) == 2: 
                print('✓ for first sentence, left daughter of node directly below ROOT branches into DT and NN')
            else:
                print('X for first sentence, left daughter of node directly below ROOT should branch into DT and NN but does not')
        except: 
             print("ERROR: for first sentence, can't determine labels for the (daughters of the (left daughter of root S))")
                
         # ... that for the second sentence, the left daughter of root S branches into DT and NN; 
        try: 
            matrix_NP_daughter1 = stim.parsed_policy[1].children[0].children[0].children[0].label
            matrix_NP_daughter2 = stim.parsed_policy[1].children[0].children[0].children[1].label
            if matrix_NP_daughter1 == 'DT' and matrix_NP_daughter2 == "NN" and len(stim.parsed_policy[0].children[0].children[0].children) == 2: 
                print('✓ for second sentence, left daughter of node directly below ROOT branches into DT and NN')
            else:
                print('X for second sentence, left daughter of node directly below ROOT should branch into DT and NN but does not')
        except: 
             print("ERROR: for second sentence, can't determine labels for the (daughters of the (left daughter of root S))")
                
         # ... that for the first sentence, the right daughter of root S branches into VBZ and PP; 
        try: 
            matrix_NP_daughter1 = stim.parsed_policy[0].children[0].children[1].children[0].label
            matrix_NP_daughter2 = stim.parsed_policy[0].children[0].children[1].children[1].label
            if matrix_NP_daughter1 == 'VBZ' and matrix_NP_daughter2 == "PP" and len(stim.parsed_policy[0].children[0].children[0].children) == 2: 
                print('✓ for first sentence, right daughter of node directly below ROOT branches into VBZ and PP')
            else:
                print('X for first sentence, right daughter of node directly below ROOT should branch into VBZ and PP but does not')
        except: 
             print("ERROR: for first sentence, can't determine labels for the (daughters of the (right daughter of root S))")
                
        # ... that for the second sentence, the right daughter of root S branches into VBZ and PP; 
        try: 
            matrix_NP_daughter1 = stim.parsed_policy[1].children[0].children[1].children[0].label
            matrix_NP_daughter2 = stim.parsed_policy[1].children[0].children[1].children[1].label
            if matrix_NP_daughter1 == 'VBZ' and matrix_NP_daughter2 == "PP" and len(stim.parsed_policy[0].children[0].children[0].children) == 2: 
                print('✓ for second sentence, right daughter of node directly below ROOT branches into VBZ and PP')
            else:
                print('X for second sentence, right daughter of node directly below ROOT should branch into VBZ and PP but does not')
        except: 
             print("ERROR: for second sentence, can't determine labels for the (daughters of the (right daughter of root S))")
                
         # ... that within the first sentence, there is a right-embedded SBAR at expected position (5 layers down from highest SBAR)
        try: 
            rightmost_daughter_5_below_ROOT = stim.parsed_policy[0].children[0].children[1].children[1].children[1].children[1].label
            if rightmost_daughter_5_below_ROOT == "SBAR":
                print("✓ for first sentence, right-embedded SBAR at expected position (5 layers below ROOT)")
            else: 
                print("X for first sentence, no right-embedded SBAR at expected position (5 layers below ROOT). Instead label is: {}".format(daughter_5_below_SBAR))
        except:
            print("ERROR: for first sentence, can't locate a node corresponding to expected position of embedded SBAR (5 layers below ROOT)")
 
    if stim.passive == "yes" and stim.center_embedding == "no":
        # confirm that for 'passive == yes', 'center_embedding == no' stims, first sentence has two daughters of type VBZ|VBP (3rd pers. sg. present | 3rd pers. pl. present) and VP at expected position within right-embedded SBAR
        try: 
            VP_daughter_within_SBAR_1 = stim.parsed_policy[0].children[0].children[1].children[1].children[1].children[1].children[1].children[0].children[0].label
            VP_daughter_within_SBAR_2 = stim.parsed_policy[0].children[0].children[1].children[1].children[1].children[1].children[1].children[0].children[1].label
            if (VP_daughter_within_SBAR_1 == "VBZ" or VP_daughter_within_SBAR_1 == "VBP") and VP_daughter_within_SBAR_2 == "VP":
                print("✓ for first sentence, daughters VBZ|VBP and VP at expected position (under right daughter of embedded SBAR)")
            else: 
                print("X for first sentence, no VBZ|VBP and VP daughter labels at expected position (under right daughter of embedded SBAR). Labels are currently {} and {}".format(VP_daughter_within_SBAR_1, VP_daughter_within_SBAR_2))
        except: 
            print("ERROR: for first sentence, can't determine labels under right daughter of node that should correspond to embedded SBAR")

        # ... and that within aforementioned VP, we have daughters VBN (past particip.) and PP
        try: 
            passiveVP_daughter_within_SBAR_1 = stim.parsed_policy[0].children[0].children[1].children[1].children[1].children[1].children[1].children[0].children[1].children[0].label
            passiveVP_daughter_within_SBAR_2 = stim.parsed_policy[0].children[0].children[1].children[1].children[1].children[1].children[1].children[0].children[1].children[1].label
            if passiveVP_daughter_within_SBAR_1 == "VBN" and passiveVP_daughter_within_SBAR_2 == "PP":
                print("✓ for first sentence, daughters VBN and PP at expected position (under where we expect VP daughter of embedded SBAR)")
            else: 
                print("X for first sentence, no VBN and PP daughter labels at expected position (under where we expect VP daugther of embedded SBAR). Labels are currently {} and {}".format(passiveVP_daughter_within_SBAR_1, passiveVP_daughter_within_SBAR_2))
        except: 
            print("ERROR: for first sentence, can't determine labels under right daughter of node that should correspond to VP daughter of embedded SBAR")

        # ... and that aforementioned PP is headed w/ "by"
        try: 
            passiveVP_daughter_within_SBAR_2_word = stim.parsed_policy[0].children[0].children[1].children[1].children[1].children[1].children[1].children[0].children[1].children[1].children[0].children[0]
            if str(passiveVP_daughter_within_SBAR_2_word) == "by":
                print("✓ node where we expect aforementioned PP is headed w/ 'by'")
            else: 
                print("X node where we expect aforementioned PP is not headed w/ 'by' (instead {})".format(str(passiveVP_daughter_within_SBAR_2_word)))
        except:
            print("ERROR: can't determine head of node where we expect aforementioned PP")

    elif stim.passive == "yes" and stim.center_embedding == "yes":
        # confirm that for 'passive == yes', 'center_embedding == yes' stims, first sentence has two daughters of type VBZ|VBP (3rd pers. sg. present | 3rd pers. pl. present) and VP at expected position within right-embedded SBAR
        try: 
            VP_daughter_within_SBAR_1 = stim.parsed_policy[0].children[0].children[0].children[1].children[1].children[0].children[1].children[1].children[1].children[1].children[0].children[0].label
            VP_daughter_within_SBAR_2 = stim.parsed_policy[0].children[0].children[0].children[1].children[1].children[0].children[1].children[1].children[1].children[1].children[0].children[1].label
            if (VP_daughter_within_SBAR_1 == "VBZ" or VP_daughter_within_SBAR_1 == "VBP") and VP_daughter_within_SBAR_2 == "VP":
                print("✓ 2 daughters, VBZ|VBP and VP, at expected position (under right daughter of embedded SBAR)")
            else: 
                print("X no VBZ|VBP and VP daughter labels at expected position (under right daughter of embedded SBAR). Labels are currently {} and {}".format(VP_daughter_within_SBAR_1, VP_daughter_within_SBAR_2))
        except: 
            print("ERROR: can't determine labels under right daughter of node that should correspond to embedded SBAR")

         # ... and that within aforementioned VP, we have daughters VBN (past particip.) and PP
        try: 
            passiveVP_daughter_within_SBAR_1 = stim.parsed_policy[0].children[0].children[0].children[1].children[1].children[0].children[1].children[1].children[1].children[1].children[0].children[1].children[0].label
            passiveVP_daughter_within_SBAR_2 = stim.parsed_policy[0].children[0].children[0].children[1].children[1].children[0].children[1].children[1].children[1].children[1].children[0].children[1].children[1].label
            if passiveVP_daughter_within_SBAR_1 == "VBN" and passiveVP_daughter_within_SBAR_2 == "PP":
                print("✓ daughters of type VBN and PP at expected position (under where we expect VP daughter of embedded SBAR)")
            else: 
                print("X no VBN and PP daughter labels at expected position (under where we expect VP daugther of embedded SBAR). Labels are currently {} and {}".format(passiveVP_daughter_within_SBAR_1, passiveVP_daughter_within_SBAR_2))
        except: 
            print("ERROR: for first sentence, can't determine labels under right daughter of node that should correspond to VP daughter of embedded SBAR")
            
         # ... and that aforementioned PP is headed w/ "by"
        try: 
            passiveVP_daughter_within_SBAR_2_word = stim.parsed_policy[0].children[0].children[0].children[1].children[1].children[0].children[1].children[1].children[1].children[1].children[0].children[1].children[1].children[0].children[0]
            if str(passiveVP_daughter_within_SBAR_2_word) == "by":
                print("✓ node where we expect aforementioned PP is headed w/ 'by'")
            else: 
                print("X node where we expect aforementioned PP is not headed w/ 'by' (instead {})".format(str(passiveVP_daughter_within_SBAR_2_word)))
        except:
            print("ERROR: can't determine head of node where we expect aforementioned PP")
           
stims.apply(check_parse, axis=1)

---------
Item: Escape Of Oil * center_embedding: yes * passive: yes
✓ consists of exactly one sentence
✓ node directly below ROOT is of type S
✓ node directly below ROOT has exactly 2 daughters: NP and VP
✓ left daughter of node directly below ROOT branches into NP and SBAR
✓ 2nd right-embedded SBAR at expected position (5 layers below highest SBAR)
✓ 2 daughters, VBZ|VBP and VP, at expected position (under right daughter of embedded SBAR)
✓ daughters of type VBN and PP at expected position (under where we expect VP daughter of embedded SBAR)
✓ node where we expect aforementioned PP is headed w/ 'by'
---------
Item: Escape Of Oil * center_embedding: yes * passive: no
✓ consists of exactly one sentence
✓ node directly below ROOT is of type S
✓ node directly below ROOT has exactly 2 daughters: NP and VP
✓ left daughter of node directly below ROOT branches into NP and SBAR
✓ 2nd right-embedded SBAR at expected position (5 layers below highest SBAR)
---------
Item: Escape Of Oil * center_

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
40    None
41    None
42    None
43    None
44    None
45    None
46    None
47    None
48    None
49    None
dtype: object