In [1]:
import stanza
import pandas as pd
import re
import unicodedata
from nltk.tree import Tree
from nltk.draw.tree import TreeView
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# https://stanfordnlp.github.io/stanza/constituency.html
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')

2023-08-04 11:17:14 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 3.22MB/s]
2023-08-04 11:17:15 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-08-04 11:17:15 INFO: Using device: cpu
2023-08-04 11:17:15 INFO: Loading: tokenize
2023-08-04 11:17:15 INFO: Loading: pos
2023-08-04 11:17:15 INFO: Loading: constituency
2023-08-04 11:17:16 INFO: Done loading processors!


In [3]:
# Prior to constituency parse norming: read in stim CSV. Drop 'version' and 'continuation', and 'locus of uncertainty ' columns and drop duplicates.
# There should be exactly four rows per 'item' type - if more, then something is amiss. (Possibly inconsistent/incorrect "header" somewhere within item set). 
stims = pd.read_csv("../../experiments/2_falseconsensus_syntax/experiment/stimuli.csv").drop(['version', 'continuation', 'locus of uncertainty '], axis = 1).drop_duplicates()
# Notice that there are 5 rows for 'Escape of Oil' where there should be 4:
stims

Unnamed: 0,item,center_embedding,passive,header
0,Escape Of Oil,yes,yes,"Mariam has insurance that covers ""Escape of Oi..."
1,Escape Of Oil,yes,no,"Mariam has insurance that covers ""Escape of Oi..."
2,Escape Of Oil,no,no,"Mariam has insurance that covers ""Escape of Oi..."
3,Escape Of Oil,no,yes,"Mariam has insurance that covers ""Escape of Oi..."
7,Escape Of Oil,no,no,"Mariam has insurance that covers ""Escape of Oi..."
12,Vehicle Glass,yes,yes,"Aneesh has insurance that covers ""Vehicle Glas..."
13,Vehicle Glass,yes,no,"Aneesh has insurance that covers ""Vehicle Glas..."
14,Vehicle Glass,no,no,"Aneesh has insurance that covers ""Vehicle Glas..."
15,Vehicle Glass,no,yes,"Aneesh has insurance that covers ""Vehicle Glas..."
24,Emergency Damages,yes,yes,"Joanne has insurance that covers ""Emergency Da..."


In [4]:
# Step 1: grep for policy wording (all and only the text between the <b> and </b> tags in 'header')
stims['policy'] = stims['header'].map(lambda x : 
                                      re.findall(r'<b>(.*?)</b>', unicodedata.normalize("NFKD", x))[0])

In [5]:
# Step 2: constituency parse the policy wording 
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency')
stims['parsed_policy'] = stims['policy'].map(lambda x : 
                                            [i.constituency for i in nlp(x.replace('"', " ").replace(".","")).sentences])

2023-08-04 11:17:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json: 216kB [00:00, 2.26MB/s]
2023-08-04 11:17:22 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-08-04 11:17:22 INFO: Using device: cpu
2023-08-04 11:17:22 INFO: Loading: tokenize
2023-08-04 11:17:22 INFO: Loading: pos
2023-08-04 11:17:22 INFO: Loading: constituency
2023-08-04 11:17:22 INFO: Done loading processors!


In [6]:
stims['parsed_policy'].head()

0    [(ROOT (S (NP (NP (DT An) (NN incident)) (SBAR...
1    [(ROOT (S (NP (NP (DT An) (NN incident)) (SBAR...
2    [(ROOT (S (NP (DT An) (NN incident)) (VP (VBZ ...
3    [(ROOT (S (NP (DT An) (NN incident)) (VP (VBZ ...
7    [(ROOT (S (NP (DT An) (NN incident)) (VP (VBZ ...
Name: parsed_policy, dtype: object

In [7]:
# Step 3: render and save the constituency parses 
stims = stims.rename({"item ": "item"},axis='columns')
def render_parse(stim_row): 
    for i in range(len(stim_row['parsed_policy'])): 
        filename = "constituency-parses/" + stim_row['item'].replace(" ","") + "_" + stim_row['center_embedding'] + "cembed_" + stim_row['passive'] + "passive_" + str(i + 1)       
        tree = stim_row['parsed_policy'][i]
        TreeView(Tree.fromstring(str(tree)))._cframe.print_to_file(filename+".ps")
        os.system('convert {}.ps {}.png'.format(filename,filename))
        os.system('rm {}.ps'.format(filename))
        print(filename)

stims.apply(render_parse, axis=1)

constituency-parses/EscapeOfOil_yescembed_yespassive_1
constituency-parses/EscapeOfOil_yescembed_nopassive_1
constituency-parses/EscapeOfOil_nocembed_nopassive_1
constituency-parses/EscapeOfOil_nocembed_nopassive_2
constituency-parses/EscapeOfOil_nocembed_yespassive_1
constituency-parses/EscapeOfOil_nocembed_yespassive_2
constituency-parses/EscapeOfOil_nocembed_nopassive_1
constituency-parses/EscapeOfOil_nocembed_nopassive_2
constituency-parses/VehicleGlass_yescembed_yespassive_1
constituency-parses/VehicleGlass_yescembed_nopassive_1
constituency-parses/VehicleGlass_nocembed_nopassive_1
constituency-parses/VehicleGlass_nocembed_nopassive_2
constituency-parses/VehicleGlass_nocembed_yespassive_1
constituency-parses/VehicleGlass_nocembed_yespassive_2
constituency-parses/EmergencyDamages_yescembed_yespassive_1
constituency-parses/EmergencyDamages_yescembed_nopassive_1
constituency-parses/EmergencyDamages_nocembed_nopassive_1
constituency-parses/EmergencyDamages_nocembed_nopassive_2
constit

0      None
1      None
2      None
3      None
7      None
12     None
13     None
14     None
15     None
24     None
25     None
26     None
27     None
36     None
37     None
38     None
39     None
48     None
49     None
50     None
51     None
60     None
61     None
62     None
63     None
72     None
73     None
74     None
75     None
84     None
85     None
86     None
87     None
96     None
97     None
98     None
99     None
108    None
109    None
110    None
111    None
120    None
121    None
122    None
123    None
132    None
133    None
134    None
135    None
144    None
145    None
dtype: object

In [None]:
# Step 4: confirm that the constituency parses conform to some set of criteria 