In [18]:
import spacy
import textacy
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Load the large English spaCy pipeline once; the parsing cells below all share `nlp`.
# NOTE(review): the anaphor cell reads `doc._.coref_clusters`, a custom extension
# attribute that nothing visible in this notebook registers -- presumably a
# coreference component has to be added to `nlp` first; confirm.
nlp = spacy.load("en_core_web_lg")

# BLiMP

In [6]:
# Output directory for the BLiMP CSVs written by the cells of this section;
# created together with any missing parents, and left alone if it already exists.
# NOTE: `target_path` is rebound by the SVA and IOI sections further down, so
# rerun this cell before rerunning any BLiMP cell.
target_path = Path("data/blimp_with_targets")
target_path.mkdir(parents=True, exist_ok=True)

In [36]:
# Anaphor agreement: for every grammatical sentence, record the antecedent of
# the anaphor (its head word and its full phrase) and save the result as CSV.
file_names = ["anaphor_gender_agreement", "anaphor_number_agreement"]

for file_name in file_names:
    content = pd.read_json(
        path_or_buf=f"data/blimp/{file_name}.jsonl",
        lines=True
    )

    targets, targets_phrases = [], []
    for row in content.itertuples():

        doc = nlp(row.sentence_good)

        # Locate the anaphor token. It must be reset for every row: without the
        # reset, a sentence in which no token equals `one_prefix_word_good`
        # would silently reuse the token of the previous sentence (or raise
        # NameError on the very first row). When the word occurs several times
        # the last occurrence is kept.
        target_token = None
        for token in doc:
            if token.text == row.one_prefix_word_good:
                target_token = token

        # Candidate antecedents: every other mention of a cluster that contains
        # a mention headed by the anaphor.
        target = []
        if target_token is not None:
            # NOTE(review): `coref_clusters` is a custom extension attribute; it
            # is assumed that iterating a cluster yields mention spans -- confirm
            # against the coreference component actually installed.
            for cluster in doc._.coref_clusters:
                for mention in cluster:
                    if mention.root.text == target_token.text:
                        target = [
                            other for other in cluster
                            if other.text != target_token.text
                        ]
                        # Leaves only the mention loop; later clusters are
                        # still inspected and may overwrite `target`.
                        break

        # Keep the antecedent only when it is unambiguous (exactly one candidate).
        if len(target) == 1:
            antecedent = target[0]
            targets.append(antecedent.root.text)
            targets_phrases.append(antecedent.text)
        else:
            targets.append("NA")
            targets_phrases.append("NA")

    content["target"] = targets
    content["target_phrase"] = targets_phrases

    content.to_csv(target_path / f"{file_name}.csv")

In [37]:
# main verb
# Argument Structure (asp)
#
# The target of each grammatical sentence is the first token tagged VERB.

file_name = "animate_subject_passive"

content = pd.read_json(
    path_or_buf=f"data/blimp/{file_name}.jsonl",
    lines=True
)

main_verbs = []
for sent in content.sentence_good:
    verb_spans = list(textacy.extract.matches.token_matches(
        nlp(sent),
        patterns=[{"POS": "VERB"}]
    ))
    # Store the token text rather than the spaCy Token object, and fall back to
    # "NA" (the placeholder used by the other cells) instead of raising
    # IndexError when the tagger finds no VERB in the sentence.
    main_verbs.append(verb_spans[0][0].text if verb_spans else "NA")

content["target"] = main_verbs

content.to_csv(target_path / f"{file_name}.csv")

In [35]:
# det noun
# Determiner-Noun Agreement
#
# The target of each grammatical sentence is its demonstrative determiner.

file_names = [
    "determiner_noun_agreement_1",
    "determiner_noun_agreement_irregular_1",
    "determiner_noun_agreement_with_adjective_1",
    "determiner_noun_agreement_with_adj_irregular_1",
]

for file_name in file_names:
    content = pd.read_json(f"data/blimp/{file_name}.jsonl", lines=True)

    targets = []
    for row in content.itertuples():
        words = row.sentence_good.split(" ")
        # Demonstratives present as whole space-delimited words, listed in the
        # fixed priority order this/that/those/these; the first one is the target.
        hits = [dem for dem in ("this", "that", "those", "these") if dem in words]
        if hits:
            targets.append(hits[0])

    # Every sentence must have contributed exactly one determiner.
    assert len(targets) == len(content)

    content["target"] = targets
    content.to_csv(target_path / f"{file_name}.csv")

In [36]:
# npi
# NPI Licensing
#
# Every row of this file gets the same constant target word.

file_name = "npi_present_1"

content = pd.read_json(f"data/blimp/{file_name}.jsonl", lines=True).assign(target="Even")

content.to_csv(target_path / f"{file_name}.csv")

# Subject Verb Agreement (SVA and darn)

In [2]:
# Output directory for the subject-verb-agreement CSVs; created together with
# any missing parents, and left alone if it already exists.
# NOTE: this rebinds the `target_path` that the BLiMP section set earlier.
target_path = Path("data/sva_with_targets")
target_path.mkdir(parents=True, exist_ok=True)


In [9]:
def get_subject_phrase(doc):
    """Return the phrase of the first subject-like token in ``doc``.

    The first token whose dependency label contains ``"subj"`` is selected, and
    ``doc`` is sliced from the index of the first element of that token's
    subtree up to and including the index of its last element.

    Returns ``None`` when no token carries such a label.
    """
    subject = next((tok for tok in doc if "subj" in tok.dep_), None)
    if subject is None:
        return None

    members = list(subject.subtree)
    first, last = members[0], members[-1]
    return doc[first.i:last.i + 1]

In [3]:
# subj verb
# Subject-Verb Agreement
#
# For every grammatical sentence, record the subject found by
# `get_subject_phrase` (head word and full phrase) and save the result as CSV.

from extract_explanations import read_sva_dataset

# To process the BLiMP variant instead, set dataset = 'blimp' and
# file_names = ["distractor_agreement_relational_noun"].
dataset = 'sva'
file_names = ['lgd_dataset']

# Column names of the header-less TSV files.
tsv_columns = ["num_attractors", "sentence_good", "one_prefix_prefix", "one_prefix_word_good", "one_prefix_word_bad"]

for file_name in file_names:

    if dataset == 'blimp':
        content = pd.read_json(
            path_or_buf=f"data/{dataset}/{file_name}.jsonl",
            lines=True
        )
    else:
        # Build the path from `dataset` and `file_name` so each entry of
        # `file_names` reads its own file (it resolves to
        # ./data/sva/lgd_dataset.tsv for the values set above).
        content = pd.read_csv(f'./data/{dataset}/{file_name}.tsv', sep='\t', names=tsv_columns)

    targets, targets_phrases = [], []
    for sent in tqdm(content.sentence_good):
        doc = nlp(sent)

        subject_phrase = get_subject_phrase(doc)

        # "NA" marks sentences for which no (non-empty) subject phrase was found.
        if subject_phrase:
            targets.append(subject_phrase.root.text)
            targets_phrases.append(subject_phrase.text)
        else:
            targets.append("NA")
            targets_phrases.append("NA")

    content["target"] = targets
    content["target_phrase"] = targets_phrases

    content.to_csv(target_path / f"{file_name}.csv")

100%|██████████| 29985/29985 [04:01<00:00, 124.14it/s]


In [16]:
# Reload the raw SVA file (it has no header row) and show the rows whose
# `num_attractors` equals 1.
sva_columns = ["num_attractors", "sentence_good", "one_prefix_prefix", "one_prefix_word_good", "one_prefix_word_bad"]
content = pd.read_csv('./data/sva/lgd_dataset.tsv', sep='\t', names=sva_columns)
content.loc[content["num_attractors"] == 1]

Unnamed: 0,num_attractors,sentence_good,one_prefix_prefix,one_prefix_word_good,one_prefix_word_bad
0,1,a 12th-century commentary on periegetes by eus...,a 12th-century commentary on periegetes by eus...,compares,compare
1,1,"a 13-year boy named toby lolness , who is just...","a 13-year boy named toby lolness , who is just...",lives,live
2,1,"a 16-year-old , second-year high school studen...","a 16-year-old , second-year high school studen...",resembles,resemble
3,1,a 1770s map of philadelphia 's naval defenses ...,a 1770s map of philadelphia 's naval defenses ...,shows,show
4,1,"a 1794 plan of the 'castle ' exists , this onl...","a 1794 plan of the 'castle ' exists , this onl...",shows,show
...,...,...,...,...,...
29977,1,"zinc-finger genes , particularly those that in...","zinc-finger genes , particularly those that in...",exist,exists
29979,1,"zip file format for all users , the appnote pr...","zip file format for all users , the appnote **...",provides,provide
29980,1,"zirconium powder can cause irritation , but on...","zirconium powder can cause irritation , but on...",requires,require
29981,1,zöllner 's illusion and the café wall illusion...,zöllner 's illusion and the café wall illusion...,causes,cause


# IOI

In [None]:
# IOI
from datasets import load_dataset
import numpy as np

# Base name of the CSV written by the extraction cell.
file_name = 'ioi_dataset'

# Output directory for the IOI CSV; created with parents, kept if present.
target_path = Path("data/ioi_with_targets")
target_path.mkdir(exist_ok=True, parents=True)

# Fetch the IOI dataset from HuggingFace and keep rows 0..999 of its train split.
first_rows = np.arange(0, 1000)
dataset = load_dataset("fahamu/ioi")
content = dataset['train'].select(first_rows)


In [5]:
# Build the prefix / good-word / bad-word / target columns for the IOI sentences.
proper_names = []
targets = []
# Prefix
one_prefix_prefix = []
one_prefix_word_good = []
one_prefix_word_bad = []

content = pd.DataFrame(content)
for sent in tqdm(content['ioi_sentences']):
    doc = nlp(sent)
    words = sent.split()

    # The prefix is everything except the last whitespace-delimited word; that
    # last word is both the correct continuation and the target.
    one_prefix_prefix.append(' '.join(words[:-1]))
    one_prefix_word_good.append(words[-1])
    targets.append(words[-1])

    first_word = doc[0].text
    last_name = doc[-1].text

    # The wrong continuation is the first proper noun that is neither the
    # sentence-initial token nor the final token.
    for token in doc:
        if token.pos_ == "PROPN" and token.text not in (first_word, last_name):
            one_prefix_word_bad.append(token.text)
            break
    else:
        # Keep the list aligned with the rows: without this placeholder a
        # single sentence lacking such a proper noun makes the column
        # assignment below fail on a length mismatch.
        one_prefix_word_bad.append("NA")

# NOTE(review): the other cells use a lowercase `sentence_good` column; the
# capitalised name is kept because the consumers of this CSV are not visible here.
content = content.rename(columns={'ioi_sentences': 'Sentence_good'})
content["one_prefix_prefix"] = one_prefix_prefix
content["one_prefix_word_good"] = one_prefix_word_good
content["one_prefix_word_bad"] = one_prefix_word_bad
content["target"] = targets


content.to_csv(target_path / f"{file_name}.csv")

100%|███████████████████████████████████████| 1000/1000 [00:11<00:00, 90.00it/s]


In [7]:
content

Unnamed: 0,Sentence_good,one_prefix_prefix,one_prefix_word_good,one_prefix_word_bad,target
0,Friends Juana and Kristi found a mango at the ...,Friends Juana and Kristi found a mango at the ...,Juana,Kristi,Juana
1,"Then, Yvette and Angie were working at the mou...","Then, Yvette and Angie were working at the mou...",Angie,Yvette,Angie
2,"After Doris and Marsha went to the mountain, M...","After Doris and Marsha went to the mountain, M...",Doris,Marsha,Doris
3,While Bernadette and Harriet were commuting to...,While Bernadette and Harriet were commuting to...,Harriet,Bernadette,Harriet
4,"Afterwards, Ginger and Bernadette went to the ...","Afterwards, Ginger and Bernadette went to the ...",Ginger,Bernadette,Ginger
...,...,...,...,...,...
995,"Afterwards, Gwen and Chelsea went to the beach...","Afterwards, Gwen and Chelsea went to the beach...",Chelsea,Gwen,Chelsea
996,The river Amanda and Carol went to had a canta...,The river Amanda and Carol went to had a canta...,Amanda,Carol,Amanda
997,While Sonia and Marian were commuting to the t...,While Sonia and Marian were commuting to the t...,Marian,Sonia,Marian
998,The store Janet and Leigh went to had a apple....,The store Janet and Leigh went to had a apple....,Janet,Leigh,Janet
