In [2]:
from nltk import PCFG, Nonterminal
from nltk.parse.generate import generate

def equal_production(terminals, total=1):
    """Shorthand to write a list of terminals that are all equally likely"""
    terminals = terminals.split(' | ')
    return ' | '.join([f"'{terminal}' [{total/len(terminals)}]" for terminal in terminals])

equal_production('man | woman | girl')

"'man' [0.3333333333333333] | 'woman' [0.3333333333333333] | 'girl' [0.3333333333333333]"

In [3]:
import random

# Randomly generate sentences using CFG
def weighted_choice(choices):
    total = sum(w for c, w in choices)
    r = random.uniform(0, total)
    upto = 0
    for c, w in choices:
        if upto + w >= r:
            return c
        upto += w

def generate_sentence(grammar, symbol=Nonterminal('S')):
    productions = grammar.productions(lhs=symbol)
    chosen_prod = weighted_choice([(prod, prod.prob()) for prod in productions])
    
    sentence = []
    # print(symbol)
    for sym in chosen_prod.rhs():
        if isinstance(sym, Nonterminal):
            sentence.extend(generate_sentence(grammar, sym))
        else:
            sentence.append(sym)
            
    return sentence

# Morphology
from pyfoma import *

fsts = {}
fsts['lex'] = FST.re("[a-zA-Z\+]*")

fsts['sib']       = FST.re("s|sh|z|zh|ch|x")
fsts['C']         = FST.re("[a-z] - [aeiou]")
fsts['sibrk']     = FST.re("$^rewrite('':e / $sib _ \+ s)", fsts)
fsts['yrule']     = FST.re("$^rewrite(y:(ie) / $C _ \+ s)", fsts)
fsts['cleanup']   = FST.re("$^rewrite(\+:'')")
fsts['grammar']   = FST.re("$lex @ $sibrk @ $yrule @ $cleanup", fsts)

def fix_morphology(words):
    """Combines morpheme clusters into proper words using an FST"""
    combined_words = []
    for word in words:
        if word[0] == "+":
            combined_words[-1] += word
        else:
            combined_words.append(word)
    return [list(fsts['grammar'].generate(word))[0] for word in combined_words]

def sample_sentences(grammar, n):
    sents = [generate_sentence(grammar) for _ in range(n)]
    sents = [' '.join(fix_morphology(sent)) for sent in sents]
    # sents = [sent.capitalize() + "." for sent in sents]
    return sents

In [4]:
pcfg = PCFG.fromstring(f"""
S             -> NP_3Sg_nom VP_3Sg [0.5] | NP_nom VP [0.5]

VP_3Sg        -> VT '+s' NP_acc [0.475] | VI '+s' [0.475] | VP_3Sg 'and' VP_3Sg [0.05]
VP            -> VT      NP_acc [0.475] | VI      [0.475] | VP     'and' VP     [0.05]

NP_3Sg_nom    -> 'he' [0.25] | 'she' [0.25] | NP_common_Sg [0.5]
NP_common_Sg  -> Det_Sg N_bar_common_Sg [1]
Det_Sg        -> {equal_production('the | a')}

NP_nom        -> {equal_production('I | you | we | they', total=0.5)} | NP_common_Pl [0.5]
NP_common_Pl  -> Det_Pl N_bar_common_Pl [0.8] | NP_common_Pl 'and' NP_common_Pl [0.2]
Det_Pl        -> {equal_production('the | those | these')}

NP_acc        -> {equal_production('me | you | us | them', total=0.30)} | NP_common_Pl [0.35] | NP_common_Sg [0.35]

N_bar_common_Sg  -> Adj N_bar_common_Sg [0.2] | N_bar_common_Sg Rel_Sg [0.15] | N_common [0.65]
N_bar_common_Pl  -> Adj N_bar_common_Pl [0.2] | N_bar_common_Pl Rel_Pl [0.15] | N_common '+s' [0.65]
N_common      -> {equal_production('girl | boy | cat | turtle | asparagus | duck | cheese | dude | rabbit | wug | linguist | physicist | lady | dog | cat | bird')}

Rel_Sg         -> 'that' VP_3Sg [1]
Rel_Pl         -> 'that' VP [1]

VI            -> {equal_production('run | walk | think | laugh | ponder')}
VT            -> {equal_production('kick | kiss | hug | punch | fight | love')}

Adj           -> {equal_production('big | small | happy | mad | red | blue | sparkling | shiny')}
""")

sample_sentences(pcfg, 20)

['he loves those happy dudes',
 'we ponder',
 'she kicks them',
 'those girls punch a turtle',
 'we love a wug',
 'the cat laughs',
 'these mad ladies love us',
 'a dog thinks',
 'we hug you',
 'I think',
 'those cats kick the rabbits',
 'she fights me',
 'she fights me',
 'these physicists fight these cats',
 'a small asparagus that hugs a cat and ponders kisses us',
 'the asparaguses laugh',
 'you ponder',
 'those rabbits ponder',
 'a duck fights them',
 'the turtle that fights the boys punches the bird']

In [5]:
agreement_violations = PCFG.fromstring(f"""
S             -> NP_3Sg_nom VP_3Sg [0.5] | NP_nom VP [0.5]

VP_3Sg        -> VT      NP_acc [0.475] | VI      [0.475] | VP_3Sg 'and' VP_3Sg [0.05]
VP            -> VT '+s' NP_acc [0.475] | VI '+s' [0.475] | VP     'and' VP     [0.05]

NP_3Sg_nom    -> 'he' [0.25] | 'she' [0.25] | NP_common_Sg [0.5]
NP_common_Sg  -> Det_Sg N_bar_common_Sg [1]
Det_Sg        -> {equal_production('the | a')}

NP_nom        -> {equal_production('I | you | we | they', total=0.5)} | NP_common_Pl [0.5]
NP_common_Pl  -> Det_Pl N_bar_common_Pl [0.8] | NP_common_Pl 'and' NP_common_Pl [0.2]
Det_Pl        -> {equal_production('the | those | these')}

NP_acc        -> {equal_production('me | you | us | them', total=0.30)} | NP_common_Pl [0.35] | NP_common_Sg [0.35]

N_bar_common_Sg  -> Adj N_bar_common_Sg [0.2] | N_bar_common_Sg Rel_Sg [0.15] | N_common [0.65]
N_bar_common_Pl  -> Adj N_bar_common_Pl [0.2] | N_bar_common_Pl Rel_Pl [0.15] | N_common '+s' [0.65]
N_common      -> {equal_production('girl | boy | cat | turtle | rutabaga | duck | cheese | dude | rabbit | wug | linguist | physicist | lady | dog | cat | bird')}

Rel_Sg         -> 'that' VP_3Sg [1]
Rel_Pl         -> 'that' VP [1]

VI            -> {equal_production('run | walk | think | laugh | ponder')}
VT            -> {equal_production('kick | kiss | hug | punch | fight | love')}

Adj           -> {equal_production('big | small | happy | mad | red | blue | sparkling | shiny')}
""")

sample_sentences(agreement_violations, 20)

['they kisses us',
 'we kisses these dogs',
 'she kick a dog',
 'the small ducks that loves a girl laughs',
 'I punches them',
 'they punches them',
 'those boys thinks',
 'those dogs walks',
 'the physicists fights these ducks',
 'she run',
 'the wugs thinks',
 'those rutabagas hugs them and laughs',
 'she think',
 'the wugs fights the shiny small big linguists',
 'those rutabagas walks',
 'I walks',
 'he run',
 'he walk',
 'the cheeses fights me and ponders and ponders',
 'these rutabagas ponders']

In [20]:
import datasets

valid_num = 2000
invalid_num = 200
valid = sample_sentences(pcfg, valid_num)
invalid = sample_sentences(agreement_violations, invalid_num)

dataset = datasets.Dataset.from_dict({"text": valid + invalid, "label": [1] * valid_num + [0] * invalid_num}).shuffle()
dataset = dataset.train_test_split(test_size=0.3)
dataset.push_to_hub("michaelginn/latent-trees-agreement")

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Downloading metadata:   0%|          | 0.00/481 [00:00<?, ?B/s]