In [2]:
import itertools
import sys
from nltk.grammar import Nonterminal

In [84]:
def generate(grammar, start=None, depth=None, n=None):
    """
    Generates an iterator of all sentences from a CFG.

    :param grammar: The Grammar used to generate sentences.
    :param start: The Nonterminal from which to start generate sentences.
    :param depth: The maximal depth of the generated tree.
    :param n: The maximum number of sentences to return.
    :return: An iterator of lists of terminal tokens.
    """
    if not start:
        start = grammar.start()
    if depth is None:
        depth = sys.maxsize

    iter = _generate_all(grammar, [start], depth)

    if n:
        iter = itertools.islice(iter, n)

    return iter



def _generate_all(grammar, items, depth):
    if items:
        try:
            for frag1 in _generate_one(grammar, items[0], depth):
                for frag2 in _generate_all(grammar, items[1:], depth):
                    yield frag1 + frag2
        except RuntimeError as _error:
            if _error.message == "maximum recursion depth exceeded":
                # Helpful error message while still showing the recursion stack.
                raise RuntimeError(
                    "The grammar has rule(s) that yield infinite recursion!!"
                )
            else:
                raise
    else:
        yield []


def _generate_one(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth - 1):
                    yield frag
        else:
            yield [item]

O =   ["name", "date of birth", "first name", "last name", "amount owed", "address", "registration number", "company number"]
Os = (" [{0}] | ".format(1 / len(O))).join(["'{0}'".format(o) for o in O])            
Os += " [{0}] ".format(1 / len(O))

Det = ["the"] 
Dets = "'{0}' [1.0]".format(Det[0])

Conj = ["and"]
Conjs = "'{0}' [1.0]".format(Conj[0])

Comm = [","]
Comms = "'{0}' [1.0]".format(Comm[0])

Pos = ["of", "to"]
Poss = (" [{0}] | ".format(1 / len(Pos))).join(["'{0}'".format(p) for p in Pos])            
Poss += " [{0}] ".format(1 / len(Pos))

O2 = ["employee", "customer", "client", "company", "payee", "recipient"]
O2s = (" [{0}] | ".format(1 / len(O2))).join(["'{0}'".format(o2) for o2 in O2])            
O2s += " [{0}] ".format(1 / len(O2))

question_grammar = """
  S -> QP T [1.0]
  QP -> Q PR [1.0]
  PR -> Det OP [0.333] | Det O Conj O Pos Det O2 [0.333] | Det O Comm O Conj O Pos Det O2 [0.333]
  OP -> O Pos Det O2 [0.5] | O2 Ap O [0.5]
  Q -> 'what is' [1]
  Det -> """ + Dets + """
  Conj -> """ + Conjs + """
  Comm -> """ + Comms + """
  O -> """ + Os + """
  Pos -> """ + Poss + """
  Ap -> 's' [1.0]
  O2 -> """ + O2s + """
  T -> '?' [1.0]
"""
question_grammar



answer_grammar = """
  S -> Det O Pos Det O2 V SL [0.25] | Det O Pos Det O2 V SL Conj Det O V SL [0.25] | Det O2 Ap O V SL [0.25] | Det O2 Ap O V SL Conj O V SL [0.25]
  O2 -> """ + O2s + """
  V -> 'is' [1.0]
  SL -> '{SLOT}' [1.0]
  Ap -> 's' [1.0]
  O -> """ + Os + """
  Det -> """ + Dets + """
  Conj -> """ + Conjs + """
  Comm -> """ + Comms + """
  Pos -> """ + Poss 

answer_grammar

"\n  S -> Det O Pos Det O2 V SL [0.25] | Det O Pos Det O2 V SL Conj Det O V SL [0.25] | Det O2 Ap O V SL [0.25] | Det O2 Ap O V SL Conj O V SL [0.25]\n  O2 -> 'employee' [0.16666666666666666] | 'customer' [0.16666666666666666] | 'client' [0.16666666666666666] | 'company' [0.16666666666666666] | 'payee' [0.16666666666666666] | 'recipient' [0.16666666666666666] \n  V -> 'is' [1.0]\n  SL -> '{SLOT}' [1.0]\n  Ap -> 's' [1.0]\n  O -> 'name' [0.125] | 'date of birth' [0.125] | 'first name' [0.125] | 'last name' [0.125] | 'amount owed' [0.125] | 'address' [0.125] | 'registration number' [0.125] | 'company number' [0.125] \n  Det -> 'the' [1.0]\n  Conj -> 'and' [1.0]\n  Comm -> ',' [1.0]\n  Pos -> 'of' [0.5] | 'to' [0.5] "

In [85]:
from nltk.grammar import CFG, PCFG
N = 10000
grammar_q = PCFG.fromstring(question_grammar)
grammar_a = PCFG.fromstring(answer_grammar)

with open('./out/questions.csv', 'w') as outfile:
    for n, sent in enumerate(generate(grammar_q, n=N), 1):
        #print('%3d. %s' % (n, ' '.join(sent)))
        outfile.write(' '.join(sent) + '\n')

with open('./out/answers.csv', 'w') as outfile:
    for n, sent in enumerate(generate(grammar_a, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))

  1. the name of the employee is {SLOT}
  2. the name of the customer is {SLOT}
  3. the name of the client is {SLOT}
  4. the name of the company is {SLOT}
  5. the name of the payee is {SLOT}
  6. the name of the recipient is {SLOT}
  7. the name to the employee is {SLOT}
  8. the name to the customer is {SLOT}
  9. the name to the client is {SLOT}
 10. the name to the company is {SLOT}
 11. the name to the payee is {SLOT}
 12. the name to the recipient is {SLOT}
 13. the date of birth of the employee is {SLOT}
 14. the date of birth of the customer is {SLOT}
 15. the date of birth of the client is {SLOT}
 16. the date of birth of the company is {SLOT}
 17. the date of birth of the payee is {SLOT}
 18. the date of birth of the recipient is {SLOT}
 19. the date of birth to the employee is {SLOT}
 20. the date of birth to the customer is {SLOT}
 21. the date of birth to the client is {SLOT}
 22. the date of birth to the company is {SLOT}
 23. the date of birth to the payee is {SLOT}
 2

In [86]:
answers = list(" ".join(g) for g in generate(grammar_a, n=N))
answers

['the name of the employee is {SLOT}',
 'the name of the customer is {SLOT}',
 'the name of the client is {SLOT}',
 'the name of the company is {SLOT}',
 'the name of the payee is {SLOT}',
 'the name of the recipient is {SLOT}',
 'the name to the employee is {SLOT}',
 'the name to the customer is {SLOT}',
 'the name to the client is {SLOT}',
 'the name to the company is {SLOT}',
 'the name to the payee is {SLOT}',
 'the name to the recipient is {SLOT}',
 'the date of birth of the employee is {SLOT}',
 'the date of birth of the customer is {SLOT}',
 'the date of birth of the client is {SLOT}',
 'the date of birth of the company is {SLOT}',
 'the date of birth of the payee is {SLOT}',
 'the date of birth of the recipient is {SLOT}',
 'the date of birth to the employee is {SLOT}',
 'the date of birth to the customer is {SLOT}',
 'the date of birth to the client is {SLOT}',
 'the date of birth to the company is {SLOT}',
 'the date of birth to the payee is {SLOT}',
 'the date of birth to th

In [87]:
import re, string
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
import random
from random import randint
import datetime


path = "/virtualmachines/data/companies"
files = [join(path,f) for f in listdir(path) if isfile(join(path, f)) and f.endswith(".xml")]

def generate_names():
    with open(random.choice(files), "r") as infile:
        for line in infile:
            z = re.search(r'<NonIndividualNameText>([^</]+)</NonIndividualNameText>',line)
            if z:
                yield " ".join([n.capitalize() for n in z.group(1).replace("&amp;", "&").split(" ")])
names = generate_names()                
def generate_date():
    formats = ['%d, %b %Y', '%d %b %Y', '%d %B %Y', '%d %m %Y','%d-%m-%Y', '%d/%m/%Y','%Y-%m-%d', '%Y/%m/%d']
    return datetime.date(randint(1950,2025), randint(1,12),randint(1,28)).strftime(random.choice(formats))

def generate_id():
    return ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(random.randint(3,20))).upper()
    
def generate_amount():
    ret_string = "";
    if random.random() > 0.5:
        ret_string = random.choice(["US","AU","A", "US$", "$US", "AU$", "$AU", "A$", "$A", "GBP", "GBP£", "£GBP"])
        
    if random.random() > 0.5:
        ret_string = ret_string + "{:,}".format(random.randint(0,100000))
    else:
        ret_string = ret_string + "{:}".format(random.randint(0,100000))
    
    if random.random() > 0.25:
        ret_string += "." + str(random.randint(0,99));
    return ret_string

def generate_address():
    ret_string = str(random.randint(1,1000))
    if random.random() > 0.25:
        ret_string += random.choice(string.ascii_uppercase)
    if random.random() > 0.25:
        ret_string += "/" + str(random.randint(1,1000))
    ret_string += " " + random.choice(next(names).split(" ")).capitalize()
    ret_string += " " + random.choice(["St", "Street", "Place", "Pl", "Road", "Rd", "Way", "Wy", "Alley", "All", "Lane", "Ln"])
    if random.random() > 0.25:
        ret_string += ","
    ret_string += " " + random.choice(next(names).split(" ")).capitalize()
    if random.random() > 0.25:
        ret_string += ","
    ret_string += " " + random.choice(["SA", "South Australia", "Victoria", "Queensland", "QLD", "Qld", "New South Wales", "NSW", "Greensborough", "VA", "Virginia", "Massachusetts", "MA", 
                                       "Tasmania", "Tas", "WA", "TX", "Texas", "Washington", "HI", "Hawaii"])
    if random.random() > 0.25:        
        ret_string += " " + "".join(random.choice(string.digits) for _ in range(random.randint(4,8)))
    return ret_string


In [88]:
filled = []
for answer in answers:
    answer = answer.replace(" s ", "'s ")    
    slots = []
    while "{SLOT}" in answer:
        search = re.search(r"(first|last)? name ([^{SLOT}]+) {SLOT}", answer)
        if search:
            name = next(names)
            answer = re.sub(r"(first|last)? name ([^{SLOT}]+) {SLOT}", r"\1 name \2 " + name, answer, 1)
            slots.append((search.groups()[0] + " name" if search.groups()[0] is not None else "name", name))
        search = re.search(r"date of birth ([^{SLOT}]+) {SLOT}", answer)
        if search:
            dob = generate_date()
            answer = re.sub(r"date of birth ([^{SLOT}]+) {SLOT}", r"date of birth \1 " + dob, answer, 1)
            slots.append(("date of birth", dob))
        search = re.search(r"owed ([^{SLOT}]+) {SLOT}", answer)
        if search:
            amount = generate_amount();
            answer = re.sub(r"owed ([^{SLOT}]+) {SLOT}", r"owed \1 " + amount, answer, 1)
            slots.append(("amount owed", amount))
        search = re.search(r"address ([^{SLOT}]+) {SLOT}", answer)
        if search:
            address = generate_address()
            answer = re.sub(r"address ([^{SLOT}]+) {SLOT}", r"address \1 " + address, answer, 1)
            slots.append(("address", address))
        search = re.search(r"(registration|company) number ([^{SLOT}]+) {SLOT}", answer)
        if search:
            gen_id = generate_id()
            answer = re.sub(r"(registration|company) number ([^{SLOT}]+) {SLOT}", r"\1 number \2 " + gen_id, answer, 1)
            slots.append((search.groups()[0] + " number", gen_id))
        valid = re.search("(name|address|birth|number) to|owed of|number to|date of birth of the company", answer) == None
        valid = valid and all([len(re.findall(o, answer)) <= 1 for o in O if o != "'name'"])
    filled.append((answer, valid, slots))
filled

[('the name of the employee is Evolution Polymers Pty Ltd',
  True,
  [('name', 'Evolution Polymers Pty Ltd')]),
 ('the name of the customer is The Trustee For The Remzi Mulla & Henry Mulla Family Trust',
  True,
  [('name', 'The Trustee For The Remzi Mulla & Henry Mulla Family Trust')]),
 ('the name of the client is Mum & Bub Pty. Ltd.',
  True,
  [('name', 'Mum & Bub Pty. Ltd.')]),
 ('the name of the company is Telcollect Pty Ltd',
  True,
  [('name', 'Telcollect Pty Ltd')]),
 ('the name of the payee is Kerrie Knight', True, [('name', 'Kerrie Knight')]),
 ('the name of the recipient is Wigram St Pty Ltd',
  True,
  [('name', 'Wigram St Pty Ltd')]),
 ('the name to the employee is Casax Pty Ltd',
  False,
  [('name', 'Casax Pty Ltd')]),
 ('the name to the customer is Target Sales Force Pty Ltd',
  False,
  [('name', 'Target Sales Force Pty Ltd')]),
 ('the name to the client is Coldstream Industrial Services Pty Ltd',
  False,
  [('name', 'Coldstream Industrial Services Pty Ltd')]),
 ('

In [89]:
all([len(re.findall(o, "the first name of the employee is Cranium Productions and the first name is The Trustee For Glen Oak Trust")) <= 1 for o in O if o != "'name'"])

False

In [None]:
  Q -> 'what' [0.1428] | 'who' [0.1428] | 'where' [0.1428] | 'when' [0.1428] | 'how much' [0.1428] | 'will' [0.1428] | 'which' [0.1428]
