In [1]:
import itertools
import sys
from nltk.grammar import Nonterminal

In [33]:
def generate(grammar, start=None, depth=None, n=None):
    """
    Generates an iterator of all sentences from a CFG.

    :param grammar: The Grammar used to generate sentences.
    :param start: The Nonterminal from which to start generate sentences.
        When omitted (or falsy), ``grammar.start()`` is used.
    :param depth: The maximal depth of the generated tree.  ``None`` is
        replaced by ``sys.maxsize``.
    :param n: The maximum number of sentences to return.  ``None`` means
        no limit; ``0`` yields no sentences at all.
    :return: An iterator of lists of terminal tokens.
    """
    if not start:
        start = grammar.start()
    if depth is None:
        depth = sys.maxsize

    sentences = _generate_all(grammar, [start], depth)

    # Test against None rather than truthiness: with ``if n:`` a caller asking
    # for at most 0 sentences would silently receive every sentence instead.
    if n is not None:
        sentences = itertools.islice(sentences, n)

    return sentences



def _generate_all(grammar, items, depth):
    """
    Yield every expansion of a sequence of grammar symbols.

    :param grammar: The Grammar whose productions are used for expansion.
    :param items: A sequence of symbols (nonterminals and/or terminals).
    :param depth: Remaining depth budget, passed on to ``_generate_one``.
    :return: A generator of token lists; an empty ``items`` yields one
        empty list.
    :raises RuntimeError: if expanding the symbols exhausts the interpreter's
        recursion limit.
    """
    if not items:
        yield []
        return

    try:
        for frag1 in _generate_one(grammar, items[0], depth):
            for frag2 in _generate_all(grammar, items[1:], depth):
                yield frag1 + frag2
    except RecursionError as error:
        # Python 3 exceptions have no ``.message`` attribute, so inspecting it
        # raised AttributeError and hid the real problem.  Catch the dedicated
        # RecursionError instead and chain it so the recursion stack is kept.
        raise RuntimeError(
            "The grammar has rule(s) that yield infinite recursion!!"
        ) from error


def _generate_one(grammar, item, depth):
    """
    Yield every expansion of a single grammar symbol.

    Nothing is yielded once the depth budget is used up (``depth <= 0``).
    Anything that is not a ``Nonterminal`` is yielded as a one-element list;
    a ``Nonterminal`` is expanded through each production that has it as
    left-hand side, with the depth budget reduced by one.
    """
    if depth <= 0:
        return

    if not isinstance(item, Nonterminal):
        yield [item]
        return

    for production in grammar.productions(lhs=item):
        yield from _generate_all(grammar, production.rhs(), depth - 1)

def _uniform_alternatives(tokens):
    """Render tokens as quoted PCFG alternatives that all carry the same weight.

    Produces ``'a' [w] | 'b' [w] `` (note the trailing space) with
    ``w = 1 / len(tokens)``.
    """
    weight = 1 / len(tokens)
    return " | ".join("'{0}' [{1}]".format(token, weight) for token in tokens) + " "


def _render_rules(rules):
    """Put every rule on its own line, preceded by a newline and two spaces."""
    return "".join("\n  " + rule for rule in rules)


# Attribute names that can be asked about.
O = [
    "name",
    "title",
    "date of birth",
    "first name",
    "last name",
    "amount owed",
    "amount due",
    "address",
    "registration number",
    "company number",
]
Os = _uniform_alternatives(O)

# Single-word categories: the only alternative gets probability 1.0.
Det = ["the"]
Dets = "'" + Det[0] + "' [1.0]"

Conj = ["and"]
Conjs = "'" + Conj[0] + "' [1.0]"

Comm = [","]
Comms = "'" + Comm[0] + "' [1.0]"

Pos = ["of", "to"]
Poss = _uniform_alternatives(Pos)

# Entities an attribute can belong to.
O2 = ["employee", "customer", "client", "company", "payee", "recipient", "project", "card", "account"]
O2s = _uniform_alternatives(O2)

_question_rules = [
    "S -> QP T [1.0]",
    "QP -> Q PR [1.0]",
    "PR -> Det OP [0.333] | Det O Conj O Pos Det O2 [0.333] | Det O Comm O Conj O Pos Det O2 [0.333]",
    "OP -> O Pos Det O2 [0.5] | O2 Ap O [0.5]",
    "Q -> 'what is' [1]",
    "Det -> " + Dets,
    "Conj -> " + Conjs,
    "Comm -> " + Comms,
    "O -> " + Os,
    "Pos -> " + Poss,
    "Ap -> 's' [1.0]",
    "O2 -> " + O2s,
    "T -> '?' [1.0]",
]
# The question grammar text ends with a newline after its last rule.
question_grammar = _render_rules(_question_rules) + "\n"
question_grammar

_answer_rules = [
    "S -> Det O Pos Det O2 V SL [0.25] | Det O Pos Det O2 V SL Conj Det O V SL [0.25] | Det O2 Ap O V SL [0.25] | Det O2 Ap O V SL Conj O V SL [0.25]",
    "O2 -> " + O2s,
    "V -> 'is' [1.0]",
    "SL -> '{SLOT}' [1.0]",
    "Ap -> 's' [1.0]",
    "O -> " + Os,
    "Det -> " + Dets,
    "Conj -> " + Conjs,
    "Comm -> " + Comms,
    "Pos -> " + Poss,
]
# The answer grammar text has no newline after its last rule.
answer_grammar = _render_rules(_answer_rules)

answer_grammar

"\n  S -> Det O Pos Det O2 V SL [0.25] | Det O Pos Det O2 V SL Conj Det O V SL [0.25] | Det O2 Ap O V SL [0.25] | Det O2 Ap O V SL Conj O V SL [0.25]\n  O2 -> 'employee' [0.1111111111111111] | 'customer' [0.1111111111111111] | 'client' [0.1111111111111111] | 'company' [0.1111111111111111] | 'payee' [0.1111111111111111] | 'recipient' [0.1111111111111111] | 'project' [0.1111111111111111] | 'card' [0.1111111111111111] | 'account' [0.1111111111111111] \n  V -> 'is' [1.0]\n  SL -> '{SLOT}' [1.0]\n  Ap -> 's' [1.0]\n  O -> 'name' [0.1] | 'title' [0.1] | 'date of birth' [0.1] | 'first name' [0.1] | 'last name' [0.1] | 'amount owed' [0.1] | 'amount due' [0.1] | 'address' [0.1] | 'registration number' [0.1] | 'company number' [0.1] \n  Det -> 'the' [1.0]\n  Conj -> 'and' [1.0]\n  Comm -> ',' [1.0]\n  Pos -> 'of' [0.5] | 'to' [0.5] "

In [34]:
from nltk.grammar import CFG, PCFG
import os

# Upper bound on the number of sentences taken from each grammar.
N = 100000
grammar_q = PCFG.fromstring(question_grammar)
grammar_a = PCFG.fromstring(answer_grammar)

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('./out', exist_ok=True)

# One generated question per line, tokens separated by single spaces.
with open('./out/questions.csv', 'w') as outfile:
    for sent in generate(grammar_q, n=N):
        outfile.write(' '.join(sent) + '\n')

# Numbered preview of the raw answer templates.  The previous version wrapped
# this loop in open('./out/answers.csv', 'w') without ever writing to the
# handle, which only truncated that file; the file itself is written by a
# later cell from the filled-in answers.
for n, sent in enumerate(generate(grammar_a, n=N), 1):
    print('%3d. %s' % (n, ' '.join(sent)))

  1. the name of the employee is {SLOT}
  2. the name of the customer is {SLOT}
  3. the name of the client is {SLOT}
  4. the name of the company is {SLOT}
  5. the name of the payee is {SLOT}
  6. the name of the recipient is {SLOT}
  7. the name of the project is {SLOT}
  8. the name of the card is {SLOT}
  9. the name of the account is {SLOT}
 10. the name to the employee is {SLOT}
 11. the name to the customer is {SLOT}
 12. the name to the client is {SLOT}
 13. the name to the company is {SLOT}
 14. the name to the payee is {SLOT}
 15. the name to the recipient is {SLOT}
 16. the name to the project is {SLOT}
 17. the name to the card is {SLOT}
 18. the name to the account is {SLOT}
 19. the title of the employee is {SLOT}
 20. the title of the customer is {SLOT}
 21. the title of the client is {SLOT}
 22. the title of the company is {SLOT}
 23. the title of the payee is {SLOT}
 24. the title of the recipient is {SLOT}
 25. the title of the project is {SLOT}
 26. the title of the

2737. the project s amount due is {SLOT} and amount due is {SLOT}
2738. the project s amount due is {SLOT} and address is {SLOT}
2739. the project s amount due is {SLOT} and registration number is {SLOT}
2740. the project s amount due is {SLOT} and company number is {SLOT}
2741. the project s address is {SLOT} and name is {SLOT}
2742. the project s address is {SLOT} and title is {SLOT}
2743. the project s address is {SLOT} and date of birth is {SLOT}
2744. the project s address is {SLOT} and first name is {SLOT}
2745. the project s address is {SLOT} and last name is {SLOT}
2746. the project s address is {SLOT} and amount owed is {SLOT}
2747. the project s address is {SLOT} and amount due is {SLOT}
2748. the project s address is {SLOT} and address is {SLOT}
2749. the project s address is {SLOT} and registration number is {SLOT}
2750. the project s address is {SLOT} and company number is {SLOT}
2751. the project s registration number is {SLOT} and name is {SLOT}
2752. the project s regis

In [35]:
def _term_spans(text):
    """Return ``(start, end)`` of the first occurrence of each O term found in text."""
    spans = []
    for term in O:
        start = text.find(term)
        # str.find returns -1 when the term is absent and 0 when it sits at the
        # very start, so a bare truthiness test would get both cases wrong.
        if start != -1:
            spans.append((start, start + len(term)))
    return spans


# Join tokens into text and re-attach "?" and "," to the preceding word.
question_texts = [
    " ".join(tokens).replace(" ?", "?").replace(" ,", ",")
    for tokens in generate(grammar_q, n=N)
]
answers = [" ".join(tokens) for tokens in generate(grammar_a, n=N)]

# Each question becomes (text, spans).  The spans are built eagerly as a list:
# a lazy generator here would look up the question only when consumed, i.e.
# after the enclosing comprehension has already moved on to its last element.
questions = [(text, _term_spans(text)) for text in question_texts]

In [36]:
import re, string
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
import random
from random import randint
import datetime


# Directory holding the company XML dumps; `files` lists the full path of every
# regular file in it whose name ends in ".xml".
path = "/virtualmachines/data/companies"
files = [
    join(path, entry)
    for entry in listdir(path)
    if entry.endswith(".xml") and isfile(join(path, entry))
]

def generate_names():
    """
    Yield company names read from one randomly chosen file in ``files``.

    The file is scanned line by line; for each line containing a
    ``<NonIndividualNameText>`` element the element text is yielded with
    ``&amp;`` decoded to ``&`` and every space-separated word capitalized.
    At most one name is taken per line.  The generator ends when the file is
    exhausted, so ``next()`` on it can raise ``StopIteration``.
    """
    # The element text is everything up to the next "<".  The earlier class
    # ``[^</]`` also excluded "/", which silently dropped any name containing
    # a slash.
    name_pattern = re.compile(r'<NonIndividualNameText>([^<]+)</NonIndividualNameText>')
    with open(random.choice(files), "r") as infile:
        for line in infile:
            match = name_pattern.search(line)
            if match:
                words = match.group(1).replace("&amp;", "&").split(" ")
                yield " ".join(word.capitalize() for word in words)


# Shared stream of names, consumed elsewhere with next(names).
names = generate_names()
def generate_date():
    """Return a random date (years 1950-2025, days 1-28) rendered in a random format."""
    formats = ['%d, %b %Y', '%d %b %Y', '%d %B %Y', '%d %m %Y','%d-%m-%Y', '%d/%m/%Y','%Y-%m-%d', '%Y/%m/%d']
    # Draw year, month, day and then the format, in that order.
    year = randint(1950, 2025)
    month = randint(1, 12)
    day = randint(1, 28)  # 28 is a valid day in every month
    chosen_format = random.choice(formats)
    return datetime.date(year, month, day).strftime(chosen_format)

def generate_id():
    """Return a random identifier of 3 to 20 characters, upper-cased."""
    alphabet = string.ascii_uppercase + string.ascii_lowercase + string.digits
    # The length is drawn first, then one character per position.
    length = random.randint(3, 20)
    chars = []
    for _ in range(length):
        chars.append(random.choice(alphabet))
    return ''.join(chars).upper()
    
def generate_amount():
    """
    Return a random money amount as text.

    The text is an optional currency marker, an integer between 0 and 100000
    (with or without thousands separators) and an optional ``.`` followed by
    a number between 0 and 99.
    """
    markers = ["US","AU","A", "US$", "$US", "AU$", "$AU", "A$", "$A", "GBP", "GBP£", "£GBP"]

    # Half of the amounts carry a currency marker.
    prefix = random.choice(markers) if random.random() > 0.5 else ""

    # Half of the amounts use thousands separators.
    grouped = random.random() > 0.5
    value = random.randint(0, 100000)
    whole = "{:,}".format(value) if grouped else "{:}".format(value)

    # Three quarters of the amounts get a fractional part.
    fraction = ""
    if random.random() > 0.25:
        fraction = "." + str(random.randint(0, 99))

    return prefix + whole + fraction

def generate_address():
    """
    Return a random street address as text.

    Layout: number, optional letter, optional "/number", a street word and a
    street type, a second word, a region name and an optional run of 4 to 8
    digits.  Both words are random words taken from the next two entries of
    the shared ``names`` stream; commas after the street type and after the
    second word are optional.
    """
    street_types = ["St", "Street", "Place", "Pl", "Road", "Rd", "Way", "Wy", "Alley", "All", "Lane", "Ln"]
    regions = ["SA", "South Australia", "Victoria", "Queensland", "QLD", "Qld", "New South Wales", "NSW", "Greensborough", "VA", "Virginia", "Massachusetts", "MA",
               "Tasmania", "Tas", "WA", "TX", "Texas", "Washington", "HI", "Hawaii"]

    pieces = [str(random.randint(1, 1000))]
    if random.random() > 0.25:
        pieces.append(random.choice(string.ascii_uppercase))
    if random.random() > 0.25:
        pieces.append("/" + str(random.randint(1, 1000)))

    # Street: one random word of the next name, then a street type.
    pieces.append(" " + random.choice(next(names).split(" ")).capitalize())
    pieces.append(" " + random.choice(street_types))
    if random.random() > 0.25:
        pieces.append(",")

    # Second word: one random word of the following name.
    pieces.append(" " + random.choice(next(names).split(" ")).capitalize())
    if random.random() > 0.25:
        pieces.append(",")

    pieces.append(" " + random.choice(regions))
    if random.random() > 0.25:
        digit_count = random.randint(4, 8)
        pieces.append(" " + "".join(random.choice(string.digits) for _ in range(digit_count)))

    return "".join(pieces)


In [37]:
# Display the generated answer templates; each entry still contains its
# literal {SLOT} placeholder(s) at this point.
answers

['the name of the employee is {SLOT}',
 'the name of the customer is {SLOT}',
 'the name of the client is {SLOT}',
 'the name of the company is {SLOT}',
 'the name of the payee is {SLOT}',
 'the name of the recipient is {SLOT}',
 'the name of the project is {SLOT}',
 'the name of the card is {SLOT}',
 'the name of the account is {SLOT}',
 'the name to the employee is {SLOT}',
 'the name to the customer is {SLOT}',
 'the name to the client is {SLOT}',
 'the name to the company is {SLOT}',
 'the name to the payee is {SLOT}',
 'the name to the recipient is {SLOT}',
 'the name to the project is {SLOT}',
 'the name to the card is {SLOT}',
 'the name to the account is {SLOT}',
 'the title of the employee is {SLOT}',
 'the title of the customer is {SLOT}',
 'the title of the client is {SLOT}',
 'the title of the company is {SLOT}',
 'the title of the payee is {SLOT}',
 'the title of the recipient is {SLOT}',
 'the title of the project is {SLOT}',
 'the title of the card is {SLOT}',
 'the titl

In [38]:
def validate(text):
    """
    Decide whether a generated sentence is acceptable.

    A sentence is rejected when it is ``None``, when it contains one of the
    hard-coded awkward phrasings (e.g. "name to", "owed of"), or when any
    term of ``O`` occurs in it more than once.

    :param text: The sentence to check, or ``None``.
    :return: ``True`` if the sentence passes every check, else ``False``.
    """
    if text is None:
        return False

    forbidden = "(name|address|birth|number) to|owed of|number to|date of birth of the company"
    if re.search(forbidden, text) is not None:
        return False

    # NOTE(review): this used to filter with ``o != "'name'"``.  The entries of
    # O carry no quotes, so that filter never excluded anything and every term
    # was checked.  "name" is also a substring of "first name" / "last name",
    # so a sentence mentioning two of those is rejected.  That result is kept
    # unchanged here because later cells locate terms with str.find on the
    # bare term; confirm before relaxing it.
    return all(len(re.findall(o, text)) <= 1 for o in O)
    
# Fill every {SLOT} placeholder of each answer template with a generated value.
# Each entry appended to `filled` is (filled_text, valid, slots), where slots is
# a list of [slot_name_span, fill_span] pairs and every span is a mutable
# [start, end] list of character offsets into filled_text.
filled = []
for answer in answers:
    # Turn the standalone possessive token " s " into "'s ".
    answer = answer.replace(" s ", "'s ")    
    slots = []
    while "{SLOT}" in answer:
        # Each key is a regex group of slot names; its value produces the fill text.
        # The patterns are tried in this order on every pass of the while loop.
        for slot, gen_func in {r"(first name|last name|name)":lambda: next(names), 
                               r"(title)": lambda: next(names), 
                               r"(date of birth)": generate_date, 
                               r"(amount owed|amount due)": generate_amount, 
                               r"(address)": generate_address,
                               r"(registration number|company number)": generate_id}.items():
            # NOTE(review): [^{SLOT}] is a character class, i.e. "any character
            # except {, S, L, O, T or }", not "anything but the literal {SLOT}".
            # Whether a match can run across an already filled value therefore
            # depends on which letters that value contains -- confirm intent.
            search = re.search(slot + r" ([^{SLOT}]+) {SLOT}", answer)
            if search:
                fill = gen_func()
                # `prev` is assigned here but not read anywhere in this cell.
                prev = answer
                # Replace the first match only: \1 is the slot name, \2 the text
                # between it and the placeholder.
                # NOTE(review): `fill` becomes part of the replacement template,
                # so a backslash in it would be read as an escape -- confirm the
                # generated values can never contain one.
                answer = re.sub(slot + r" ([^{SLOT}]+) {SLOT}", r"\1 \2 " + fill, answer, 1)

                # Spans recorded earlier that lie at or after the new fill are
                # shifted by the length difference between fill and "{SLOT}".
                # `pos` is the first occurrence of the fill text in the answer.
                pos = answer.find(fill)
                # Note: this loop rebinds `slot` (the regex key above) to entries
                # of `slots`; the key is not used again in this iteration.
                for slot in slots:
                    for slot_pair in slot:
                        if slot_pair[0] >= pos:
                            slot_pair[0] += len(fill) - len("{SLOT}")
                        if slot_pair[1] >= pos:
                            slot_pair[1] += len(fill) - len("{SLOT}")
                        
                # Both spans are located with str.find, so they refer to the
                # first occurrence of the slot name / fill text in the answer.
                slot_name_span = [answer.find(search.groups()[0]), answer.find(search.groups()[0]) + len(search.groups()[0])]
                fill_span = [answer.find(fill), answer.find(fill) + len(fill)]
                extract = answer[slot_name_span[0]:slot_name_span[1]]
                # Debug aid: print any extracted slot name that is not a term of O.
                if extract not in O:
                    print(extract)                
                slots.append([slot_name_span, fill_span])
        # Re-evaluated on every pass of the while loop; the value from the last
        # pass is the one stored below.
        valid = validate(answer)        
        
    filled.append((answer, valid, slots))
#filled

In [39]:
# Write one "answer,flag" row per filled answer.  Commas inside the answer text
# are masked as {COMMA} so that the only literal comma is the field separator.
with open("out/answers.csv", "w") as outfile:
    for text, is_valid, _slots in filled:
        row = "".join([text.replace(",", "{COMMA}"), ",", str(is_valid == True), "\n"])
        outfile.write(row)
        
# Questions that pass validate().  This does not depend on the answer being
# written, so it is computed once instead of once per answer row.
valid_questions = [q for q in questions if validate(q[0])]

# For every valid filled answer, pick a random compatible question and write
#   question,answer,qstart|qend|fillstart|fillend|...
# with one 4-number group (each number followed by "|") per slot.  Commas
# inside the question and the answer are masked as {COMMA}.
with open("out/question_answer_spans.csv", "w") as outfile:
    for line in filled:
        if not line[1]:
            continue

        answer_text = line[0]
        slot_spans = [pair[0] for pair in line[2]]
        fill_spans = [pair[1] for pair in line[2]]
        slot_names = [answer_text[start:end] for start, end in slot_spans]

        # A question is compatible when it mentions every slot name of the
        # answer and the same O2 terms as the answer.
        candidates = [
            q for q in valid_questions
            if all(slot_name in q[0] for slot_name in slot_names)
            and all((o in q[0]) == (o in answer_text) for o in O2)
        ]
        # random.choice raises IndexError when no question is compatible.
        q = random.choice(candidates)

        outfile.write(q[0].replace(",", "{COMMA}"))
        outfile.write(",")
        outfile.write(answer_text.replace(",", "{COMMA}"))
        outfile.write(",")

        for slot_name, fill_span in zip(slot_names, fill_spans):
            # Slot name position in the question (first occurrence), then the
            # position of the filled value in the answer.
            question_start = q[0].find(slot_name)
            outfile.write("{0}|{1}|{2}|{3}|".format(question_start,
                                                   question_start + len(slot_name),
                                                   fill_span[0],
                                                   fill_span[1]))
        outfile.write("\n")

In [None]:
  Q -> 'what' [0.1428] | 'who' [0.1428] | 'where' [0.1428] | 'when' [0.1428] | 'how much' [0.1428] | 'will' [0.1428] | 'which' [0.1428]
