In [1]:
import itertools
import sys
from nltk.grammar import Nonterminal

In [55]:
def generate(grammar, start=None, depth=None, n=None):
    """
    Generates an iterator of all sentences from a CFG.

    :param grammar: The Grammar used to generate sentences.
    :param start: The Nonterminal from which to start generate sentences.
    :param depth: The maximal depth of the generated tree.
    :param n: The maximum number of sentences to return.
    :return: An iterator of lists of terminal tokens.
    """
    if not start:
        start = grammar.start()
    if depth is None:
        depth = sys.maxsize

    iter = _generate_all(grammar, [start], depth)

    if n:
        iter = itertools.islice(iter, n)

    return iter



def _generate_all(grammar, items, depth):
    if items:
        try:
            for frag1 in _generate_one(grammar, items[0], depth):
                for frag2 in _generate_all(grammar, items[1:], depth):
                    yield frag1 + frag2
        except RuntimeError as _error:
            if _error.message == "maximum recursion depth exceeded":
                # Helpful error message while still showing the recursion stack.
                raise RuntimeError(
                    "The grammar has rule(s) that yield infinite recursion!!"
                )
            else:
                raise
    else:
        yield []


def _generate_one(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth - 1):
                    yield frag
        else:
            yield [item]

O =   ["name", 
       "title",
       "date of birth", 
       "first name", 
       "last name", 
       "amount owed", 
       "amount due",
       "address", 
       "registration number", 
       "company number"]
Objects = O
Os = (" [{0}] | ".format(1 / len(O))).join(["'{0}'".format(o) for o in O])            
Os += " [{0}] ".format(1 / len(O))

Det = ["the"] 
Dets = "'{0}' [1.0]".format(Det[0])

Conj = ["and"]
Conjs = "'{0}' [1.0]".format(Conj[0])

Comm = [","]
Comms = "'{0}' [1.0]".format(Comm[0])

Pos = ["of", "to"]
Poss = (" [{0}] | ".format(1 / len(Pos))).join(["'{0}'".format(p) for p in Pos])            
Poss += " [{0}] ".format(1 / len(Pos))

O2 = ["employee", "customer", "client", "company", "payee", "recipient", "project", "card", "account"]
O2s = (" [{0}] | ".format(1 / len(O2))).join(["'{0}'".format(o2) for o2 in O2])            
O2s += " [{0}] ".format(1 / len(O2))

question_grammar = """
  S -> QP T [1.0]
  QP -> Q PR [1.0]
  PR -> Det OP [0.333] | Det O Conj O Pos Det O2 [0.333] | Det O Comm O Conj O Pos Det O2 [0.333]
  OP -> O Pos Det O2 [0.5] | O2 Ap O [0.5]
  Q -> 'what is' [1]
  Det -> """ + Dets + """
  Conj -> """ + Conjs + """
  Comm -> """ + Comms + """
  O -> """ + Os + """
  Pos -> """ + Poss + """
  Ap -> 's' [1.0]
  O2 -> """ + O2s + """
  T -> '?' [1.0]
"""
question_grammar



answer_grammar = """
  S -> Det O Pos Det O2 V SL [0.25] | Det O Pos Det O2 V SL Conj Det O V SL [0.25] | Det O2 Ap O V SL [0.25] | Det O2 Ap O V SL Conj O V SL [0.25]
  O2 -> """ + O2s + """
  V -> 'is' [1.0]
  SL -> '{SLOT}' [1.0]
  Ap -> 's' [1.0]
  O -> """ + Os + """
  Det -> """ + Dets + """
  Conj -> """ + Conjs + """
  Comm -> """ + Comms + """
  Pos -> """ + Poss 

answer_grammar

"\n  S -> Det O Pos Det O2 V SL [0.25] | Det O Pos Det O2 V SL Conj Det O V SL [0.25] | Det O2 Ap O V SL [0.25] | Det O2 Ap O V SL Conj O V SL [0.25]\n  O2 -> 'employee' [0.1111111111111111] | 'customer' [0.1111111111111111] | 'client' [0.1111111111111111] | 'company' [0.1111111111111111] | 'payee' [0.1111111111111111] | 'recipient' [0.1111111111111111] | 'project' [0.1111111111111111] | 'card' [0.1111111111111111] | 'account' [0.1111111111111111] \n  V -> 'is' [1.0]\n  SL -> '{SLOT}' [1.0]\n  Ap -> 's' [1.0]\n  O -> 'name' [0.1] | 'title' [0.1] | 'date of birth' [0.1] | 'first name' [0.1] | 'last name' [0.1] | 'amount owed' [0.1] | 'amount due' [0.1] | 'address' [0.1] | 'registration number' [0.1] | 'company number' [0.1] \n  Det -> 'the' [1.0]\n  Conj -> 'and' [1.0]\n  Comm -> ',' [1.0]\n  Pos -> 'of' [0.5] | 'to' [0.5] "

In [56]:
from nltk.grammar import CFG, PCFG
N = 100000
grammar_q = PCFG.fromstring(question_grammar)
grammar_a = PCFG.fromstring(answer_grammar)

with open('./out/questions.csv', 'w') as outfile:
    for n, sent in enumerate(generate(grammar_q, n=N), 1):
        #print('%3d. %s' % (n, ' '.join(sent)))
        outfile.write(' '.join(sent) + '\n')

with open('./out/answers.csv', 'w') as outfile:
    for n, sent in enumerate(generate(grammar_a, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))

  1. the name of the employee is {SLOT}
  2. the name of the customer is {SLOT}
  3. the name of the client is {SLOT}
  4. the name of the company is {SLOT}
  5. the name of the payee is {SLOT}
  6. the name of the recipient is {SLOT}
  7. the name of the project is {SLOT}
  8. the name of the card is {SLOT}
  9. the name of the account is {SLOT}
 10. the name to the employee is {SLOT}
 11. the name to the customer is {SLOT}
 12. the name to the client is {SLOT}
 13. the name to the company is {SLOT}
 14. the name to the payee is {SLOT}
 15. the name to the recipient is {SLOT}
 16. the name to the project is {SLOT}
 17. the name to the card is {SLOT}
 18. the name to the account is {SLOT}
 19. the title of the employee is {SLOT}
 20. the title of the customer is {SLOT}
 21. the title of the client is {SLOT}
 22. the title of the company is {SLOT}
 23. the title of the payee is {SLOT}
 24. the title of the recipient is {SLOT}
 25. the title of the project is {SLOT}
 26. the title of the

2681. the project s title is {SLOT} and name is {SLOT}
2682. the project s title is {SLOT} and title is {SLOT}
2683. the project s title is {SLOT} and date of birth is {SLOT}
2684. the project s title is {SLOT} and first name is {SLOT}
2685. the project s title is {SLOT} and last name is {SLOT}
2686. the project s title is {SLOT} and amount owed is {SLOT}
2687. the project s title is {SLOT} and amount due is {SLOT}
2688. the project s title is {SLOT} and address is {SLOT}
2689. the project s title is {SLOT} and registration number is {SLOT}
2690. the project s title is {SLOT} and company number is {SLOT}
2691. the project s date of birth is {SLOT} and name is {SLOT}
2692. the project s date of birth is {SLOT} and title is {SLOT}
2693. the project s date of birth is {SLOT} and date of birth is {SLOT}
2694. the project s date of birth is {SLOT} and first name is {SLOT}
2695. the project s date of birth is {SLOT} and last name is {SLOT}
2696. the project s date of birth is {SLOT} and amou

In [95]:
question_list = list(" ".join(g).replace(" ?", "?").replace(" ,", ",") for g in generate(grammar_q, n=N))
answers = list(" ".join(g) for g in generate(grammar_a, n=N))

class Span:
    def __init__(self, start, end):
        self.start = start
        self.end = end

class Question:
    def __init__(self, text, spans):
        self.text = text
        self.spans = spans
    
    def get_slot_names(self):
        for span in self.spans:
            yield self.text[span.start:span.end]

questions = []
for text in question_list:
    spans = [Span(text.find(o), text.find(o) + len(o)) for o in O if text.find(o)]
    questions.append(Question(text, spans))
    
questions


[<__main__.Question at 0x7f0c27fd4550>,
 <__main__.Question at 0x7f0c27fd45f8>,
 <__main__.Question at 0x7f0c27fd4a20>,
 <__main__.Question at 0x7f0c27fd4c88>,
 <__main__.Question at 0x7f0c27fd4ef0>,
 <__main__.Question at 0x7f0c27d6e198>,
 <__main__.Question at 0x7f0c27d6e400>,
 <__main__.Question at 0x7f0c27d6e668>,
 <__main__.Question at 0x7f0c27d6e8d0>,
 <__main__.Question at 0x7f0c27d6eb38>,
 <__main__.Question at 0x7f0c27d6eda0>,
 <__main__.Question at 0x7f0c27d73048>,
 <__main__.Question at 0x7f0c27d732b0>,
 <__main__.Question at 0x7f0c27d73518>,
 <__main__.Question at 0x7f0c27d73780>,
 <__main__.Question at 0x7f0c27d739e8>,
 <__main__.Question at 0x7f0c27d73c50>,
 <__main__.Question at 0x7f0c27d73eb8>,
 <__main__.Question at 0x7f0c27d77160>,
 <__main__.Question at 0x7f0c27d773c8>,
 <__main__.Question at 0x7f0c27d77630>,
 <__main__.Question at 0x7f0c27d77898>,
 <__main__.Question at 0x7f0c27d77b00>,
 <__main__.Question at 0x7f0c27d77d68>,
 <__main__.Question at 0x7f0c27d77fd0>,


In [58]:
import re, string
from bs4 import BeautifulSoup
from os import listdir
from os.path import isfile, join
import random
from random import randint
import datetime


path = "/virtualmachines/data/companies"
files = [join(path,f) for f in listdir(path) if isfile(join(path, f)) and f.endswith(".xml")]

def generate_names():
    with open(random.choice(files), "r") as infile:
        for line in infile:
            z = re.search(r'<NonIndividualNameText>([^</]+)</NonIndividualNameText>',line)
            if z:
                yield " ".join([n.capitalize() for n in z.group(1).replace("&amp;", "&").split(" ")])
names = generate_names()                
def generate_date():
    formats = ['%d, %b %Y', '%d %b %Y', '%d %B %Y', '%d %m %Y','%d-%m-%Y', '%d/%m/%Y','%Y-%m-%d', '%Y/%m/%d']
    return datetime.date(randint(1950,2025), randint(1,12),randint(1,28)).strftime(random.choice(formats))

def generate_id():
    return ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(random.randint(3,20))).upper()
    
def generate_amount():
    ret_string = "";
    if random.random() > 0.5:
        ret_string = random.choice(["US","AU","A", "US$", "$US", "AU$", "$AU", "A$", "$A", "GBP", "GBP£", "£GBP"])
        
    if random.random() > 0.5:
        ret_string = ret_string + "{:,}".format(random.randint(0,100000))
    else:
        ret_string = ret_string + "{:}".format(random.randint(0,100000))
    
    if random.random() > 0.25:
        ret_string += "." + str(random.randint(0,99));
    return ret_string

def generate_address():
    ret_string = str(random.randint(1,1000))
    if random.random() > 0.25:
        ret_string += random.choice(string.ascii_uppercase)
    if random.random() > 0.25:
        ret_string += "/" + str(random.randint(1,1000))
    ret_string += " " + random.choice(next(names).split(" ")).capitalize()
    ret_string += " " + random.choice(["St", "Street", "Place", "Pl", "Road", "Rd", "Way", "Wy", "Alley", "All", "Lane", "Ln"])
    if random.random() > 0.25:
        ret_string += ","
    ret_string += " " + random.choice(next(names).split(" ")).capitalize()
    if random.random() > 0.25:
        ret_string += ","
    ret_string += " " + random.choice(["SA", "South Australia", "Victoria", "Queensland", "QLD", "Qld", "New South Wales", "NSW", "Greensborough", "VA", "Virginia", "Massachusetts", "MA", 
                                       "Tasmania", "Tas", "WA", "TX", "Texas", "Washington", "HI", "Hawaii"])
    if random.random() > 0.25:        
        ret_string += " " + "".join(random.choice(string.digits) for _ in range(random.randint(4,8)))
    return ret_string


In [66]:
answers = [answer.replace(" s ", "'s ") for answer in answers]


In [89]:
def validate(text):
    if text is None:
        return False
    valid = re.search("(name|address|birth|number) to|owed of|number to|date of birth of the company", text) == None            
    valid = valid and all([len(re.findall(o, text)) <= 1 for o in O if o != "'name'"])
    return valid

# represents two spans in a text - SLOT, representing the name of the slot, and TAG, representing the object that will fill the slot
# e.g. if text is "my name is bob", SLOT is "name", TAG is "bob"
class TaggedSlot:
    def __init__(self, slot_start,slot_end, tag_start, tag_end):
        self.slot_start = slot_start
        self.slot_end = slot_end
        self.tag_start = tag_start
        self.tag_end = tag_end
        
# slots are a sequence of [S1, S2], where 
class Answer:
    def __init__(self, text=None, slots=None, is_valid=None):
        self.text = text
        self.slots = slots
        self.is_valid = is_valid
        
    def get_slot_names(self):
        for slot in self.slots:
            yield self.text[slot.slot_start:slot.slot_end]
    
tagged = []
# iterate through every answer
# iterate through all potential slots, replacing the first instance of "{SLOT}" with the return value of the respective slot tag generator
slot_tag_generators = {r"(first name|last name|name)":lambda: next(names), 
                       r"(title)": lambda: next(names), 
                       r"(date of birth)": generate_date, 
                       r"(amount owed|amount due)": generate_amount, 
                       r"(address)": generate_address,
                       r"(registration number|company number)": generate_id}
for answer in answers:
    tagged_slots = []
    while "{SLOT}" in answer:
        for slot, generator in slot_tag_generators.items():
            # find the slot name in the answer
            search = re.search(slot + r" ([^{SLOT}]+) {SLOT}", answer)

            # if not found, skip to next slot name
            if search is None:
                continue
                
            # generate a tag for the slot
            fill = generator()
            
            # replace {SLOT} with the tag
            answer = re.sub(slot + r" ([^{SLOT}]+) {SLOT}", r"\1 \2 " + fill, answer, 1)
            
            # find the position of our replacement
            pos = answer.find(fill)
            # adjust any existing slot/tag spans to account for the changed text positions
            for tagged_slot in tagged_slots:
                if tagged_slot.slot_start >= pos:
                    tagged_slot.slot_start += len(fill) - len("{SLOT}")
                if tagged_slot.slot_end >= pos:
                    tagged_slot.slot_end += len(fill) - len("{SLOT}")
                if tagged_slot.tag_start >= pos:
                    tagged_slot.tag_start += len(fill) - len("{SLOT}")
                if tagged_slot.tag_end >= pos:
                    tagged_slot.tag_end += len(fill) - len("{SLOT}")
            # create a slot span
            slot = [answer.find(search.groups()[0]), answer.find(search.groups()[0]) + len(search.groups()[0])]
            # create a tag span
            tag = [answer.find(fill), answer.find(fill) + len(fill)]
            
            # double check that the slot actually came from our grammar and wasn't a mistake (e.g. off-by-1)
            extract = answer[slot[0]:slot[1]]
            if extract not in O:
                print(extract)                
                
            tagged_slots.append(TaggedSlot(slot[0], slot[1], tag[0],tag[1]))
    answer = Answer(text=answer, is_valid=validate(answer), slots=tagged_slots)    
    tagged.append(answer)
tagged

[<__main__.Answer at 0x7f0c2ac1b6a0>,
 <__main__.Answer at 0x7f0c2ac1b630>,
 <__main__.Answer at 0x7f0c2ac1b7f0>,
 <__main__.Answer at 0x7f0c2ac1b860>,
 <__main__.Answer at 0x7f0c2ac1b978>,
 <__main__.Answer at 0x7f0c2ac1b5c0>,
 <__main__.Answer at 0x7f0c2ac1b9b0>,
 <__main__.Answer at 0x7f0c2ac1ba58>,
 <__main__.Answer at 0x7f0c2ac1ba20>,
 <__main__.Answer at 0x7f0c2ac1bac8>,
 <__main__.Answer at 0x7f0c2ac1bb70>,
 <__main__.Answer at 0x7f0c2ac1bb00>,
 <__main__.Answer at 0x7f0c2ac1bc18>,
 <__main__.Answer at 0x7f0c2ac1bc88>,
 <__main__.Answer at 0x7f0c2ac1b828>,
 <__main__.Answer at 0x7f0c2ac1b898>,
 <__main__.Answer at 0x7f0c2ac1be10>,
 <__main__.Answer at 0x7f0c2ac1bba8>,
 <__main__.Answer at 0x7f0c2ac1bf28>,
 <__main__.Answer at 0x7f0c2ac1bfd0>,
 <__main__.Answer at 0x7f0c2ac1bcc0>,
 <__main__.Answer at 0x7f0c2ac1bcf8>,
 <__main__.Answer at 0x7f0c2ac1bef0>,
 <__main__.Answer at 0x7f0c2ac1bf98>,
 <__main__.Answer at 0x7f0c2ac26198>,
 <__main__.Answer at 0x7f0c2ac26128>,
 <__main__.A

In [98]:
question_lookup = {}

for slot_name in O:
    question_lookup[slot_name] = [question for question in questions if slot_name in question.get_slot_names()]

In [107]:
with open("out/answers.csv", "w") as outfile:
    for answer in tagged:
        outfile.write(answer.text.replace(",", "{COMMA}"))
        outfile.write(",")
        outfile.write(str(answer.is_valid))
        outfile.write("\n")
          
with open("out/question_answer_spans.csv", "w") as outfile:
    for answer in [answer for answer in tagged if answer.is_valid]:
        # iterate through every slot in the answer, pick a random question that contains that slot, and write it out
        # e.g. if the answer is "client's name is Bob", we first find a question "What is the client's name?"
        for tagged_slot in answer.slots:
            slot_name = answer.text[tagged_slot.slot_start:tagged_slot.slot_end]
            # find the random question
            question = random.choice(question_lookup[slot_name])
            # write the question text
            outfile.write(question.text.replace(",", "{COMMA}"))
            outfile.write(",")
            # write the answer text
            outfile.write(answer.text.replace(",", "{COMMA}"))
            outfile.write(",")
            tag = answer.text[tagged_slot.tag_start:tagged_slot.tag_end]
            outfile.write("{0}|{1}|{2}|{3}|".format(question.text.find(slot_name), 
                                               question.text.find(slot_name) + len(slot_name),
                                               tagged_slot.tag_start,
                                               tagged_slot.tag_end))
            outfile.write("\n")

        # then, find a question that does not contain any slots from this answer
        slot = random.choice([o for o in O if o not in answer.get_slot_names()])
        question = random.choice(question_lookup[slot])
        
        # write the question text
        outfile.write(question.text.replace(",", "{COMMA}"))
        outfile.write(",")
        # write the answer text
        outfile.write(answer.text.replace(",", "{COMMA}"))
        outfile.write(",")
        # write the slot/tag spans - the latter of course will be empty as the slot did not appear in the answer
        outfile.write("{0}|{1}|{2}|{3}|".format(question.text.find(slot), 
                                               question.text.find(slot) + len(slot),
                                               "",
                                               ""))
        outfile.write("\n")

In [None]:
  Q -> 'what' [0.1428] | 'who' [0.1428] | 'where' [0.1428] | 'when' [0.1428] | 'how much' [0.1428] | 'will' [0.1428] | 'which' [0.1428]
