In [1]:
import pandas as pd
import numpy as np

import dynet as dy
from collections import Counter
import pickle
import random
import json
import sys

In [2]:
# Load the geography text-to-SQL dataset into a pandas DataFrame.
# NOTE(review): the same JSON file is parsed again with json.load in the
# input-pipeline cell further down — confirm this DataFrame is still needed.
geography = pd.read_json('txt2sql-data/geography.json')

In [None]:
## Functions and methods for preprocessing input data

In [None]:
# Explain function

In [3]:
def insert_variables(sql, sql_variables, sent, sent_variables):
    """Expand the variables of one (question, SQL) pair.

    Args:
        sql: SQL query string containing variable names.
        sql_variables: dict mapping variable name -> example value.
        sent: question text, whitespace separated, containing variable names.
        sent_variables: dict mapping variable name -> value for this question
            (the value may be the empty string).

    Returns:
        A tuple ``(tokens, tags, template, complete)`` where ``tokens`` are
        the question words with variables replaced by their values, ``tags``
        holds ``"O"`` for plain words and the variable name for every word
        that came from a variable, ``template`` is the SQL with variables
        mentioned in the question kept as names and all others filled in,
        and ``complete`` is the SQL with every variable filled in.
    """
    words, labels = [], []
    mentioned = set()
    for piece in sent.split():
        if piece in sent_variables:
            value = sent_variables[piece]
            assert len(value) > 0
            mentioned.add(piece)
            expansion = value.split()
            words.extend(expansion)
            labels.extend([piece] * len(expansion))
        else:
            words.append(piece)
            labels.append("O")

    # Split quote marks (optionally paired with a LIKE wildcard) away from
    # the text they enclose so that variable names become separate tokens.
    sql_pieces = []
    for raw in sql.split():
        if raw.startswith(('"%', "'%")):
            lead = 2
        elif raw.startswith(('"', "'")):
            lead = 1
        else:
            lead = 0
        if lead:
            sql_pieces.append(raw[:lead])
        rest = raw[lead:]

        if rest.endswith(('%"', "%'")):
            trail = 2
        elif rest.endswith(('"', "'")):
            trail = 1
        else:
            trail = 0
        if trail:
            sql_pieces.extend((rest[:-trail], rest[-trail:]))
        else:
            sql_pieces.append(rest)

    template_parts = []
    complete_parts = []
    for piece in sql_pieces:
        in_sentence_vars = piece in sent_variables
        has_sentence_value = in_sentence_vars and sent_variables[piece] != ''
        in_sql_vars = piece in sql_variables

        # Template: keep the name when it will be copied from the question
        # or when it is not a variable at all (an SQL keyword).
        if piece in mentioned or not (in_sentence_vars or in_sql_vars):
            template_parts.append(piece)
        elif has_sentence_value:
            # Value is specific to this question but not written in it.
            template_parts.append(sent_variables[piece])
        else:
            # Value is neither question-specific nor written in the question.
            template_parts.append(sql_variables[piece])

        # Complete query: the question's value wins over the SQL example.
        if has_sentence_value:
            complete_parts.append(sent_variables[piece])
        elif in_sql_vars:
            complete_parts.append(sql_variables[piece])
        else:
            complete_parts.append(piece)

    return (words, labels, ' '.join(template_parts), ' '.join(complete_parts))

In [None]:
# Explain function

In [4]:
def get_tagged_data_for_query(data):
    """Yield one tagged instance per (question, SQL) pair of a dataset entry.

    Args:
        data: one dataset entry, a dict with the keys ``'query-split'``,
            ``'sentences'`` (each with ``'text'`` and ``'variables'``),
            ``'sql'`` (list of query strings) and ``'variables'`` (list of
            dicts with ``'name'`` and ``'example'``).

    Yields:
        ``(dataset, instance)`` where ``dataset`` is the entry's
        ``'query-split'`` value and ``instance`` is the result of
        ``insert_variables`` for that question and SQL string.
    """
    dataset = data['query-split']

    # The SQL variable examples depend only on the entry, not on the
    # sentence or query being paired, so build the mapping once.
    sql_vars = {sql_var['name']: sql_var['example'] for sql_var in data['variables']}

    for sent_info in data['sentences']:
        text = sent_info['text']
        text_vars = sent_info['variables']
        for sql in data['sql']:
            yield (dataset, insert_variables(sql, sql_vars, text, text_vars))

In [None]:
# Explain class and functions

In [5]:
class Vocab:
    """Two-way mapping between items (words, tags, templates) and integer ids.

    Attributes are ``w2i`` (item -> id) and ``i2w`` (id -> item).
    """

    def __init__(self, w2i):
        """Create a vocabulary from an item -> id mapping.

        Args:
            w2i: anything ``dict()`` accepts (a mapping or an iterable of
                ``(item, id)`` pairs). It is copied, so later changes to the
                argument do not affect the vocabulary.
        """
        self.w2i = dict(w2i)
        # Invert the copy rather than the argument so that pair iterables
        # (which have no .items()) work as well.
        self.i2w = {i: w for w, i in self.w2i.items()}

    @classmethod
    def from_corpus(cls, corpus):
        """Build a vocabulary by numbering items in order of first appearance.

        Args:
            corpus: iterable of hashable items; duplicates are ignored.

        Returns:
            An instance of ``cls`` (so subclasses get their own type).
        """
        w2i = {}
        for word in corpus:
            w2i.setdefault(word, len(w2i))
        return cls(w2i)

    def size(self):
        """Return the number of distinct items in the vocabulary."""
        return len(self.w2i)

def build_vocab(sentences):
    """Build the word, tag and template vocabularies from tagged instances.

    Args:
        sentences: iterable of ``(tokens, tags, template, complete)`` tuples.

    Returns:
        ``(vocab_words, vocab_tags, vocab_templates)``. The word vocabulary
        always contains the ``"<UNK>"`` entry.
    """
    counts = Counter()
    tag_set = set()
    template_set = set()
    # Iterate over the argument, not a module-level list, so the function
    # works for whatever data the caller passes in.
    for tokens, tags, template, _complete in sentences:
        template_set.add(template)
        tag_set.update(tags)
        counts.update(tokens)

    words = {"<UNK>"}
    words.update(word for word, count in counts.items() if count > 0)

    # Set iteration order for strings differs between interpreter runs
    # (hash randomization); sorting keeps the ids stable across kernels.
    vocab_tags = Vocab.from_corpus(sorted(tag_set))
    vocab_words = Vocab.from_corpus(sorted(words))
    vocab_templates = Vocab.from_corpus(sorted(template_set))

    return vocab_words, vocab_tags, vocab_templates

In [None]:
## Input data pipeline

In [None]:
# Explain

In [6]:
# When True, examples marked 'dev' are added to the training set and `dev`
# stays empty (the behaviour this cell has always had). Set to False to keep
# a held-out development set.
MERGE_DEV_INTO_TRAIN = True

train = []
dev = []
test = []

# Keep the file open only while it is being parsed.
with open('txt2sql-data/geography.json') as input_file:
    data = json.load(input_file)

for example in data:
    for dataset, instance in get_tagged_data_for_query(example):
        if dataset == 'train':
            train.append(instance)
        elif dataset == 'dev':
            (train if MERGE_DEV_INTO_TRAIN else dev).append(instance)
        elif dataset == 'test':
            test.append(instance)
        elif dataset == 'exclude':
            pass
        else:
            # An explicit exception (unlike `assert`) is not stripped when
            # Python runs with -O.
            raise ValueError(f"Unknown query-split value: {dataset!r}")

In [7]:
vocab_words, vocab_tags, vocab_templates = build_vocab(train)
# Id of the unknown-word entry; used for words missing from the vocabulary.
UNK = vocab_words.w2i["<UNK>"]
NWORDS = vocab_words.size()
NTAGS = vocab_tags.size()
NTEMPLATES = vocab_templates.size()

# UNK is an index into the word vocabulary, not a count, so label it as such.
print(f"Running with {NTEMPLATES} templates", f"{NWORDS} words", f"{NTAGS} tags", f"<UNK> id {UNK}")

Running with 207 templates 267 words 3 tags 113 ignored words


In [14]:
## Building the model
# [Enter description]
# Dropout is a regularization method that randomly excludes some input and recurrent connections of the LSTM units from activation and weight updates during training.
# This can help reduce overfitting and improve the model's performance.

In [17]:
# Parameter container: every trainable parameter below is registered on it.
model = dy.Model()

# Plain stochastic gradient descent over the parameters held by `model`,
# using a fixed learning rate of 0.01.
trainer = dy.SimpleSGDTrainer(model, learning_rate=0.01)

## Hyperparameters

# Dimension of each word-embedding vector (the input size of both LSTMs)
DIM_WORD = 128

# Number of stacked layers in each LSTM
LSTM_LAYERS = 2

# Hidden dimension of each LSTM (one forward, one backward)
DIM_HIDDEN_LSTM = 64

# Dimension of the hidden layer of the per-token tag classifier
DIM_HIDDEN_MLP = 32

# Dimension of the hidden layer that predicts the template
DIM_HIDDEN_TEMPLATE = 64

# Word embeddings: one DIM_WORD-sized vector per word-vocabulary entry
pEmbedding = model.add_lookup_parameters((NWORDS, DIM_WORD))

# Dropout rate handed to the LSTM builders via set_dropouts
DROPOUT = 0.1


# Tag classifier weights. The hidden layer takes DIM_HIDDEN_LSTM*2 inputs
# because the forward and backward LSTM outputs are concatenated per token;
# the output layer has one row per tag.
pHidden = model.add_parameters((DIM_HIDDEN_MLP, DIM_HIDDEN_LSTM*2))
pOutput = model.add_parameters((NTAGS, DIM_HIDDEN_MLP))

# Two LSTMs of identical shape: build_tagging_graph feeds the first with the
# sentence left-to-right and the second with the sentence reversed.
builders = [
    dy.LSTMBuilder(LSTM_LAYERS, DIM_WORD, DIM_HIDDEN_LSTM, model),
    dy.LSTMBuilder(LSTM_LAYERS, DIM_WORD, DIM_HIDDEN_LSTM, model),
]

# Template classifier weights: a hidden layer over DIM_HIDDEN_LSTM*2 features
# and an output layer with one row per template.
pHiddenTemplate = model.add_parameters((DIM_HIDDEN_TEMPLATE, DIM_HIDDEN_LSTM*2))
pOutputTemplate = model.add_parameters((NTEMPLATES, DIM_HIDDEN_TEMPLATE))

In [20]:
## Training the model

def build_tagging_graph(words, tags, template, builders, train=True, train_noise=0.1):
    """Build the DyNet graph for one sentence: a tag per token plus a template.

    Args:
        words: list of word ids (indices into ``pEmbedding``); must be
            non-empty.
        tags: list of gold tag ids, one per word. Only the values matter in
            training; in evaluation any placeholder of the same length works.
        template: gold template id (used only when ``train`` is true).
        builders: pair ``[forward_lstm, backward_lstm]``.
        train: when true, collect loss expressions; otherwise predict.
        train_noise: standard deviation of the Gaussian noise added to the
            word embeddings during training (0 disables it). Replaces the
            previously undefined ``args.train_noise``.

    Returns:
        ``(pred_tags, pred_template, errs)``. In training ``pred_tags`` is
        empty, ``pred_template`` is ``None`` and ``errs`` holds one loss
        expression per token plus one for the template. In evaluation
        ``pred_tags`` holds tag strings, ``pred_template`` is a template
        string and ``errs`` is empty.

    Raises:
        ValueError: if ``words`` is empty.
    """
    if not words:
        raise ValueError("words must contain at least one token id")

    dy.renew_cg()

    # Dropout is a training-time regularizer. The setting persists on the
    # builder between calls, so it is reset explicitly for evaluation.
    rate = DROPOUT if (train and DROPOUT > 0) else 0.0
    for b in builders:
        b.set_dropouts(rate, rate)

    # Initial states of the forward and backward LSTMs
    f_init, b_init = [b.initial_state() for b in builders]

    wembs = [dy.lookup(pEmbedding, w) for w in words]

    # Noise on the embeddings acts as an additional regularizer in training
    if train and train_noise > 0:
        wembs = [dy.noise(we, train_noise) for we in wembs]

    fw = [state.output() for state in f_init.add_inputs(wembs)]
    bw = [state.output() for state in b_init.add_inputs(reversed(wembs))]

    O = dy.parameter(pOutput)
    H = dy.parameter(pHidden)

    errs = []
    pred_tags = []

    # bw is in reversed sentence order; flip it back to align with fw.
    for f, b, t in zip(fw, reversed(bw), tags):
        r_t = O * dy.tanh(H * dy.concatenate([f, b]))
        if train:
            errs.append(dy.pickneglogsoftmax(r_t, t))
        else:
            chosen = int(np.argmax(dy.softmax(r_t).npvalue()))
            pred_tags.append(vocab_tags.i2w[chosen])

    # Template head: classify the whole sentence from the last output of each
    # direction (each has then read every token).
    O_template = dy.parameter(pOutputTemplate)
    H_template = dy.parameter(pHiddenTemplate)
    sentence_repr = dy.concatenate([fw[-1], bw[-1]])
    r_template = O_template * dy.tanh(H_template * sentence_repr)

    pred_template = None
    if train:
        errs.append(dy.pickneglogsoftmax(r_template, template))
    else:
        chosen = int(np.argmax(dy.softmax(r_template).npvalue()))
        pred_template = vocab_templates.i2w[chosen]

    # Return only after every token has been processed.
    return pred_tags, pred_template, errs

In [21]:
def insert_tagged_tokens(tokens, tags, template):
    """Fill a template with the token spans that were tagged as variables.

    Consecutive tokens sharing a tag other than ``'O'`` form one span; the
    span's words, joined by spaces, replace every template token equal to
    that tag. If a tag occurs in several separate spans, the last one wins.

    Args:
        tokens: sequence of words.
        tags: sequence of tags, paired with ``tokens`` position by position.
        template: whitespace-separated template string.

    Returns:
        The template with tag names replaced, re-joined with single spaces.
    """
    replacements = {}
    open_tag, open_words = None, []
    for word, label in zip(tokens, tags):
        if label == open_tag:
            # Still inside the current span.
            open_words.append(word)
            continue
        # The tag changed: close the span that was open, if any.
        if open_tag is not None:
            replacements[open_tag] = ' '.join(open_words)
        if label == 'O':
            open_tag, open_words = None, []
        else:
            open_tag, open_words = label, [word]

    # Close a span that runs to the end of the sentence.
    if open_tag is not None:
        replacements[open_tag] = ' '.join(open_words)

    return ' '.join(replacements.get(piece, piece) for piece in template.split())

In [22]:
def run_eval(data, builders, iteration, step):
    """Evaluate the tagger on ``data`` and print a one-line summary.

    Args:
        data: sequence of ``(tokens, tags, template, complete)`` instances.
        builders: pair of LSTM builders passed on to ``build_tagging_graph``.
        iteration: training iteration number, used only in the printed line.
        step: step number within the iteration, used only in the printed line.

    Returns:
        The fraction of instances whose predicted complete query equals the
        gold one, or ``-1`` when ``data`` is empty.
    """
    if len(data) == 0:
        print("No data for eval")
        return -1

    correct_tags = 0.0
    total_tags = 0.0
    complete_match = 0.0
    templates_match = 0.0
    oracle = 0.0
    for tokens, tags, template, complete in data:
        word_ids = [vocab_words.w2i.get(word, UNK) for word in tokens]
        # Gold ids are not used in evaluation mode; placeholders of the right
        # length keep the per-token loop aligned.
        tag_ids = [0] * len(tags)
        pred_tags, pred_template, _ = build_tagging_graph(word_ids, tag_ids, 0, builders, False)
        for gold, pred in zip(tags, pred_tags):
            total_tags += 1
            if gold == pred:
                correct_tags += 1
        pred_complete = insert_tagged_tokens(tokens, pred_tags, pred_template)
        if pred_complete == complete:
            complete_match += 1
        if pred_template == template:
            templates_match += 1
        # Oracle: the gold template is known to the model at all, i.e. an
        # upper bound on template accuracy.
        if template in vocab_templates.w2i:
            oracle += 1

    # Guard against instances that produced no tag comparisons at all.
    tok_acc = correct_tags / total_tags if total_tags else 0.0
    complete_acc = complete_match / len(data)
    template_acc = templates_match / len(data)
    oracle_acc = oracle / len(data)

    print(f"Eval {iteration} - {step} Tag Acc: {tok_acc} Template: {template_acc} Complete: {complete_acc} Oracle: {oracle_acc}")
    return complete_acc