In [3]:
# NOTE(review): the output recorded for this cell is
# "ModuleNotFoundError: No module named 'dynet'". Because the exception is
# raised at `import dynet`, none of the imports that follow it in this cell ran
# either. DyNet has to be installed in the kernel's environment before the
# cells below that use `dy`, `Counter` or `json` can execute on a fresh kernel.
import pandas as pd
import numpy as np

import dynet as dy
from collections import Counter
import pickle
import random
import json
import sys

import tensorflow as tf
# Show which TensorFlow build the kernel picked up.
print(tf.__version__)

ModuleNotFoundError: No module named 'dynet'

In [None]:
# Load the geography dataset into a pandas DataFrame.
# NOTE(review): `geography` is not referenced again in the visible cells, and
# the same file is read a second time with `json.load` further down -- confirm
# whether this cell is still needed.
geography = pd.read_json('txt2sql-data/geography.json')

In [None]:
def insert_variables(sql, sql_variables, sent, sent_variables):
    """Expand variable placeholders in a question and its SQL query.

    Args:
        sql: SQL query text; split on whitespace.
        sql_variables: dict mapping variable name -> fallback value.
        sent: Question text; split on whitespace.
        sent_variables: dict mapping variable name -> value for this question
            (may be the empty string).

    Returns:
        A 4-tuple ``(tokens, tags, template, complete)``:

        * ``tokens`` -- the question words, with every placeholder found in
          ``sent_variables`` replaced by the whitespace-split words of its value.
        * ``tags`` -- parallel to ``tokens``: ``"O"`` for ordinary words, the
          placeholder name for words that came from a placeholder's value.
        * ``template`` -- the SQL tokens joined by single spaces, where
          placeholders that occurred in the question are kept by name and all
          other variables are replaced by a value.
        * ``complete`` -- the SQL tokens joined by single spaces with every
          variable replaced by a value.

    Raises:
        AssertionError: a placeholder occurring in ``sent`` has an empty value.
        KeyError: a SQL token has an empty value in ``sent_variables`` and is
            missing from ``sql_variables``.
    """
    # ---- Question side: expand placeholders and tag each resulting word ----
    words, labels = [], []
    placeholders_in_question = set()
    for piece in sent.strip().split():
        if piece in sent_variables:
            assert len(sent_variables[piece]) > 0
            placeholders_in_question.add(piece)
            expansion = sent_variables[piece].split()
            words.extend(expansion)
            labels.extend([piece] * len(expansion))
        else:
            words.append(piece)
            labels.append("O")

    # ---- SQL side: peel quote / quote+percent markers off both ends ----
    pieces = []
    for raw in sql.strip().split():
        # Leading marker: the two-character forms must be tested first.
        if raw.startswith(('"%', "'%")):
            pieces.append(raw[:2])
            raw = raw[2:]
        elif raw.startswith(('"', "'")):
            pieces.append(raw[0])
            raw = raw[1:]

        # Trailing marker (or none); the remainder is always emitted, even
        # when it is the empty string.
        if raw.endswith(('%"', "%'")):
            pieces.extend((raw[:-2], raw[-2:]))
        elif raw.endswith(('"', "'")):
            pieces.extend((raw[:-1], raw[-1]))
        else:
            pieces.append(raw)

    # ---- Substitute values to build both output strings ----
    template_parts, complete_parts = [], []
    for piece in pieces:
        has_question_value = piece in sent_variables and sent_variables[piece] != ''
        is_plain_sql = piece not in sent_variables and piece not in sql_variables

        # Template: placeholders copied from the question and ordinary SQL
        # tokens stay as they are; any other variable gets a value, preferring
        # the question-specific one over the fallback.
        if piece in placeholders_in_question or is_plain_sql:
            template_parts.append(piece)
        elif has_question_value:
            template_parts.append(sent_variables[piece])
        else:
            template_parts.append(sql_variables[piece])

        # Complete query: every variable gets a value.
        if has_question_value:
            complete_parts.append(sent_variables[piece])
        elif piece in sql_variables:
            complete_parts.append(sql_variables[piece])
        else:
            complete_parts.append(piece)

    return (words, labels, ' '.join(template_parts), ' '.join(complete_parts))

In [None]:
def get_tagged_data_for_query(data):
    """Yield one tagged training instance per (sentence, SQL) pair of an example.

    Args:
        data: One dataset entry. The keys read here are ``'query-split'``,
            ``'sentences'`` (each item providing ``'text'`` and ``'variables'``),
            ``'sql'`` and ``'variables'`` (each item providing ``'name'`` and
            ``'example'``).

    Yields:
        ``(split, instance)`` where ``split`` is ``data['query-split']`` and
        ``instance`` is the value returned by ``insert_variables`` for that
        sentence / SQL combination.
    """
    split_name = data['query-split']

    for sentence in data['sentences']:
        for query in data['sql']:
            # A fresh name -> example mapping is built for every pair.
            example_values = {
                variable['name']: variable['example']
                for variable in data['variables']
            }
            yield (
                split_name,
                insert_variables(
                    query,
                    example_values,
                    sentence['text'],
                    sentence['variables'],
                ),
            )

In [None]:
class Vocab:
    """Two-way mapping between symbols and integer ids.

    Attributes:
        w2i: dict mapping each symbol to its integer id.
        i2w: dict mapping each integer id back to its symbol.
    """

    def __init__(self, w2i):
        """Create a vocabulary from an existing symbol -> id mapping.

        Args:
            w2i: A mapping, or anything else ``dict()`` accepts (for example an
                iterable of ``(symbol, id)`` pairs). The data is copied, so
                later changes to the caller's object do not affect the
                vocabulary.
        """
        self.w2i = dict(w2i)
        # Invert the stored copy rather than the argument so that inputs
        # without an ``items()`` method (pair iterables) work as well.
        self.i2w = {i: w for w, i in self.w2i.items()}

    @classmethod
    def from_corpus(cls, corpus):
        """Create a vocabulary by numbering symbols in first-seen order.

        Args:
            corpus: Iterable of hashable symbols; repeats are ignored.

        Returns:
            An instance of ``cls`` (so subclasses get their own type) whose ids
            run from 0 upwards in the order symbols first appear in ``corpus``.
        """
        w2i = {}
        for word in corpus:
            w2i.setdefault(word, len(w2i))
        return cls(w2i)

    def size(self):
        """Return the number of distinct symbols in the vocabulary."""
        return len(self.w2i)

def build_vocab(sentences, min_count=1):
    """Build word, tag and template vocabularies from tagged examples.

    Args:
        sentences: Iterable of ``(tokens, tags, template, complete)`` tuples.
            ``tokens`` and ``tags`` are sequences of strings, ``template`` is a
            string; ``complete`` is not used here.
        min_count: A token must occur at least this many times to receive its
            own id. The default of 1 keeps every token.

    Returns:
        ``(vocab_words, vocab_tags, vocab_templates)`` -- three ``Vocab``
        objects. ``"<UNK>"`` always has id 0 in ``vocab_words``; all other
        entries are numbered in sorted order so that ids are identical from one
        run to the next (plain set iteration order is not).
    """
    counts = Counter()
    tag_set = set()
    template_set = set()
    # Iterate the argument rather than any module-level list, so the function
    # builds the vocabulary of whatever data the caller passes in.
    for tokens, tags, template, _complete in sentences:
        template_set.add(template)
        tag_set.update(tags)
        for token in tokens:
            counts[token] += 1

    kept_words = sorted(word for word, count in counts.items() if count >= min_count)

    # from_corpus ignores repeats, so a literal "<UNK>" token in the data
    # simply keeps id 0.
    vocab_words = Vocab.from_corpus(["<UNK>"] + kept_words)
    vocab_tags = Vocab.from_corpus(sorted(tag_set))
    vocab_templates = Vocab.from_corpus(sorted(template_set))

    return vocab_words, vocab_tags, vocab_templates

In [None]:
# When True, examples labelled 'dev' are added to the training set and `dev`
# stays empty. Set to False to keep a separate development set.
# NOTE(review): True reproduces what this cell has always done -- confirm that
# folding dev into train is intended.
MERGE_DEV_INTO_TRAIN = True

train = []
dev = []
test = []

# Explicit UTF-8 so the result does not depend on the platform's default
# text encoding.
with open('txt2sql-data/geography.json', encoding='utf-8') as input_file:
    data = json.load(input_file)

# Route every tagged instance to the list for its split label.
for example in data:
    for dataset, instance in get_tagged_data_for_query(example):
        if dataset == 'train':
            train.append(instance)
        elif dataset == 'dev':
            if MERGE_DEV_INTO_TRAIN:
                train.append(instance)
            else:
                dev.append(instance)
        elif dataset == 'test':
            test.append(instance)
        elif dataset == 'exclude':
            # Explicitly excluded examples are dropped.
            pass
        else:
            # A real exception rather than `assert`, which is removed when
            # Python runs with -O.
            raise ValueError(f"Unknown dataset split: {dataset!r}")

In [None]:
# Build the three vocabularies from the training instances and keep their
# sizes as notebook-level constants.
vocab_words, vocab_tags, vocab_templates = build_vocab(train)
UNK = vocab_words.w2i["<UNK>"]  # integer id of the unknown-word symbol, not a count
NWORDS = vocab_words.size()
NTAGS = vocab_tags.size()
NTEMPLATES = vocab_templates.size()

# UNK is an id, so it is reported as one instead of as a number of
# "ignored words".
print(
    f"Running with {NTEMPLATES} templates, {NWORDS} words, {NTAGS} tags "
    f"(<UNK> id: {UNK})"
)

In [None]:
## Building the model

In [None]:
# Size hyperparameters. NOTE(review): the names suggest word-embedding, LSTM,
# MLP and template-layer widths, but none of them is used in the visible cells.
DIM_WORD = 128
DIM_HIDDEN_LSTM = 64
DIM_HIDDEN_MLP = 32
DIM_HIDDEN_TEMPLATE = 64

# Step size handed to the SGD trainer.
LEARNING_RATE = 0.01

# DyNet model object and the trainer that updates it.
model = dy.Model()
trainer = dy.SimpleSGDTrainer(model, learning_rate=LEARNING_RATE)

In [None]:
# saver = tf.train.import_meta_graph('advising-question-split-model/model.ckpt-35000.meta')

In [None]:
# with tf.Session() as sess:
#     new_saver = tf.train.import_meta_graph('advising-question-split-model/model.ckpt-35000.meta')
#     new_saver.restore(sess, tf.train.latest_checkpoint('advising-question-split-model/'))

In [None]:
# # Build a Basic RNN

# def enc_dec_model_inputs:
#     return 

# def encoding_layer:
#     return

# def process_decoder_input:
#     return

# def decoding_layer_train:
#     return

# def decoding_layer_infer:
#     return

# def decoding_layer:
#     return

# def seq2seq_model:
#     return
