In [23]:
import json

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

# Suppress warnings
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

In [24]:
class Schema:
    def __init__ (self, id):
        self.id = id
        self.tableNames = []
        self.columnNames = []
        self.columnAtributes = []

In [25]:
stopWordSet = nltk.corpus.stopwords.words('english')
stemmer = nltk.SnowballStemmer(language='english')
tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z0-9]+")

# Data Analysis and Preprocessing

We note that the databases with the largest amounts of queries in Spider are college_2/college_1, hr_1, store_1, and soccer_2.
We will create 4 models applied on each of these databases.

In [26]:
# Loading Spider Data
with open('./spider-data/tables.json', 'r') as f:
  schemaDoc = json.load(f)

with open('./spider-data/train_spider.json', 'r') as f:
  queryDoc = json.load(f)

schemas = {}
db_counts = {}

for idx, iquery in enumerate(queryDoc):
    db_counts[iquery['db_id']] = db_counts.get(iquery['db_id'], 0) + 1

print(sorted(db_counts.items(), key=lambda x:x[1], reverse=True))

[('college_2', 170), ('college_1', 164), ('hr_1', 124), ('store_1', 112), ('soccer_2', 106), ('bike_1', 104), ('music_1', 100), ('hospital_1', 100), ('music_2', 100), ('dorm_1', 100), ('allergy_1', 98), ('movie_1', 98), ('flight_1', 96), ('driving_school', 93), ('cre_Doc_Tracking_DB', 90), ('department_store', 88), ('customers_and_addresses', 88), ('activity_1', 88), ('network_2', 86), ('products_gen_characteristics', 86), ('game_1', 86), ('chinook_1', 84), ('cre_Theme_park', 84), ('cre_Docs_and_Epenses', 84), ('customers_and_invoices', 82), ('sakila_1', 82), ('baseball_1', 82), ('e_learning', 82), ('cre_Drama_Workshop_Groups', 82), ('wine_1', 82), ('flight_4', 82), ('customers_card_transactions', 80), ('apartment_rentals', 80), ('formula_1', 80), ('loan_1', 80), ('manufactory_1', 80), ('tracking_grants_for_research', 78), ('inn_1', 74), ('college_3', 74), ('voter_2', 72), ('csu_1', 70), ('club_1', 70), ('election', 68), ('student_1', 68), ('icfp_1', 66), ('music_4', 60), ('tracking_or

### Create outputs

We first filter the query and schema doc entries to the queries linked to the DB labeled by `schema`.
In our application, it will be `college_2`, `hr_1`, `store_1`, and `soccer_2`.

Then we create our outputs (y) for the model. The outputs are in the following format:

e.g. `[0,0,0,1,0,1,1,...]`

Where the *nth* index corresponds to the *nth* table in the schema. The value contains a 1 if the table is referenced in the SQL query, and a 0 if it is not.

In [27]:
def getEmbeddings(schemaDoc, queryDoc, schemaName):

    filteredQueryDoc = [query for query in queryDoc if query['db_id'] == schemaName]
    filteredSchemaDoc = [schema for schema in schemaDoc if schema['db_id'] == schemaName]

    for idx, iSchema in enumerate(filteredSchemaDoc):
        dbid = iSchema['db_id']
        test = Schema(dbid)
        for columnName in iSchema['table_names_original']:
            test.tableNames.append(columnName)
        for columnName in iSchema['column_names_original']:
            test.columnNames.append(columnName[1])
            test.columnAtributes.append(columnName[0])
        # print("Schema", idx, ":", test.id, ":", test.tableNames)
        schemas[dbid] = test

    tables = []
    columns = []

    for idx, iquery in enumerate(filteredQueryDoc):
        # print(schemas)
        # Initialize a blank array with the length of the number of tables
        tablesArray = [0] * len(schemas[iquery['db_id']].tableNames)
        columnsArray = [0] * len(schemas[iquery['db_id']].columnNames)
        # Get all the tokens from teh particular query we are looking at
        queryTokens = iquery['query_toks_no_value']

        # Search through all the tokens in the given query
        for idx, queryToken in enumerate(queryTokens):
            # Only look at tokens that come after the word "from," since those are the table names
            if (idx > 0) & (queryTokens[idx - 1].lower() == "from") | (queryTokens[idx - 1].lower() == "join"):
                # Search through all of the table names in the database we are looking at

                # TODO: Use spacy to tag all table names using named entity recognition (NER).
                for jdx, tableName in enumerate(schemas[iquery['db_id']].tableNames):
                    # If the table names match, indicate it in tablesArray
                    if tableName.lower() == queryToken.lower():
                        tablesArray[jdx] = 1
            else:
                # Search through all of the column names in the database we are looking at
                for jdx, tableName in enumerate(schemas[iquery['db_id']].columnNames):
                    # If the table names match, indicate it in tablesArray
                    if tableName.lower() == queryToken.lower():
                        columnsArray[jdx] = 1
    
        tables.append(tablesArray)
        columns.append(columnsArray)
    
    return np.array(tables), np.array(columns)

Finally, we create the Pandas Dataframe for our training set, then use `train_test_split` to create training and testing data.

e.g.

| x     | y |
|----------|-----|
| "Give me a list of all users over the age of 20"     | [0,1,1,0,0,1]  |
| "Can you give me a list of the student rosters sorted in alphabetical order?"     | [0,1,1,1]  |

In [31]:
def preprocess_question(question):
    
    bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4', trainable=True)

    # Tokenize the input question using BERT tokenizer
    tokenizer = bert_layer.resolved_object.vocab
    question_tokens = tokenizer.tokenize(question)
    question_tokens = ["[CLS]"] + question_tokens + ["[SEP]"]
    question_token_ids = tokenizer.convert_tokens_to_ids(question_tokens)

    # Pad the input question to the maximum sequence length expected by BERT
    max_seq_length = 128
    question_input_ids = question_token_ids + [0] * (max_seq_length - len(question_token_ids))
    question_input_mask = [1] * len(question_token_ids) + [0] * (max_seq_length - len(question_token_ids))
    question_segment_ids = [0] * max_seq_length

    return np.array([question_input_ids]), np.array([question_input_mask]), np.array([question_segment_ids])

In [32]:
from sklearn.preprocessing import StandardScaler

trainDF = pd.DataFrame

def loadTrainDF(schemaDoc, queryDoc, schemaName):
 
    tableEmbeddings, columnEmbeddings = getEmbeddings(schemaDoc, queryDoc, schemaName)
    embeddings = np.concatenate((tableEmbeddings, columnEmbeddings),axis=1)

    questions = []
    question_masks = []
    question_segment_ids = []

    filteredQueryDoc = [query for query in queryDoc if query['db_id'] == schemaName]

    # Input processing (Question)
    for query in filteredQueryDoc:

        question = query['question']
        processed_question, mask, segment_id = preprocess_question(question)

        # # FIXME: How to preprocess the questions?
        # # Stem and tokenize each word in the question + remove all stopwords from the question.
        # words = tokenizer.tokenize(question)
        # processed_words = [stemmer.stem(word) for word in words if word.lower() not in stopWordSet]
        # # print(processed_words)
        # processed_question = ' '.join(processed_words)
        questions.append(processed_question)
        question_masks.append(mask)
        question_segment_ids.append(segment_id)

    trainDF = pd.DataFrame(data={'question': questions, 'question_mask': question_masks, 'question_segment_id': question_segment_ids})

    # Output processing (Table labels vector)
    for i in range(embeddings.shape[1]):
        trainDF[i] = embeddings[:,i]        

    return trainDF

In [33]:
# Example: college_2
trainDF = loadTrainDF(schemaDoc, queryDoc, 'college_2')
trainDF.to_csv('trainDF.csv')

AttributeError: '_UserObject' object has no attribute 'vocab'

In [9]:
from sklearn.model_selection import train_test_split

# Split it into train / test subsets
X_train, X_test, y_train, y_test = train_test_split(trainDF[['question','question_mask','question_segment_id']], trainDF.drop(['question','question_mask','question_segment_id'],axis=1), test_size=0.2,
                                                            random_state=42)

# FIXME: Anyway to add a feature similar to 'stratify=trainDF['question']', without breaking?

# Modeling

Prepare our model by loading BERT, and creating the model architecture.

In [10]:
def get_sentence_embeddings(bert_preprocess_model, bert_encoder_model, sentences = []):
    
    text_preprocessed = bert_preprocess_model(sentences)
    return bert_encoder_model(text_preprocessed)['pooled_output']

In [11]:
# Code Adopted and modified from YouTube video: https://www.youtube.com/watch?v=7kLi8u2dJz0&t=719s
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

In [12]:
bert_preprocess_model = hub.KerasLayer(preprocess_url)
bert_model = hub.KerasLayer(encoder_url)

2023-05-09 11:35:53.693491: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 11:35:53.693763: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 11:35:53.693842: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 11:35:53.694072: I tensorflow/core/

In [17]:
# ! Code Adopted and Modified from https://github.com/eclarson/MachineLearningNotebooks and https://www.youtube.com/watch?v=hOCDJyZ6quA
# Notebook 13: RNN Basics
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input

def build_model(NUM_CLASSES=1):
    # BERT Layers
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
    preprocessed_text = bert_preprocess_model(text_input)
    outputs = bert_model(preprocessed_text)

    # NN Layers
    l = tf.keras.layers.Dropout(0.2, name="dropout")(outputs['pooled_output'])
    l = tf.keras.layers.Dense(512, activation='tanh', name='hidden1', kernel_regularizer='l2')(l)
    l = tf.keras.layers.Dropout(0.2, name="dropout")(outputs['pooled_output'])
    l = tf.keras.layers.Dense(256, activation='tanh', name='hidden1', kernel_regularizer='l2')(l)
    l = tf.keras.layers.Dense(128, activation='tanh', name='hidden2', kernel_regularizer='l2')(l)
    l = tf.keras.layers.Dense(NUM_CLASSES, activation='sigmoid', name='output')(l)

    # Final Model
    model = tf.keras.Model(inputs=[text_input], outputs=[l])

    METRICS = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)

    return model

Alternative model created by ChatGPT. Will compare with my implementation.

Prompt: You are a machine learning expert. I need Python code that uses Tensorflow and BERT to take in a natural language question as input  (for example, Who is the instructor with the highest salary?) and outputs a vector of ones and zeros (for example, [0, 1, 1, 0, 0]) where each entry in the vector is whether a table in a particular database schema is mentioned in the natural language question or not. The output could also be many dimensions if that would be easier, where each dimension is either a 1 or 0 where each entry in the vector is whether a table in a particular database schema is mentioned in the natural language question or not. This model is to be applied on a single database schema and should therefore each output should have the same number of dimensions.

In [None]:
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3', trainable=True)

# Tokenize the input question using BERT tokenizer
tokenizer = bert_layer.resolved_object.vocab
question_tokens = tokenizer.tokenize(question)
question_tokens = ["[CLS]"] + question_tokens + ["[SEP]"]
question_token_ids = tokenizer.convert_tokens_to_ids(question_tokens)

# Pad the input question to the maximum sequence length expected by BERT
max_seq_length = 128
question_input_ids = question_token_ids + [0] * (max_seq_length - len(question_token_ids))
question_input_mask = [1] * len(question_token_ids) + [0] * (max_seq_length - len(question_token_ids))
question_segment_ids = [0] * max_seq_length

# Create a function to predict the table mentions from the BERT output
def build_model(NUM_CLASSES=1):
    inputs = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)

    bert_output = bert_layer([inputs, input_mask, segment_ids])[0]
    pooled_output = tf.keras.layers.GlobalMaxPool1D()(bert_output)
    dense_layer = tf.keras.layers.Dense(NUM_CLASSES, activation='sigmoid')(pooled_output)
    
    model = tf.keras.models.Model(inputs=[inputs, input_mask, segment_ids], outputs=[dense_layer])

    return model

In [18]:
print(X_train.shape)
print(y_train.shape)

model = build_model(y_train.shape[1])
model.fit(X_train, y_train, epochs=3)
model.save('college_2')

(136,)
(136, 58)
Epoch 1/3


2023-05-09 11:37:51.446498: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-05-09 11:37:51.495605: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]]
2023-05-09 11:37:51.495667: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_2' with dtype int32 and shape [?,128]
	 [[{{node

Epoch 2/3
Epoch 3/3


2023-05-09 11:38:22.366917: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-09 11:38:22.446700: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-05-09 11:38:22.458043: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-09 11:38:22.498783: I ten

INFO:tensorflow:Assets written to: college_2/assets


INFO:tensorflow:Assets written to: college_2/assets


Repeating with the other 3 databases:

In [66]:
trainDF = loadTrainDF(schemaDoc, queryDoc, 'college_2')

# hr_1, store_1, and soccer_2
trainDF_hr = loadTrainDF(schemaDoc, queryDoc, 'hr_1')
X_train, X_test, y_train, y_test = train_test_split(trainDF_hr['question'], trainDF_hr.drop('question',axis=1), test_size=0.2,
                                                            random_state=42)
model_hr = build_model(y_train.shape[1])
model_hr.fit(X_train, y_train, epochs=5)
model_hr.save('hr_1')

trainDF_store = loadTrainDF(schemaDoc, queryDoc, 'store_1')
X_train, X_test, y_train, y_test = train_test_split(trainDF_store['question'], trainDF_store.drop('question',axis=1), test_size=0.2,
                                                            random_state=42)
model_store = build_model(y_train.shape[1])
model_store.fit(X_train, y_train, epochs=5)
model_store.save('store_1')

trainDF_soccer = loadTrainDF(schemaDoc, queryDoc, 'soccer_2')
X_train, X_test, y_train, y_test = train_test_split(trainDF_soccer['question'], trainDF_soccer.drop('question',axis=1), test_size=0.2,
                                                            random_state=42)
model_soccer = build_model(y_train.shape[1])
model_soccer.fit(X_train, y_train, epochs=5)
model_soccer.save('soccer_2')

Epoch 1/5


2023-05-09 01:15:49.366984: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-05-09 01:15:49.422861: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]]
2023-05-09 01:15:49.422925: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_2' with dtype int32 and shape [?,128]
	 [[{{node

Epoch 2/5


KeyboardInterrupt: 

In [19]:
print('Loading College Model...')
college_model = tf.keras.models.load_model('college_2')

Loading College Model...


2023-05-09 11:38:38.264787: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 11:38:38.265066: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 11:38:38.265148: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 11:38:38.265393: I tensorflow/core/

In [16]:
print('Loading HR Model...')
hr_model = tf.keras.models.load_model('hr_1')
print('Loading Soccer Model...')
soccer_model = tf.keras.models.load_model('soccer_2')
print('Loading Store Model...')
store_model = tf.keras.models.load_model('store_1')

Loading College Model...


2023-05-09 00:43:20.277814: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:20.278167: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:20.278247: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:20.278551: I tensorflow/core/

Loading HR Model...


2023-05-09 00:43:26.869968: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:26.870326: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:26.870407: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:26.870715: I tensorflow/core/

Loading Soccer Model...


2023-05-09 00:43:33.423407: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:33.423758: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:33.423838: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:33.424146: I tensorflow/core/

Loading Store Model...


2023-05-09 00:43:39.709663: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:39.710021: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:39.710102: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-09 00:43:39.710420: I tensorflow/core/

In [22]:
import random
import sys

def randomTest(model, model_name, schemaDoc, queryDoc):
    choice = 1

    tableEmbeddings, columnEmbeddings = getEmbeddings(schemaDoc, queryDoc, model_name)
    embeddings = np.concatenate((tableEmbeddings, columnEmbeddings),axis=1)
    # np.set_printoptions(threshold=sys.maxsize)

    ridx = random.randint(0, embeddings.shape[0])
    target_vect = embeddings[ridx]

    # model_name == db_id
    while choice == 1:

        schemas = {}

        for idx, iSchema in enumerate(schemaDoc):
            dbid = iSchema['db_id']
            schemaObj = Schema(dbid)
            for columnName in iSchema['table_names_original']:
                schemaObj.tableNames.append(columnName)
            for columnName in iSchema['column_names_original']:
                schemaObj.columnNames.append(columnName[1])
                schemaObj.columnAtributes.append(columnName[0])

            schemas[dbid] = schemaObj

        allNames = schemas[model_name].tableNames + schemas[model_name].columnNames
        tablesInQuestion = []
        columnsInQuestion = []

        print(schemas[model_name].tableNames)

        for i in range(len(target_vect)):
            if target_vect[i] == 1:
                if i < len(schemas[model_name].tableNames):
                    tablesInQuestion.append(allNames[i])
                else:
                    columnsInQuestion.append(allNames[i])

        question = input('Enter a question for DB '+model_name+' with the following characteristics: \nTables: '+str(tablesInQuestion)+'; Columns: '+str(columnsInQuestion))

        words = tokenizer.tokenize(question)
        processed_words = [stemmer.stem(word) for word in words if word.lower() not in stopWordSet]
        # print(processed_words)
        processed_question = ' '.join(processed_words)

        predictions = model.predict([processed_question])

        prediction_vect = []
        for prediction in predictions[0]:
            if prediction > 0.5:
                prediction_vect.append(1)
            else:
                prediction_vect.append(0)

        print('Question: '+question)
        print(predictions)
        print('***********************************************')
        print('PREDICTION:')
        print(prediction_vect)
        print('TARGET:')
        print(target_vect)

        choice = int(input('Repeat (1) or new random test (0)'))


while True:
    randomTest(college_model, 'college_2', schemaDoc, queryDoc)

['classroom', 'department', 'course', 'instructor', 'section', 'teaches', 'student', 'takes', 'advisor', 'time_slot', 'prereq']
Question: classroom classroom classroom classroom classroom classroom classroom classroom classroom classroom classroom classroom classroom classroom classroom classroom 
[[0.15410943 0.09741199 0.28269947 0.4000772  0.0972553  0.09487105
  0.29103547 0.07984108 0.13751864 0.08364235 0.13460648 0.19937822
  0.17230488 0.15687177 0.11836775 0.395916   0.18997735 0.12673144
  0.22216529 0.27905402 0.44046614 0.12696187 0.33876857 0.40189594
  0.43039954 0.15378535 0.18921882 0.13705923 0.16738704 0.17172946
  0.17972003 0.06912852 0.07874557 0.31421578 0.32854316 0.06691922
  0.24260859 0.1863203  0.29920134 0.317952   0.36932003 0.07692096
  0.24869633 0.3254153  0.10608226 0.18786171 0.23969868 0.09521768
  0.0736661  0.14312504 0.08012425 0.07811227 0.10399833 0.04434225
  0.05793766 0.10028602 0.30232757 0.08592953]]
*****************************************

ValueError: invalid literal for int() with base 10: ''