In [46]:
import json

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [47]:
class Schema:
    def __init__ (self, id):
        self.id = id
        self.tableNames = []

In [48]:
stopWordSet = nltk.corpus.stopwords.words('english')
stemmer = nltk.SnowballStemmer(language='english')
tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z0-9]+")

# Data Analysis and Preprocessing

We note that the databases with the largest amounts of queries in Spider are college_2/college_1, hr_1, store_1, and soccer_2.
We will create 4 models applied on each of these databases.

In [49]:
# Loading Spider Data
with open('./spider-data/tables.json', 'r') as f:
  schemaDoc = json.load(f)

with open('./spider-data/train_spider.json', 'r') as f:
  queryDoc = json.load(f)

schemas = {}
db_counts = {}

for idx, iquery in enumerate(queryDoc):
    db_counts[iquery['db_id']] = db_counts.get(iquery['db_id'], 0) + 1

print(sorted(db_counts.items(), key=lambda x:x[1], reverse=True))

[('college_2', 170), ('college_1', 164), ('hr_1', 124), ('store_1', 112), ('soccer_2', 106), ('bike_1', 104), ('music_1', 100), ('hospital_1', 100), ('music_2', 100), ('dorm_1', 100), ('allergy_1', 98), ('movie_1', 98), ('flight_1', 96), ('driving_school', 93), ('cre_Doc_Tracking_DB', 90), ('department_store', 88), ('customers_and_addresses', 88), ('activity_1', 88), ('network_2', 86), ('products_gen_characteristics', 86), ('game_1', 86), ('chinook_1', 84), ('cre_Theme_park', 84), ('cre_Docs_and_Epenses', 84), ('customers_and_invoices', 82), ('sakila_1', 82), ('baseball_1', 82), ('e_learning', 82), ('cre_Drama_Workshop_Groups', 82), ('wine_1', 82), ('flight_4', 82), ('customers_card_transactions', 80), ('apartment_rentals', 80), ('formula_1', 80), ('loan_1', 80), ('manufactory_1', 80), ('tracking_grants_for_research', 78), ('inn_1', 74), ('college_3', 74), ('voter_2', 72), ('csu_1', 70), ('club_1', 70), ('election', 68), ('student_1', 68), ('icfp_1', 66), ('music_4', 60), ('tracking_or

### Create outputs

We first filter the query and schema doc entries to the queries linked to the DB labeled by `schema`.
In our application, it will be `college_2`, `hr_1`, `store_1`, and `soccer_2`.

Then we create our outputs (y) for the model. The outputs are in the following format:

e.g. `[0,0,0,1,0,1,1,...]`

Where the *nth* index corresponds to the *nth* table in the schema. The value contains a 1 if the table is referenced in the SQL query, and a 0 if it is not.

In [50]:
def getTableEmbeddings(schemaDoc, queryDoc, schemaName):

    filteredQueryDoc = [query for query in queryDoc if query['db_id'] == schemaName]
    filteredSchemaDoc = [schema for schema in schemaDoc if schema['db_id'] == schemaName]

    for idx, iSchema in enumerate(filteredSchemaDoc):
        dbid = iSchema['db_id']
        test = Schema(dbid)
        for columnName in iSchema['table_names_original']:
            test.tableNames.append(columnName)
        print("Schema", idx, ":", test.id, ":", test.tableNames)
        schemas[dbid] = test

    tables = []
    nl_queries = []

    for idx, iquery in enumerate(filteredQueryDoc):
        # print(schemas)
        # Initialize a blank array with the length of the number of tables
        tablesArray = [0] * len(schemas[iquery['db_id']].tableNames)
        # Get all the tokens from teh particular query we are looking at
        queryTokens = iquery['query_toks']

        # Search through all the tokens in the given query
        for idx, queryToken in enumerate(queryTokens):
            # Only look at tokens that come after the word "from," since those are the table names
            if (idx > 0) & (queryTokens[idx - 1].lower() == "from") | (queryTokens[idx - 1].lower() == "join"):
                # Search through all of the table names in the database we are looking at

                # TODO: Use spacy to tag all table names using named entity recognition (NER).
                for jdx, tableName in enumerate(schemas[iquery['db_id']].tableNames):
                    # If the table names match, indicate it in tablesArray
                    if tableName.lower() == queryToken.lower():
                        tablesArray[jdx] = 1
    
        tables.append(tablesArray)
    
    return np.array(tables)

Finally, we create the Pandas Dataframe for our training set, then use `train_test_split` to create training and testing data.

e.g.

| x     | y |
|----------|-----|
| "Give me a list of all users over the age of 20"     | [0,1,1,0,0,1]  |
| "Can you give me a list of the student rosters sorted in alphabetical order?"     | [0,1,1,1]  |

In [51]:
trainDF = pd.DataFrame

def loadTrainDF(schemaDoc, queryDoc, schemaName):
 
    tableEmbeddings = getTableEmbeddings(schemaDoc, queryDoc, schemaName)

    questions = []
    filteredQueryDoc = [query for query in queryDoc if query['db_id'] == schemaName]

    # Input processing (Question)
    for query in filteredQueryDoc:
        question = query['question']

        # Stem and tokenize each word in the question + remove all stopwords from the question.
        words = tokenizer.tokenize(question)
        processed_words = [stemmer.stem(word) for word in words if word.lower() not in stopWordSet]
        # print(processed_words)
        processed_question = ' '.join(processed_words)
        print(processed_question)
        print(type(processed_question))
        questions.append(processed_question)

    trainDF = pd.DataFrame(data={'question': questions})

    # Output processing (Table labels vector)
    for i in range(tableEmbeddings.shape[1]):
        trainDF[i] = tableEmbeddings[:,i]

    return trainDF

In [52]:
# Example: college_2
trainDF = loadTrainDF(schemaDoc, queryDoc, 'college_2')
trainDF

Schema 0 : college_2 : ['classroom', 'department', 'course', 'instructor', 'section', 'teaches', 'student', 'takes', 'advisor', 'time_slot', 'prereq']
find build room capac 50
<class 'str'>
distinct build capac greater 50
<class 'str'>
count number room lamberton build
<class 'str'>
mani classroom lamberton
<class 'str'>
name build depart whose budget averag budget
<class 'str'>
give name build depart greater averag budget
<class 'str'>
find room number room sit 50 100 student build
<class 'str'>
room number correspond build classroom seat 50 100 student
<class 'str'>
find name build depart highest budget
<class 'str'>
depart name correspond build depart greatest budget
<class 'str'>
name student highest total credit histori depart
<class 'str'>
give name student histori depart credit
<class 'str'>
mani room lamberton build
<class 'str'>
count number classroom lamberton
<class 'str'>
mani student advisor
<class 'str'>
count number student advisor
<class 'str'>
mani depart offer cours
<

Unnamed: 0,question,0,1,2,3,4,5,6,7,8,9,10
0,find build room capac 50,1,0,0,0,0,0,0,0,0,0,0
1,distinct build capac greater 50,1,0,0,0,0,0,0,0,0,0,0
2,count number room lamberton build,1,0,0,0,0,0,0,0,0,0,0
3,mani classroom lamberton,1,0,0,0,0,0,0,0,0,0,0
4,name build depart whose budget averag budget,0,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
165,name averag salari depart averag salari higher...,0,0,0,1,0,0,0,0,0,0,0
166,find name instructor salari greater least one ...,0,0,0,1,0,0,0,0,0,0,0
167,name instructor earn least one instructor biol...,0,0,0,1,0,0,0,0,0,0,0
168,find name instructor whose salari greater sala...,0,0,0,1,0,0,0,0,0,0,0


In [53]:
from sklearn.model_selection import train_test_split

# Split it into train / test subsets
X_train, X_test, y_train, y_test = train_test_split(trainDF['question'], trainDF.drop('question',axis=1), test_size=0.2,
                                                            random_state=42)

# FIXME: Anyway to add a feature similar to 'stratify=trainDF['question']', without breaking?

# Modeling

Prepare our model by loading BERT, and creating the model architecture.

In [54]:
def get_sentence_embeddings(bert_preprocess_model, bert_encoder_model, sentences = []):
    
    text_preprocessed = bert_preprocess_model(sentences)
    return bert_encoder_model(text_preprocessed)['pooled_output']

In [55]:
# Code Adopted and modified from YouTube video: https://www.youtube.com/watch?v=7kLi8u2dJz0&t=719s
preprocess_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
encoder_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4'

In [56]:
bert_preprocess_model = hub.KerasLayer(preprocess_url)
bert_model = hub.KerasLayer(encoder_url)

2023-05-08 02:12:28.718406: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-08 02:12:28.718685: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-08 02:12:28.718764: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'seq_length' with dtype int32
	 [[{{node seq_length}}]]
2023-05-08 02:12:28.718987: I tensorflow/core/

In [57]:
# ! Code Adopted and Modified from https://github.com/eclarson/MachineLearningNotebooks and https://www.youtube.com/watch?v=hOCDJyZ6quA
# Notebook 13: RNN Basics
from tensorflow.keras.layers import Dense, LSTM, Dropout, Input

def build_model(NUM_CLASSES=1):
    # BERT Layers
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
    preprocessed_text = bert_preprocess_model(text_input)
    outputs = bert_model(preprocessed_text)

    # NN Layers
    l = tf.keras.layers.Dropout(0.2, name="dropout")(outputs['pooled_output'])
    l = tf.keras.layers.Dense(256, activation='sigmoid', name='hidden1')(l)
    l = tf.keras.layers.Dense(128, activation='sigmoid', name='hidden2')(l)
    l = tf.keras.layers.Dense(NUM_CLASSES, activation='sigmoid', name='output')(l)

    # Final Model
    model = tf.keras.Model(inputs=[text_input], outputs=[l])

    METRICS = [
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)

    return model

In [58]:
print(X_train.shape)
print(y_train.shape)

model = build_model(y_train.shape[1])
model.fit(X_train, y_train, epochs=3)
model.save('college_2')

(136,)
(136, 11)
Epoch 1/3


2023-05-08 02:12:33.611745: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-05-08 02:12:33.626583: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-05-08 02:12:33.668125: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-05-08 0

Epoch 2/3
Epoch 3/3


2023-05-08 02:13:07.933135: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-05-08 02:13:07.946073: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-08 02:13:07.987483: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype int32 and shape [?,128]
	 [[{{node inputs}}]]
2023-05-08 02:13:07.987539

INFO:tensorflow:Assets written to: college_2/assets


INFO:tensorflow:Assets written to: college_2/assets


Repeating with the other 3 databases:

In [61]:
trainDF = loadTrainDF(schemaDoc, queryDoc, 'college_2')

# hr_1, store_1, and soccer_2
trainDF_hr = loadTrainDF(schemaDoc, queryDoc, 'hr_1')
X_train, X_test, y_train, y_test = train_test_split(trainDF_hr['question'], trainDF_hr.drop('question',axis=1), test_size=0.2,
                                                            random_state=42)
model_hr = build_model(y_train.shape[1])
model_hr.fit(X_train, y_train, epochs=3)
model_hr.save('hr_1')

trainDF_store = loadTrainDF(schemaDoc, queryDoc, 'store_1')
X_train, X_test, y_train, y_test = train_test_split(trainDF_store['question'], trainDF_store.drop('question',axis=1), test_size=0.2,
                                                            random_state=42)
model_store = build_model(y_train.shape[1])
model_store.fit(X_train, y_train, epochs=3)
model_store.save('store_1')

trainDF_soccer = loadTrainDF(schemaDoc, queryDoc, 'soccer_2')
X_train, X_test, y_train, y_test = train_test_split(trainDF_soccer['question'], trainDF_soccer.drop('question',axis=1), test_size=0.2,
                                                            random_state=42)
model_soccer = build_model(y_train.shape[1])
model_soccer.fit(X_train, y_train, epochs=3)
model_soccer.save('soccer_2')

Schema 0 : college_2 : ['classroom', 'department', 'course', 'instructor', 'section', 'teaches', 'student', 'takes', 'advisor', 'time_slot', 'prereq']
find build room capac 50
<class 'str'>
distinct build capac greater 50
<class 'str'>
count number room lamberton build
<class 'str'>
mani classroom lamberton
<class 'str'>
name build depart whose budget averag budget
<class 'str'>
give name build depart greater averag budget
<class 'str'>
find room number room sit 50 100 student build
<class 'str'>
room number correspond build classroom seat 50 100 student
<class 'str'>
find name build depart highest budget
<class 'str'>
depart name correspond build depart greatest budget
<class 'str'>
name student highest total credit histori depart
<class 'str'>
give name student histori depart credit
<class 'str'>
mani room lamberton build
<class 'str'>
count number classroom lamberton
<class 'str'>
mani student advisor
<class 'str'>
count number student advisor
<class 'str'>
mani depart offer cours
<

2023-05-08 02:14:32.928545: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-05-08 02:14:32.977499: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]]
2023-05-08 02:14:32.977566: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_2' with dtype int32 and shape [?,128]
	 [[{{node

Epoch 2/3
Epoch 3/3


2023-05-08 02:14:56.233429: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-08 02:14:56.318494: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-05-08 02:14:56.331334: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-08 02:14:56.372601: I ten

INFO:tensorflow:Assets written to: hr_1/assets


INFO:tensorflow:Assets written to: hr_1/assets


Schema 0 : store_1 : ['artists', 'sqlite_sequence', 'albums', 'employees', 'customers', 'genres', 'invoices', 'media_types', 'tracks', 'invoice_lines', 'playlists', 'playlist_tracks']
list top 5 countri number invoic list countri name number invoic
<class 'str'>
top 5 countri number invoic mani
<class 'str'>
list top 8 countri gross total invoic size list countri name gross invoic size
<class 'str'>
name top 8 countri total invoic size size
<class 'str'>
list top 10 countri averag invoic size list countri name averag invoic size
<class 'str'>
name countri averag invoic size top countri size
<class 'str'>
find 5 custom recent purchas someth list custom first last name
<class 'str'>
first last name 5 custom purchas someth recent
<class 'str'>
find top 10 custom total number order list custom first last name number total order
<class 'str'>
top 10 custom first last name total number order mani order make
<class 'str'>
list top 10 custom total gross sale list custom first last name total g

2023-05-08 02:15:02.655648: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-05-08 02:15:02.712849: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]]
2023-05-08 02:15:02.712911: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_2' with dtype int32 and shape [?,128]
	 [[{{node

Epoch 2/3
Epoch 3/3


2023-05-08 02:15:24.567127: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-08 02:15:24.648607: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-05-08 02:15:24.660960: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-08 02:15:24.703157: I ten

INFO:tensorflow:Assets written to: store_1/assets


INFO:tensorflow:Assets written to: store_1/assets


Schema 0 : soccer_2 : ['College', 'Player', 'Tryout']
total enrol number colleg
<class 'str'>
mani student enrol colleg
<class 'str'>
averag enrol number
<class 'str'>
mani student averag colleg enrol
<class 'str'>
mani colleg total
<class 'str'>
mani differ colleg
<class 'str'>
mani player 1000 hour train
<class 'str'>
mani differ player train 1000 hour
<class 'str'>
mani colleg 15000 student
<class 'str'>
number colleg student popul greater 15000
<class 'str'>
averag train hour player
<class 'str'>
mani hour player train averag
<class 'str'>
find name train hour player whose hour 1500
<class 'str'>
name number hour spent train player train less 1500 hour
<class 'str'>
mani differ colleg attend tryout test
<class 'str'>
mani differ colleg repres tryout
<class 'str'>
uniqu type player posit tryout
<class 'str'>
differ type player posit
<class 'str'>
mani student got accept tryout
<class 'str'>
mani student receiv yes tryout
<class 'str'>
mani student whose play role goali
<class 'str'>

2023-05-08 02:15:29.718039: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder' with dtype string and shape [?]
	 [[{{node Placeholder}}]]
2023-05-08 02:15:29.767295: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_1' with dtype int32 and shape [?,128]
	 [[{{node Placeholder_1}}]]
2023-05-08 02:15:29.767363: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder_2' with dtype int32 and shape [?,128]
	 [[{{node

Epoch 2/3
Epoch 3/3


2023-05-08 02:15:49.766741: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-08 02:15:49.845847: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype string and shape [?]
	 [[{{node inputs}}]]
2023-05-08 02:15:49.857746: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text' with dtype string and shape [?]
	 [[{{node text}}]]
2023-05-08 02:15:49.898143: I ten

INFO:tensorflow:Assets written to: soccer_2/assets


INFO:tensorflow:Assets written to: soccer_2/assets


In [None]:
rubbish_model = tf.keras.models.load_model('rubbish')

In [None]:
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf

stopWordSet = nltk.corpus.stopwords.words('english')
stemmer = nltk.SnowballStemmer(language='english')
tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z0-9]+")

def getTestData(inputfile):
    wordSense = 0
    lines = []
    senses = []

    for line in inputLines:
        if line == "1\n":
            wordSense = 1
        elif line == "2\n":
            wordSense = 2
        elif (line != "\n") & (wordSense != 0):
            lines.append(line.strip())
            senses.append(wordSense)
    trainDF = pd.DataFrame(data={'text': lines,'sense': senses})
    return trainDF

rubbishTestInput = open("testing/rubbish.txt", "r", encoding='utf-8')

lines = rubbishTestInput.readlines()
linesStripped = np.asarray([line.strip() for line in lines])

rubbish_predictions = rubbish_model.predict(linesStripped)
print(rubbish_predictions)

output_file = open('results/result_rubbish_Michael_Lennon.txt', "w", encoding='utf-8')

for prediction in rubbish_predictions:
    if prediction > 0.5:
        output_file.write('2\n')
    else:
        output_file.write('1\n')


# Rubbish Model.train()
rubbishTestInput.close()
output_file.close()