## Stackoverflow

### Importing Libs

In [1]:
import collections
import pathlib

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as tf_text
from tensorflow.keras import layers, losses, utils
from tensorflow.keras.layers import TextVectorization

2023-05-19 07:54:36.951087: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-19 07:54:37.188522: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-19 07:54:37.191083: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Explore Data

In [3]:
data_url = "https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz"

dataset_dir = utils.get_file(
    origin=data_url, untar=True, cache_dir="stack_overflow", cache_subdir=""
)

dataset_dir = pathlib.Path(dataset_dir).parent

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/stack_overflow_16k.tar.gz


In [4]:
list(dataset_dir.iterdir())

[PosixPath('/tmp/.keras/aclImdb'),
 PosixPath('/tmp/.keras/train'),
 PosixPath('/tmp/.keras/aclImdb_v1.tar.gz'),
 PosixPath('/tmp/.keras/README.md'),
 PosixPath('/tmp/.keras/stack_overflow_16k.tar.gz'),
 PosixPath('/tmp/.keras/test')]

In [5]:
train_dir = dataset_dir / "train"
list(train_dir.iterdir())

[PosixPath('/tmp/.keras/train/python'),
 PosixPath('/tmp/.keras/train/java'),
 PosixPath('/tmp/.keras/train/csharp'),
 PosixPath('/tmp/.keras/train/javascript')]

In [6]:
sample_file = train_dir / "python/1755.txt"

with open(sample_file) as f:
    print(f.read())

why does this blank program print true x=true.def stupid():.    x=false.stupid().print x



### Load Data

In [7]:
SEED = 42
tf.keras.utils.set_random_seed(SEED)

In [8]:
batch_size = 32

raw_train_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset="training",
    seed=SEED,
)

Found 8000 files belonging to 4 classes.
Using 6400 files for training.


In [9]:
for text_batch, label_batch in raw_train_ds.take(1):
    for i in range(5):
        print("Question: ", text_batch.numpy()[i].decode("utf-8"))
        print("Label:", label_batch.numpy()[i])
        print("*" * 150)

Question:  "blank multiline textbox to .txt file i'm definitely using the wrong method. actually i'm trying to convert from java to blank and it's beginning to become tough ....anyway, i have a textbox1 that is multiline, i write to it by for looping an arraylist...the textbox1 looks like this:..website: https://google.dk.firmanavn: google llc.email: google@gmail.com.cvr: 123456.gscore: 1.glink: googlepagespeedlink...the code that i use right now, which manages to create a file, but it ends up empty. i am surely doing something wrong, and i'm unsure how to write the textbox to the file...private void button3_click(object sender, eventargs e).    {.        stream mystream;.        savefiledialog savefiledialog1 = new savefiledialog();..        savefiledialog1.filter = ""txt files (*.txt)|"";.        savefiledialog1.filterindex = 2;.        savefiledialog1.restoredirectory = true;...        if (savefiledialog1.showdialog() == dialogresult.ok).        {.            string path = path.getf

2023-05-19 07:56:57.067496: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6400]
	 [[{{node Placeholder/_0}}]]
2023-05-19 07:56:57.068508: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6400]
	 [[{{node Placeholder/_0}}]]


In [64]:
for i, label in enumerate(raw_train_ds.class_names):
    print("Label", i, "corresponds to", label)

Label 0 corresponds to csharp
Label 1 corresponds to java
Label 2 corresponds to javascript
Label 3 corresponds to python


In [67]:
raw_val_ds = utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset="validation",
    seed=SEED,
)

Found 8000 files belonging to 4 classes.
Using 1600 files for validation.


In [68]:
test_dir = dataset_dir / "test"
raw_test_ds = utils.text_dataset_from_directory(test_dir, batch_size=batch_size)

Found 8000 files belonging to 4 classes.


### Data preparation

In [72]:
VOCAB_SIZE = 10000

multi_hot_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE, output_mode="multi_hot"
)

In [73]:
MAX_SEQUENCE_LENGTH = 250

int_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

In [75]:
train_text = raw_train_ds.map(lambda text, labels: text)
multi_hot_vectorize_layer.adapt(train_text)
int_vectorize_layer.adapt(train_text)

2023-05-19 08:14:28.629284: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6400]
	 [[{{node Placeholder/_0}}]]
2023-05-19 08:14:28.630009: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6400]
	 [[{{node Placeholder/_0}}]]


In [76]:
def multi_hot_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return multi_hot_vectorize_layer(text), label

In [77]:
def int_vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return int_vectorize_layer(text), label

In [78]:
text_batch, label_batch = next(iter(raw_train_ds))
first_question, first_label = text_batch[0], label_batch[0]
print("Question", first_question)
print("Label", first_label)

Question tf.Tensor(b'"filter abstract classes from list i have a new list&lt;abstractclass&gt; () that contains some different implementations. i want to filter out some implementations due to specific conditions. how to do it properly?..1) if (condition1 &amp;&amp; type is implementation1) .....2) if (condition2 &amp;&amp; implementation.name == ""implementation 1"") .....3) if (condition3 &amp;&amp; implementation.type == enumtype.type1) ......i think 1) is bad, 2) does not work at compile time 3) may be good?..any suggestions for other desgin?..(coudn\'t post it on programmers.se because of ban)..edit:.more details. imagine abstract class (or interface):..class abstract messageprinter.{ .    void print (string message);.}...and class consoleprinter : messageprinter which prints message on console. so when due to some reasons i want to stop printing message on console, i need to remove that implementation from my list&lt;messageprinter&gt;. but what if consoleprinter is wrapped using

In [79]:
print(
    "'multi_hot' vectorized question:",
    multi_hot_vectorize_text(first_question, first_label)[0],
)

'multi_hot' vectorized question: tf.Tensor([[1. 0. 1. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)


In [81]:
print(
    "'int' vectorized question:",
    int_vectorize_text(first_question, first_label)[0],
)

'int' vectorized question: tf.Tensor(
[[ 899  537  363   31   54    3   17    5   15    1   14  296   83  177
  3659    3   46    4  899   94   83 3659  916    4  304 1724   24    4
    40   11    1   10 2723  143  122    6    1   57   10 3712  143    1
   799   25   92   10    1  143    1    1    3  291   25    6  860   57
   113   20  139   59  818  105   92  454   33    1  921   12  144    1
   576   11   37    1  193    9    1 1067 2700  537   29   45    1  537
  9421   42   75   18    1   29    1 9421   66  514  262   37  332   50
    44  916    4   83 1890    3   46    4  469  650  262   37  332    3
    78    4  278   14  799   31   23    1   26   55   10    1    6 4076
    47    1    1 9421   22    1 9421  313   42   75   18  262    1    1
    73   14  122  800   72   33 4880  299   92   21  847 2139    5  547
    26    6   20 1946    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0

In [82]:
print("1289 ---> ", int_vectorize_layer.get_vocabulary()[1289])
print("313 ---> ", int_vectorize_layer.get_vocabulary()[313])
print(f"Vocabulary size: {len(int_vectorize_layer.get_vocabulary())}")

1289 --->  roman
313 --->  source
Vocabulary size: 10000


In [83]:
multi_hot_train_ds = raw_train_ds.map(multi_hot_vectorize_text)
multi_hot_val_ds = raw_val_ds.map(multi_hot_vectorize_text)
multi_hot_test_ds = raw_test_ds.map(multi_hot_vectorize_text)

int_train_ds = raw_train_ds.map(int_vectorize_text)
int_val_ds = raw_val_ds.map(int_vectorize_text)
int_test_ds = raw_test_ds.map(int_vectorize_text)

### Configure dataset performance

In [84]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [85]:
multi_hot_train_ds = configure_dataset(multi_hot_train_ds)
multi_hot_val_ds = configure_dataset(multi_hot_val_ds)
multi_hot_test_ds = configure_dataset(multi_hot_test_ds)

int_train_ds = configure_dataset(int_train_ds)
int_val_ds = configure_dataset(int_val_ds)
int_test_ds = configure_dataset(int_test_ds)

### Train model

In [86]:
bag_of_words_model = tf.keras.Sequential([layers.Dense(4)])

bag_of_words_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"],
)

history = bag_of_words_model.fit(
    multi_hot_train_ds, validation_data=multi_hot_val_ds, epochs=10
)

Epoch 1/10


2023-05-19 08:15:05.611817: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_17' with dtype resource
	 [[{{node Placeholder/_17}}]]
2023-05-19 08:15:05.612520: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [6400]
	 [[{{node Placeholder/_0}}]]




2023-05-19 08:15:06.792427: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype int64
	 [[{{node Placeholder/_20}}]]
2023-05-19 08:15:06.792801: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype int64
	 [[{{node Placeholder/_20}}]]


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [87]:
def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential(
        [
            layers.Embedding(vocab_size, 64, mask_zero=True),
            layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
            layers.GlobalMaxPooling1D(),
            layers.Dense(num_labels),
        ]
    )
    return model

In [27]:
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=4)
int_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer="adam",
    metrics=["accuracy"],
)
history = int_model.fit(int_train_ds, validation_data=int_val_ds, epochs=5)

Epoch 1/5


2023-05-19 07:59:33.899371: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype int64
	 [[{{node Placeholder/_20}}]]
2023-05-19 07:59:33.900247: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype int32 and shape [6400]
	 [[{{node Placeholder/_4}}]]




2023-05-19 07:59:37.416445: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_17' with dtype resource
	 [[{{node Placeholder/_17}}]]
2023-05-19 07:59:37.417612: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype int64
	 [[{{node Placeholder/_20}}]]


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [28]:
print("Linear model on binary vectorized data:")
print(bag_of_words_model.summary())

Linear model on binary vectorized data:
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 40004     
                                                                 
Total params: 40,004
Trainable params: 40,004
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
print("ConvNet model on int vectorized data:")
print(int_model.summary())

ConvNet model on int vectorized data:
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          640064    
                                                                 
 conv1d (Conv1D)             (None, None, 64)          20544     
                                                                 
 global_max_pooling1d (Globa  (None, 64)               0         
 lMaxPooling1D)                                                  
                                                                 
 dense_1 (Dense)             (None, 4)                 260       
                                                                 
Total params: 660,868
Trainable params: 660,868
Non-trainable params: 0
_________________________________________________________________
None


In [30]:
bag_of_words_loss, bag_of_words_accuracy = bag_of_words_model.evaluate(
    multi_hot_test_ds
)
int_loss, int_accuracy = int_model.evaluate(int_test_ds)

print(f"Bag of words model accuracy: {bag_of_words_accuracy:2.2%}")
print(f"Int model accuracy: {int_accuracy:2.2%}")

 14/250 [>.............................] - ETA: 0s - loss: 0.5062 - accuracy: 0.8103 

2023-05-19 08:00:02.362915: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_18' with dtype int64
	 [[{{node Placeholder/_18}}]]
2023-05-19 08:00:02.363980: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_17' with dtype resource
	 [[{{node Placeholder/_17}}]]


 18/250 [=>............................] - ETA: 1s - loss: 0.5101 - accuracy: 0.8264

2023-05-19 08:00:03.587790: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_17' with dtype resource
	 [[{{node Placeholder/_17}}]]
2023-05-19 08:00:03.588171: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype int64
	 [[{{node Placeholder/_20}}]]


Bag of words model accuracy: 81.41%
Int model accuracy: 80.92%


### Export model

In [31]:
export_model = tf.keras.Sequential(
    [
        multi_hot_vectorize_layer,
        bag_of_words_model,
        layers.Activation("sigmoid"),
    ]
)

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=["accuracy"],
)

loss, accuracy = export_model.evaluate(raw_test_ds)
print(f"Accuracy: {bag_of_words_accuracy:2.2%}")

2023-05-19 08:00:15.835836: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [8000]
	 [[{{node Placeholder/_0}}]]
2023-05-19 08:00:15.836159: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype int32 and shape [8000]
	 [[{{node Placeholder/_4}}]]


Accuracy: 81.41%


In [32]:
def get_string_labels(predicted_scores_batch):
    predicted_int_labels = tf.math.argmax(predicted_scores_batch, axis=1)
    predicted_labels = tf.gather(raw_train_ds.class_names, predicted_int_labels)
    return predicted_labels

### Inference

In [33]:
inputs = [
    "how do I extract keys from a dict into a list?",  # 'python'
    "debug public static void main(string[] args) {...}",  # 'java'
]
predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)
for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Predicted label: ", label.numpy())

Question:  how do I extract keys from a dict into a list?
Predicted label:  b'python'
Question:  debug public static void main(string[] args) {...}
Predicted label:  b'java'


### Author of Illiad translations

### Explore Data

In [38]:
DIRECTORY_URL = "file:///neuralize/notebooks/datasets/illiad/"
FILE_NAMES = ["cowper.txt", "derby.txt", "butler.txt"]

for name in FILE_NAMES:
    text_dir = utils.get_file(name, origin=DIRECTORY_URL + name)

parent_dir = pathlib.Path(text_dir).parent
list(parent_dir.iterdir())

Downloading data from file:///neuralize/notebooks/datasets/illiad/cowper.txt
Downloading data from file:///neuralize/notebooks/datasets/illiad/derby.txt
Downloading data from file:///neuralize/notebooks/datasets/illiad/butler.txt


[PosixPath('/root/.keras/datasets/butler.txt'),
 PosixPath('/root/.keras/datasets/cowper.txt'),
 PosixPath('/root/.keras/datasets/derby.txt')]

In [39]:
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

In [40]:
labeled_data_sets = []

for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(str(parent_dir / file_name))
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

In [41]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
VALIDATION_SIZE = 5000

In [42]:
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False
)

In [43]:
for text, label in all_labeled_data.take(10):
    print("Sentence: ", text.numpy())
    print("Label:", label.numpy())

2023-05-19 08:07:23.828818: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype string and shape [1]
	 [[{{node Placeholder/_9}}]]


Sentence:  b'reach the ships, and till night falls at the going down of the sun."'
Label: 2
Sentence:  b'For I profess some courage, even I.'
Label: 0
Sentence:  b'To whom he gives the prize of victory,'
Label: 1
Sentence:  b"Old Chryses followed to Achaia's camp,"
Label: 0
Sentence:  b'a god, and with the captains of the Cretans round him. Often did'
Label: 2
Sentence:  b'Upon the threshold pausing, thus he spoke:'
Label: 1
Sentence:  b'battalions. The battle was now in array and they stood face to face'
Label: 2
Sentence:  b'your eyes none of the sharpest, but you are always laying down the law.'
Label: 2
Sentence:  b"And, as her hand she clasp'd, address'd her thus:"
Label: 1
Sentence:  b'fighting till we burn our dead; hereafter we will fight anew, till'
Label: 2


### Data Prep

In [44]:
tokenizer = tf_text.UnicodeScriptTokenizer()

In [45]:
def tokenize(text, unused_label):
    lower_case = tf_text.case_fold_utf8(text)
    return tokenizer.tokenize(lower_case)

In [46]:
tokenized_ds = all_labeled_data.map(tokenize)

In [47]:
for text_batch in tokenized_ds.take(5):
    print("Tokens: ", text_batch.numpy())

2023-05-19 08:07:54.259431: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype string and shape [1]
	 [[{{node Placeholder/_9}}]]


Tokens:  [b'reach' b'the' b'ships' b',' b'and' b'till' b'night' b'falls' b'at'
 b'the' b'going' b'down' b'of' b'the' b'sun' b'."']
Tokens:  [b'for' b'i' b'profess' b'some' b'courage' b',' b'even' b'i' b'.']
Tokens:  [b'to' b'whom' b'he' b'gives' b'the' b'prize' b'of' b'victory' b',']
Tokens:  [b'old' b'chryses' b'followed' b'to' b'achaia' b"'" b's' b'camp' b',']
Tokens:  [b'a' b'god' b',' b'and' b'with' b'the' b'captains' b'of' b'the'
 b'cretans' b'round' b'him' b'.' b'often' b'did']


In [48]:
tokenized_ds = configure_dataset(tokenized_ds)

vocab_dict = collections.defaultdict(lambda: 0)
for toks in tokenized_ds.as_numpy_iterator():
    for tok in toks:
        vocab_dict[tok] += 1

vocab = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
vocab = [token for token, count in vocab]
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])

2023-05-19 08:08:03.337085: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype string and shape [1]
	 [[{{node Placeholder/_4}}]]


Vocab size:  10000
First five vocab entries: [b',', b'the', b'and', b"'", b'of']


In [49]:
keys = vocab
values = range(
    2, len(vocab) + 2
)

init = tf.lookup.KeyValueTensorInitializer(
    keys, values, key_dtype=tf.string, value_dtype=tf.int64
)

num_oov_buckets = 1
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)

In [50]:
def preprocess_text(text, label):
    standardized = tf_text.case_fold_utf8(text)
    tokenized = tokenizer.tokenize(standardized)
    vectorized = vocab_table.lookup(tokenized)
    return vectorized, label

In [51]:
example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text.numpy())
vectorized_text, example_label = preprocess_text(example_text, example_label)
print("Vectorized sentence: ", vectorized_text.numpy())

2023-05-19 08:08:25.792617: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype string and shape [1]
	 [[{{node Placeholder/_9}}]]


Sentence:  b'reach the ships, and till night falls at the going down of the sun."'
Vectorized sentence:  [ 359    3   68    2    4  162  257 1139   34    3  749  112    6    3
  537   52]


In [52]:
all_encoded_data = all_labeled_data.map(preprocess_text)

### Splitting Data

In [53]:
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)

In [54]:
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)

In [55]:
sample_text, sample_labels = next(iter(validation_data))
print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_labels.shape)
print("First text example: ", sample_text[0])
print("First label example: ", sample_labels[0])

2023-05-19 08:09:24.152026: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype string and shape [1]
	 [[{{node Placeholder/_4}}]]
2023-05-19 08:09:24.155205: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_9' with dtype string and shape [1]
	 [[{{node Placeholder/_9}}]]


Text batch shape:  (64, 17)
Label batch shape:  (64,)
First text example:  tf.Tensor(
[ 359    3   68    2    4  162  257 1139   34    3  749  112    6    3
  537   52    0], shape=(17,), dtype=int64)
First label example:  tf.Tensor(2, shape=(), dtype=int64)


In [56]:
vocab_size += 2

In [57]:
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

### Training

In [58]:
model = create_model(vocab_size=vocab_size, num_labels=3)

model.compile(
    optimizer="adam",
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

history = model.fit(train_data, validation_data=validation_data, epochs=3)

Epoch 1/3


2023-05-19 08:09:46.424592: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_4' with dtype string and shape [1]
	 [[{{node Placeholder/_4}}]]
2023-05-19 08:09:46.425202: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-05-19 08:09:57.141756: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 5189 of 50000
2023-05-19 08:10:07.140888: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:392] Filling up shuffle buffer (this may take a while): 17181 of 50000
2023-05-19 08:1

     28/Unknown - 45s 6ms/step - loss: 1.0405 - accuracy: 0.4023

2023-05-19 08:10:31.564174: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:417] Shuffle buffer filled.


    688/Unknown - 49s 5ms/step - loss: 0.5192 - accuracy: 0.7669

2023-05-19 08:10:35.621754: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_20' with dtype int64
	 [[{{node Placeholder/_20}}]]
2023-05-19 08:10:35.622283: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_19' with dtype resource
	 [[{{node Placeholder/_19}}]]


Epoch 2/3
Epoch 3/3


In [59]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print(f"Accuracy: {accuracy:2.2%}")

Loss:  0.37869754433631897
Accuracy: 84.90%


### Export model

In [60]:
preprocess_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=tf_text.case_fold_utf8,
    split=tokenizer.tokenize,
    output_mode="int",
    output_sequence_length=MAX_SEQUENCE_LENGTH,
)

preprocess_layer.set_vocabulary(vocab)

In [61]:
export_model = tf.keras.Sequential(
    [preprocess_layer, model, layers.Activation("sigmoid")]
)

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=["accuracy"],
)

In [62]:
test_ds = all_labeled_data.take(VALIDATION_SIZE).batch(BATCH_SIZE)
test_ds = configure_dataset(test_ds)

loss, accuracy = export_model.evaluate(test_ds)

print("Loss: ", loss)
print(f"Accuracy: {accuracy:2.2%}")

2023-05-19 08:10:51.000393: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [1]
	 [[{{node Placeholder/_0}}]]
2023-05-19 08:10:51.331143: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'text_vectorization_2/UnicodeScriptTokenize/RaggedGather/cond/zeros/Reshape/text_vectorization_2/UnicodeScriptTokenize/RaggedGather/strided_slice' with dtype int64
	 [[{{node text_vectorization_2/UnicodeScriptTokenize/RaggedGather/cond/zeros/Reshape/text_vectorization_2/UnicodeScriptTokenize/RaggedGather/strided_slice}}]]
2023-05-19 08:10:51.336467: I tensorflow/core/common_runtime/executor.cc:

Loss:  0.5040056705474854
Accuracy: 80.02%


### Inference

In [63]:
inputs = [
    "Join'd to th' Ionians with their flowing robes,",  # Label: 1
    "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
    "And with loud clangor of his arms he fell.",  # Label: 0
]

predicted_scores = export_model.predict(inputs)
predicted_labels = tf.math.argmax(predicted_scores, axis=1)

for input, label in zip(inputs, predicted_labels):
    print("Question: ", input)
    print("Predicted label: ", label.numpy())

2023-05-19 08:10:57.682583: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'sequential_4/text_vectorization_2/UnicodeScriptTokenize/RaggedGather/cond/zeros/Reshape/sequential_4/text_vectorization_2/UnicodeScriptTokenize/RaggedGather/strided_slice' with dtype int64
	 [[{{node sequential_4/text_vectorization_2/UnicodeScriptTokenize/RaggedGather/cond/zeros/Reshape/sequential_4/text_vectorization_2/UnicodeScriptTokenize/RaggedGather/strided_slice}}]]
2023-05-19 08:10:57.689666: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'sequential_4/text_vectorization_2/UnicodeScriptTokenize/RaggedGather/cond/cond/range/sequential_4/t

Question:  Join'd to th' Ionians with their flowing robes,
Predicted label:  1
Question:  the allies, and his armour flashed about him so that he seemed to all
Predicted label:  2
Question:  And with loud clangor of his arms he fell.
Predicted label:  0
