In [47]:
from tensorflow.keras import datasets
from tensorflow.keras import Sequential
from tensorflow.keras import models
from tensorflow.keras import utils
from tensorflow.keras.layers import Dense, Flatten
import numpy as np
# keras has several built in datasets for learning:
# refer to https://keras.io/api/datasets/
# each dataset has its own keyword arguments when we load it in

In [23]:
(imdb_train_data,imdb_train_labels),(imdb_test_data,imdb_test_labels) = datasets.imdb.load_data(num_words=1000,skip_top=50,maxlen=100)
# num_words: only load top 1000 frequent words
# skip_top: skip the top 50 most frequent words
# maxlen: each sample has a list of words, this sets its maxlen
# ...
(mnist_train_data, mnist_train_labels),(mnist_test_data, mnist_test_labels) = datasets.mnist.load_data()

In [24]:
def data_generator(train_data, train_labels, batch_size=1):
    while True:
        for step in range(train_labels.shape[0]//batch_size):
            yield (train_data[step*batch_size:(step+1)*batch_size],train_labels[step*batch_size:(step+1)*batch_size])
        permutes = np.random.permutation(range(train_labels.shape[0]))
        train_data = train_data[permutes]
        train_labels = train_labels[permutes]

def val_generator(test_data, test_labels, batch_size=1):
    for step in range(test_labels.shape[0]//batch_size):
            yield (test_data[step*batch_size:(step+1)*batch_size],test_labels[step*batch_size:(step+1)*batch_size])

In [25]:
data_gen = data_generator(mnist_train_data,mnist_train_labels,batch_size=64)

In [26]:
model = Sequential([Flatten(),Dense(64, activation='relu'),Dense(10, activation='softmax')])
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['sparse_categorical_accuracy'])

In [27]:
test_gen = val_generator(mnist_test_data,mnist_test_labels,batch_size=64)

In [37]:
model.fit(data_gen, steps_per_epoch=mnist_train_labels.shape[0]//64, epochs=10, validation_data=test_gen, validation_steps=mnist_test_labels.shape[0]//64//10)
# we used to need fit_generator, but now the same functionality is incorporated into fit
# note y doens't need to be specified when you use a generator or dataset object
# validation split cannot be used either (you need a validation generator or array)
# steps_per_epoch is the number of samples to read to finish 1 epoch (all training data)
# if not specified, it will exhaust the input iterable and then count 1 epoch
# batch_size is not specified because the data_gen already generate inputs in batches
# validation_steps controls how many steps of the validation generator we run 
# (if not specified, it exhaust it)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f86676ea890>

In [28]:
test_gen = val_generator(mnist_test_data,mnist_test_labels,batch_size=64)
# you don't have to specify steps if the generator can be exhausted.

In [29]:
model.evaluate(test_gen)



[0.2471640408039093, 0.942307710647583]

In [30]:
test_gen = val_generator(mnist_test_data,mnist_test_labels,batch_size=64)

In [31]:
preds = model.predict(test_gen)

In [28]:
model2 = models.clone_model(model)
model2.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['sparse_categorical_accuracy'])

In [53]:
numStep = mnist_test_labels.shape[0]//64
for epoch in range(10): 
    loss_sum,acc_sum = 0,0
    for step in range(numStep):
        train_x, train_y = next(data_gen)
        loss, acc = model2.train_on_batch(train_x,train_y)
        loss_sum+=loss
        acc_sum+=acc
    val_loss, val_acc = model2.test_on_batch(mnist_test_data, mnist_test_labels)
    print('training loss and acc on epoch:',epoch, 'is', loss_sum/numStep, acc_sum/numStep)
    print('validation loss and acc is',val_loss, val_acc)

training loss and acc on epoch: 0 is 15.025984675456316 0.7232572115384616
validation loss and acc is 4.842421054840088 0.8237000107765198
training loss and acc on epoch: 1 is 3.2034847186161923 0.8020833333333334
validation loss and acc is 1.7365195751190186 0.7680000066757202
training loss and acc on epoch: 2 is 1.279325463068791 0.7399839743589743
validation loss and acc is 1.054390788078308 0.7738999724388123
training loss and acc on epoch: 3 is 1.0081953029984083 0.7759415064102564
validation loss and acc is 0.9077612161636353 0.7901999950408936
training loss and acc on epoch: 4 is 0.8830241513175842 0.7858573717948718
validation loss and acc is 0.8889638781547546 0.795799970626831
training loss and acc on epoch: 5 is 0.8020729174216589 0.8060897435897436
validation loss and acc is 0.7860884666442871 0.8228999972343445
training loss and acc on epoch: 6 is 0.6296593815279312 0.8277243589743589
validation loss and acc is 0.733862042427063 0.8233000040054321
training loss and acc on 

In [5]:
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# ImageDataGenerator is a class for quickly creating a generator object for images with 
# certain built-in preprocessing options.
# Remember that when we use generators, all preprocessing must be done on the fly
(cifar10_train_data, cifar10_train_label), (cifar10_test_data, cifar10_test_labels) = cifar10.load_data()

In [6]:
def custom_process(data):
    return np.repeat(np.mean(data, axis=-1)[...,np.newaxis],3,axis=-1)

In [7]:
imgGen = ImageDataGenerator(fill_mode='nearest',featurewise_center=True, featurewise_std_normalization=True, width_shift_range=0.3, height_shift_range=0.3, rotation_range=180, horizontal_flip=True, preprocessing_function=custom_process)
# fill_mode='nearest' means when some pixels are emptied due to transformation, 
# they will be filled with the value of teh closest pixel. 
# There're other choices as well: constant, reflect, wrap.
# featurewise_center centers all features at 0 (by subtracting feature-wise mean), 
# it needs precomputed mean.
# featurewise_std_normalization normalizes the feature. It needs a precomputed std 
# All these precomputation are done when we call imgGen.fit
# width_shift_range gives the percentage of max width that it randomly shifts on the fly
# rotation_range gives the max angle it can rotate from the current original (same in both dir)
# horizontal_flip is intuitive
# preprocessing_function takes any function that reads in the input of 1 sample and output 
# a processed sample image. It should not change the image's dimensions!

In [8]:
imgGen.fit(cifar10_train_data)
# this precomputes the mean and std in rolling fashion

In [9]:
train_gen = imgGen.flow(cifar10_train_data,cifar10_train_label,batch_size=64)
# flow method outputs the actual generator object/function.
# train_gen.n gives the total number of samples, so surely steps = train_gen.n//train_gen.batch_size 
# (it's an infinite generator)

In [10]:
# we could also use imgGen.flow_from_directory: there must be a parent directory and
# a bunch of subdirectories whereby the name of each subdirectory indicate the class
# that images in this subdirectory belong.
# we can specify target_size (height, width); 
# color_mode = 'grayscale'/'rgb'/'rgba' indicating the color mode of the output images.
# we can use classes to specify the list of subdirectory names and their class names.
# Alternatively, it infers the class names based on subdirectory names directly.
# the mapping between class name and index is in class_index
# class_mode indicate output label modes (categorical/sparse/binary)

In [11]:
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
# similar to images, we have a time-series generator to generate batches of time-series data
# a time series data has two components: a data and a target
# the data is a sequence of samples through time (with a certain length)
# the target is what we want the model of this sequence of samples to map to
# for time-series prediction (predicting next step), 
# this may well be the next sample in the sequence

In [12]:
from tensorflow.data import Dataset,TextLineDataset

In [17]:
dataset1 = Dataset.from_tensor_slices([1,2,3])
print(dataset1.element_spec)
dataset2 = Dataset.from_tensor_slices(([[1,2],[2,3],[3,4]],[1,2,3]))
print(list(dataset2.as_numpy_iterator()))
dataset3 = Dataset.from_tensor_slices([((1,2),(2,3),(3,4)),((2,3),(2,3),(3,4))])
print(dataset3.element_spec)
# a tuple is interpreted very differently from a numpy or python list
# both numpy array and python lists are interpreted as a tensor.
# this method always slices the tensors in their first dimensions.
# if we put in a tuple of tensors, it slices each independently 
# and then zip them together in batches of tuples of tensors.
# The tensors provided must have equal number of first dimensions (the number of samples).
# In each tensor we slice (they are sliced independently), each element should have same shape.
# A dictoinary is interpreted similar to a tuple: it will slice each dictionary item separately
# and generate a dataset of dictionary of elements 
# (the resulting dataset elements is accessed via dictionary keys, not indices). 

TensorSpec(shape=(), dtype=tf.int32, name=None)
[(array([1, 2], dtype=int32), 1), (array([2, 3], dtype=int32), 2), (array([3, 4], dtype=int32), 3)]
TensorSpec(shape=(3, 2), dtype=tf.int32, name=None)


In [14]:
# instead of slicing a tuple of tensors, we can also slice them separately and then zip them in a tuple.
zippedData = Dataset.zip((dataset1,dataset2))
print(zippedData.element_spec)

(TensorSpec(shape=(), dtype=tf.int32, name=None), (TensorSpec(shape=(2,), dtype=tf.int32, name=None), TensorSpec(shape=(), dtype=tf.int32, name=None)))


In [15]:
dataFromTextFile = TextLineDataset(['textFiles/text1.txt','textFiles/text2.txt','textFiles/text3.txt'])
# it simply goes into each file and read each line as a separate sample where the element is the string.
list(dataFromTextFile.as_numpy_iterator())

[b'hello world',
 b'how are you',
 b'bingo juice',
 b'illegal aliens',
 b'build the wall',
 b'build the dome']

In [16]:
filePath = Dataset.from_tensor_slices(['textFiles/text1.txt','textFiles/text2.txt','textFiles/text3.txt'])
interleaved = filePath.interleave(lambda x : TextLineDataset(x),cycle_length=3)
# instead of processing each textfile sequentially if we just pass in a list of file paths to
# TextLineDataset, interleave processes concurrently.
# basically, interleave iterates over the samples of the dataset on which it's called. 
# then for each sample it calls the mappint function we provided and then proceed to the next.
# After cycle_length of samples are finished, it returns to the first one 
# (and in each iteration it goes through cycle_length number of samples)
print(interleaved.element_spec)
print(list(interleaved.as_numpy_iterator()))

TensorSpec(shape=(), dtype=tf.string, name=None)
[b'hello world', b'bingo juice', b'build the wall', b'how are you', b'illegal aliens', b'build the dome']


In [21]:
dataset4 = Dataset.from_tensors([1,2,3])
print(list(dataset4.as_numpy_iterator()))
# from_tensors always treats whatever passed in as a single sample (it doesn't slice it).

[array([1, 2, 3], dtype=int32)]


In [87]:
model3 = models.clone_model(model2)
model3.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['categorical_accuracy'])

In [121]:
mnist_train_dataset = Dataset.from_tensor_slices((mnist_train_data.astype('float32'),mnist_train_labels))
print(mnist_train_dataset.element_spec)

(TensorSpec(shape=(28, 28), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.uint8, name=None))


In [122]:
def map_to_categorical(image, label):
    return image/255.0, [(1 if i == label else 0) for i in range(10)]

In [123]:
mnist_train_dataset = mnist_train_dataset.filter(lambda image, label : label<5) 
# filter takes in a function which maps the dataset sample to true or false
# true: it will stay, false: it will be filtered out.
mnist_train_dataset = mnist_train_dataset.map(map_to_categorical)
# map function takes in an element/sample and spit out one (all tensors).
# so naturally, any eager operation on numpy is not gonna work 
# (we can't convert to numpy because it's not eagerly executed). 
# That is why we are not using utils.to_categorical above. 

In [124]:
mnist_val_dataset = mnist_train_dataset.skip(int(mnist_train_data.shape[0]*0.8*0.5))
mnist_train_dataset = mnist_train_dataset.take(int(mnist_train_data.shape[0]*0.8*0.5))
# keep in mind that dataset object is like a generator, 
# all the processing we define are not necessarily executed until it's actually called.

In [125]:
mnist_val_dataset = mnist_val_dataset.batch(64)
print(mnist_val_dataset.element_spec)
# if we don't drop_remainder the batch_size is not clearly defined, so it prints None
mnist_train_dataset = mnist_train_dataset.batch(64, drop_remainder=True)
# when drop_remainder is enabled, the remainders are dropped right here (you can't find it anymore)
print(mnist_train_dataset.element_spec)
mnist_train_dataset = mnist_train_dataset.shuffle(200)
# what it means is that in choosing each batch of 128, 
# it chooses it from a shuffled buffer of 1000 data samples.
# Again, shuffling occurs on the fly; we don't actually shuffle things here (reduce memory).

(TensorSpec(shape=(None, 28, 28), dtype=tf.float32, name=None), TensorSpec(shape=(None, 10), dtype=tf.int32, name=None))
(TensorSpec(shape=(64, 28, 28), dtype=tf.float32, name=None), TensorSpec(shape=(64, 10), dtype=tf.int32, name=None))


In [126]:
model3.fit(mnist_train_dataset,validation_data=mnist_val_dataset, epochs=5)
# if the shuffle buffer size is too large, there may be some overhead. 
# instead of specifying a epochs, we could also do mnist_train_dataset.repeat(50)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fe617918a90>

In [128]:
mnist_train_dataset = mnist_train_dataset.repeat(5)
# if the number of times it repeats is not specified, it repeats indefinitely

In [129]:
model3.fit(mnist_train_dataset, validation_data=mnist_val_dataset)



<tensorflow.python.keras.callbacks.History at 0x7fe63315eb90>

In [133]:
import tensorflow_datasets as tfds

In [134]:
kmnist_data = tfds.load('kmnist',split=None)
# tfds.list_builders() lists all available datasets
# split=None means the train and test data will be in separated in a dictionary
# of course the method loads in tensorflow dataset directly
kmnist_train_dataset = kmnist_data['train']
kmnist_test_dataset = kmnist_data['test']

0:00<?, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:02,  1.05 url/s]
Dl Size...:   0%|          | 0/19 [00:00<?, ? MiB/s][A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:02,  1.05 url/s]
Dl Size...:   0%|          | 0/19 [00:00<?, ? MiB/s][A

Extraction completed...:   0%|          | 0/1 [00:00<?, ? file/s][A[A

Dl Completed...:  25%|██▌       | 1/4 [00:00<00:02,  1.05 url/s]
Dl Size...:   0%|          | 0/19 [00:00<?, ? MiB/s][A

Dl Completed...:  50%|█████     | 2/4 [00:01<00:01,  1.42 url/s]
Dl Size...:   0%|          | 0/19 [00:01<?, ? MiB/s][A

Dl Completed...:  50%|█████     | 2/4 [00:01<00:01,  1.42 url/s]
Dl Size...:   0%|          | 0/19 [00:01<?, ? MiB/s][A

Extraction completed...:  50%|█████     | 1/2 [00:01<00:00,  1.01 file/s][A[A

Dl Completed...:  50%|█████     | 2/4 [00:01<00:01,  1.42 url/s]
Dl Size...:   0%|          | 0/19 [00:01<?, ? MiB/s][A

Extraction completed...: 100%|██████████| 2/2 [00:01<00:00,  1.38 file/s][A[A
Dl Completed.