<a href="https://colab.research.google.com/github/martinpius/TensorFlow_for_Nested_Variables_List/blob/main/Tensorflow_for_unstructed_and_nested_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive
try:
  drive.mount("/content/drive", force_remount = True)
  COLAB = True
  import tensorflow as tf
  print(f"You are using google colab with tensorflow version: {tf.__version__}")
except Exception as e:
  COLAB = False
  print(f"{type(e)}: {e}\n....Load your drive....")

def time_fmt(t):
  h = int(t/ (60 * 60))
  m = int(t % (60 * 60)/ 60)
  s = int(t % 60)
  return f"{h}:{m:>03}:{s:>05.2f}"

Mounted at /content/drive
You are using google colab with tensorflow version: 2.3.0


In [3]:
import tensorflow as tf
import numpy as np
import time
from datetime import datetime


In [4]:
#Real life applications uses a lot of complex data structures which comes in terms of
#unstructed data such as audio (sound waves), videos strams, texts and many more
#These data can be manipulated directly with tensorflow without changing the dimension or padding.
#This is handled by using ragged tensors


In [5]:
#Consider the follwing examples
v1 = tf.ragged.constant([[1,2,3],[3,4],[8],[],[9,10,12,44]], dtype= tf.float32)
v2 =tf.ragged.constant([['Ragged tensor'],['can handle various'],['data'],['structure and use similar ops to regular tensors']])

In [6]:
display(v1)

<tf.RaggedTensor [[1.0, 2.0, 3.0], [3.0, 4.0], [8.0], [], [9.0, 10.0, 12.0, 44.0]]>

In [7]:
display(v2)

<tf.RaggedTensor [[b'Ragged tensor'], [b'can handle various'], [b'data'], [b'structure and use similar ops to regular tensors']]>

In [8]:
#We can work with common tensorflow operations on ragged tensors
print(tf.concat([v1, [[0.]]], axis = 0))#Concatenation of tensors

<tf.RaggedTensor [[1.0, 2.0, 3.0], [3.0, 4.0], [8.0], [], [9.0, 10.0, 12.0, 44.0], [0.0]]>


In [9]:
print(tf.reduce_mean(v1, axis = 1))

tf.Tensor([ 2.    3.5   8.     nan 18.75], shape=(5,), dtype=float32)


In [10]:
print(tf.reduce_sum(v1, axis = 0))

tf.Tensor([21. 16. 15. 44.], shape=(4,), dtype=float32)


In [11]:
print(tf.strings.substr(v2, 0,2))

<tf.RaggedTensor [[b'Ra'], [b'ca'], [b'da'], [b'st']]>


In [12]:
display(tf.map_fn(tf.nn.softmax, v1))

<tf.RaggedTensor [[0.09003057330846786, 0.2447284758090973, 0.6652409434318542], [0.2689414322376251, 0.7310585975646973], [1.0], [], [6.305116853596367e-16, 1.7139083824518615e-15, 1.2664165777252073e-14, 1.0]]>

In [13]:
display(v1[:,:-2])

<tf.RaggedTensor [[1.0], [], [], [], [9.0, 10.0]]>

In [14]:
display(v1[:,:2])

<tf.RaggedTensor [[1.0, 2.0], [3.0, 4.0], [8.0], [], [9.0, 10.0]]>

In [15]:
display(v1 + 10)#Broadcasting in raggged tensor dont add affect the empty slot

<tf.RaggedTensor [[11.0, 12.0, 13.0], [13.0, 14.0], [18.0], [], [19.0, 20.0, 22.0, 54.0]]>

In [16]:
#`elementwise transformation can also achieved by applying a function to a map_flat_values method as follow
tf.ragged.map_flat_values(lambda x: x**2-1, v1)

<tf.RaggedTensor [[0.0, 3.0, 8.0], [8.0, 15.0], [63.0], [], [80.0, 99.0, 143.0, 1935.0]]>

In [17]:
tf.map_fn(lambda x: x**2-1, v1) #We can also use the map function directly to get the same results

<tf.RaggedTensor [[0.0, 3.0, 8.0], [8.0, 15.0], [63.0], [], [80.0, 99.0, 143.0, 1935.0]]>

In [18]:
#Ragged tensor and keras API.
#We can declare our inputs as into keras model as a ragged tensor.


In [19]:
#Consider the following example of predicting list of pure fruits

In [20]:
fruits_bag = ['apple orange lemon',
              'rice chicken',
              'banana soup pineapple pork'
              ,'cloth water berries',
              'orange','jackfruit mango tomato']

In [21]:
fruits = tf.constant(fruits_bag)

In [22]:
label = tf.constant([True, False, False,False, True, True])

In [23]:
#Create the suitable data format for training a simple LSTM
#No padding is required since we applie a ragged tensor
corpous_size = 1024
words = tf.strings.split(fruits, ' ')

In [24]:
indices = tf.strings.to_hash_bucket_fast(words, corpous_size) #Getting the vocubulary indices for each word in a corpous

In [25]:
indices

<tf.RaggedTensor [[793, 253, 226], [127, 617], [36, 491, 194, 862], [256, 521, 53], [253], [706, 346, 693]]>

In [26]:
#We can create a now build a simple LSTM model with an embedding layer to classify if the list is pure fruit or not

In [27]:
mymodel = tf.keras.Sequential()

In [28]:
mymodel.add(tf.keras.layers.Input(shape = [None], dtype= tf.float64,ragged=True))
mymodel.add(tf.keras.layers.Embedding(input_dim=corpous_size, output_dim= 16))
mymodel.add(tf.keras.layers.LSTM(units = 32, recurrent_dropout= 0.2, dropout=0.2, use_bias= False))
mymodel.add(tf.keras.layers.Dense(units = 32))
mymodel.add(tf.keras.layers.Activation(tf.nn.relu))
mymodel.add(tf.keras.layers.Dense(units = 1))


In [29]:
mymodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          16384     
_________________________________________________________________
lstm (LSTM)                  (None, 32)                6144      
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
activation (Activation)      (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 23,617
Trainable params: 23,617
Non-trainable params: 0
_________________________________________________________________


In [30]:
mymodel.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics= ['accuracy'])

In [31]:
mymodel.fit(x = indices, y = label, epochs = 5, verbose= 2)

Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


1/1 - 0s - loss: 7.7138 - accuracy: 0.5000
Epoch 2/5
1/1 - 0s - loss: 7.7125 - accuracy: 0.5000
Epoch 3/5
1/1 - 0s - loss: 7.7125 - accuracy: 0.5000
Epoch 4/5
1/1 - 0s - loss: 7.7125 - accuracy: 0.5000
Epoch 5/5
1/1 - 0s - loss: 7.7125 - accuracy: 0.5000


<tensorflow.python.keras.callbacks.History at 0x7f2426e39470>

In [32]:
mymodel.predict(indices)

array([[-0.01687861],
       [-0.01295061],
       [-0.01507597],
       [-0.01110675],
       [-0.01405909],
       [-0.01228353]], dtype=float32)

In [33]:
#Tensorflow.example (for variables - length features) with serialized strings using google prototype buffer encoding
##Data encoded with tf.example
#Consider a list of tf data encoded as tf.example
import google.protobuf.text_format as gptf

In [34]:
def make_tf_Edata(f):
  dfm = gptf.Merge(f, tf.train.Example()).SerializeToString()# To return the merged data as a series of strings
  return dfm

In [35]:
#Construct the list of feature's specification dictionary to use it in parsing to generate dictionary mapping feature's name to tensors
data = [
        make_tf_Edata(r'''
        features{
          feature {key: 'shape' value {bytes_list {value: ['oval','triangle','iregular']}}}
          feature {key: 'size' value {int64_list {value:[23,18]}}}
        }'''),
       
        make_tf_Edata(r'''
        features{
          feature{key: 'shape' value {bytes_list {value:['rectangle']}}}
          feature{key: 'zize' value {int64_list {value: []}}}}'''),
        
        make_tf_Edata(r'''
        features {
          feature { key: 'shape' value {bytes_list {value: ['oval','square','spherical']}}}
          feature {key: 'size' value {int64_list {value:[16,10,14]}}}}''')
]

In [36]:
#Parse the serilized data using tf.io.parse_example
#Note that our features length are not uniform, so we apply ragged tensor to be specific
dict_ragged = {
    'shape': tf.io.RaggedFeature(tf.string),
    'size': tf.io.RaggedFeature(tf.int64)
}

In [37]:
#Parsing 
mytensors_features = tf.io.parse_example(data, dict_ragged)
for name, value in mytensors_features.items():
  print("{}={}".format(name, value))

shape=<tf.RaggedTensor [[b'oval', b'triangle', b'iregular'], [b'rectangle'], [b'oval', b'square', b'spherical']]>
size=<tf.RaggedTensor [[23, 18], [], [16, 10, 14]]>


In [38]:
#Building complex datasets (ragged tf-data) using tf.data

In [39]:
#A fn to print a dictionary dataset in a nice format
def fn_dict_reader(dfm):
  for i, element in enumerate(dfm):
    print(f"Element: {i}")
    for (j, k) in element.items():
      print(f"{j:>14}, {k}")

In [40]:
#Build tf.dataset from the above defined ragged tensor dictionary
dfm_ragged = tf.data.Dataset.from_tensor_slices(mytensors_features)

In [41]:
#Read the data using the above defined reader

In [42]:
fn_dict_reader(dfm_ragged)

Element: 0
         shape, [b'oval' b'triangle' b'iregular']
          size, [23 18]
Element: 1
         shape, [b'rectangle']
          size, []
Element: 2
         shape, [b'oval' b'square' b'spherical']
          size, [16 10 14]


In [43]:
#We can also combine some elements into batches and create a batched ragged tf.data

In [44]:
batch_dfm_raged = dfm_ragged.batch(2) #Will deliver only 2 groups 

In [45]:
#Read using the defined dict_reader above
fn_dict_reader(batch_dfm_raged)

Element: 0
         shape, <tf.RaggedTensor [[b'oval', b'triangle', b'iregular'], [b'rectangle']]>
          size, <tf.RaggedTensor [[23, 18], []]>
Element: 1
         shape, <tf.RaggedTensor [[b'oval', b'square', b'spherical']]>
          size, <tf.RaggedTensor [[16, 10, 14]]>


In [46]:
#Unbatch will flatten the data back to its original shape
dfm_flatten = batch_dfm_raged.unbatch()


In [47]:
#Read using the above defined dictionary reader
fn_dict_reader(dfm_flatten)

Element: 0
         shape, [b'oval' b'triangle' b'iregular']
          size, [23 18]
Element: 1
         shape, [b'rectangle']
          size, []
Element: 2
         shape, [b'oval' b'square' b'spherical']
          size, [16 10 14]


In [48]:
#Using tf.function() to decorate ragged tensors:
#Consider the following example
@tf.function
def fn_comp(a,axis):
  res = tf.concat([a, tf.reverse(a,[axis])], axis)
  return res

In [49]:
#Calling the fn
con_tensor = fn_comp(tf.constant([[1,2,3],[4,5,6],[4,2,3]],dtype=tf.float32),axis = 1)

In [50]:
display(con_tensor)

<tf.Tensor: shape=(3, 6), dtype=float32, numpy=
array([[1., 2., 3., 3., 2., 1.],
       [4., 5., 6., 6., 5., 4.],
       [4., 2., 3., 3., 2., 4.]], dtype=float32)>

In [51]:
fn_comp(tf.ragged.constant([[1],[2,3,4],[1,2,3,4,5,6],[3,4,]], dtype = tf.int32), axis = 1)

<tf.RaggedTensor [[1, 1], [2, 3, 4, 4, 3, 2], [1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2, 1], [3, 4, 4, 3]]>

In [59]:
#High dimension ragged tensors
t3d_ragged = tf.ragged.constant([[[1,2]],[[2,3,4,5]],[[1]],[[2,1,3,4,5,6]],[[3,5,6,7]]])

In [60]:
display(t3d_ragged)

<tf.RaggedTensor [[[1, 2]], [[2, 3, 4, 5]], [[1]], [[2, 1, 3, 4, 5, 6]], [[3, 5, 6, 7]]]>

In [63]:
display(t3d_ragged[1]) # second element in each row

<tf.RaggedTensor [[2, 3, 4, 5]]>

In [65]:
display(t3d_ragged[:,:1])

<tf.RaggedTensor [[[1, 2]], [[2, 3, 4, 5]], [[1]], [[2, 1, 3, 4, 5, 6]], [[3, 5, 6, 7]]]>

In [71]:
display(tf.reduce_max(tf.map_fn(lambda x: x**3-1,t3d_ragged)))

<tf.Tensor: shape=(), dtype=int32, numpy=342>

In [69]:
tf.reduce_max(tf.constant([[2,2,3,4,1],[3,5,7,4,3]]))

<tf.Tensor: shape=(), dtype=int32, numpy=7>

In [70]:
tf.map_fn(lambda x: x**3, t3d_ragged)

<tf.RaggedTensor [[[1, 8]], [[8, 27, 64, 125]], [[1]], [[8, 1, 27, 64, 125, 216]], [[27, 125, 216, 343]]]>

In [102]:
#Conversion between ragged tensors and regular tensors vs sparse tensors.
#We can easily do conversion between the thensors type like as follow
my_ragged = tf.ragged.constant([[[1,2,3]],[[3]],[[3,4,5,2,3]],[[5,4]],[[3,2,4,5,6,7,1]]])

In [103]:
tf_regular = my_ragged.to_tensor(default_value = 0, shape = (5,1,10))

In [104]:
tf_regular

<tf.Tensor: shape=(5, 1, 10), dtype=int32, numpy=
array([[[1, 2, 3, 0, 0, 0, 0, 0, 0, 0]],

       [[3, 0, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[3, 4, 5, 2, 3, 0, 0, 0, 0, 0]],

       [[5, 4, 0, 0, 0, 0, 0, 0, 0, 0]],

       [[3, 2, 4, 5, 6, 7, 1, 0, 0, 0]]], dtype=int32)>

In [107]:
#To get back to the ragged tensor we can write:
try:
  tf.RaggedTensor.from_tensor(tf_regular, padding = tf.zeros(shape = tf_regular.shape[-1]))
except Exception as e:
  print(f"{type(e)}: {e}")

<class 'ValueError'>: Tensor conversion requested dtype int32 for Tensor with dtype float32: <tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>


In [99]:
d = tf.constant([[1,2,3,3,3,3],[1,4,9,3,3,3],[1,3,3,3,3,3],[2,6,7,8,9,3]])

In [100]:
rag_bac = tf.RaggedTensor.from_tensor(d, padding = 3)

In [101]:
rag_bac

<tf.RaggedTensor [[1, 2], [1, 4, 9], [1], [2, 6, 7, 8, 9]]>

In [109]:
print(rag_bac.to_sparse() #Get the sparse version

SparseTensor(indices=tf.Tensor(
[[0 0]
 [0 1]
 [1 0]
 [1 1]
 [1 2]
 [2 0]
 [3 0]
 [3 1]
 [3 2]
 [3 3]
 [3 4]], shape=(11, 2), dtype=int64), values=tf.Tensor([1 2 1 4 9 1 2 6 7 8 9], shape=(11,), dtype=int32), dense_shape=tf.Tensor([4 5], shape=(2,), dtype=int64))


In [110]:
print(rag_bac.to_list())

[[1, 2], [1, 4, 9], [1], [2, 6, 7, 8, 9]]
