## Imports

In [8]:
import numpy as np
import pandas as pd

import string
from bs4 import BeautifulSoup
import re 
import seaborn as sns

from tqdm.notebook import tqdm
%load_ext autoreload
%autoreload 2

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import multiprocessing


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
print("Num GPUs Available: ", (tf.config.list_physical_devices('GPU')))

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [10]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [11]:
from transformers import AutoTokenizer

In [12]:
from nltk.corpus import stopwords

## Load DataSet

In [13]:
df = pd.read_csv("../../data/sample_10k_data_for_visualization.csv")
df

Unnamed: 0,abstract,article,section_names
0,b' we use the integral - field spectrograph sa...,b'according to the current galaxy formation pa...,b'introduction\nsummary and conclusion\nacknow...
1,"b"" we report on an effort to study the connect...","b""the interaction between high power plasma je...",b'introduction\ncomputational methods\nmodel d...
2,b' we consider the scenario of a magnetic fiel...,b'the interstellar medium ( ism ) exhibits str...,b'introduction\nfront structure\nphase transit...
3,b' we present a model - independent analysis o...,b'accreting stellar - mass black holes in bina...,b'introduction\nobservations and timing analys...
4,b' we evaluate the feasibility of the implemen...,b'one of the most prominent applications in qu...,b'introduction\nquantum repeater protocol opti...
...,...,...,...
9995,b' clump clusters and chain galaxies in the hu...,"b""galaxies observed with the advanced camera f...",b'introduction\ndata on udf bulges and clumps\...
9996,b' recent advances in early detection and deta...,"b""within the fireball model for gamma - ray bu...",b'introduction\nthe light curve and polarizati...
9997,b' the @xmath0 and @xmath1 meson production in...,b'the alice experiment @xcite scientific progr...,b'introduction\nresults\nconclusions\nreferences'
9998,b' recently there have been experimental resul...,b'the wave nature of light which explains the ...,b'introduction\nbabinet principle: poisson spo...


## Cleaning

In [14]:
df.shape

(10000, 3)

In [15]:
stop_words = set(stopwords.words('english')) 
punc = string.punctuation

df["article"] = df["article"].apply(eval)
df["abstract"] = df["abstract"].apply(eval)

In [16]:
def text_cleaner(text):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text
    tokens = [w for w in newString.split() if not w in stop_words]
    tokens = [char for char in tokens if not char in punc]
    
    return (" ".join(tokens)).strip()

In [17]:
text_cleaner("dark matter haloes e.g. and but happy dog and cat, <h1>")

'dark matter haloes e.g. happy dog cat,'

In [18]:
df["abstract_clean"] = df["abstract"].apply(text_cleaner)

  newString = BeautifulSoup(newString, "lxml").text


In [19]:
df["article_clean"] = df["article"].apply(text_cleaner)

  newString = BeautifulSoup(newString, "lxml").text


## Define Tokenizer

In [20]:
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

max_article_length = 2048
max_abstract_length = 257

In [21]:
def preprocess_function(examples):
    examples = examples[1]
    model_inputs = tokenizer(examples["article_clean"], max_length=max_article_length, 
                             padding= "max_length", truncation=True, return_attention_mask= False, return_token_type_ids= False,
                             return_tensors= "tf")
    
    labels = tokenizer(text_target=examples["abstract_clean"], max_length=max_abstract_length, 
                       padding= "max_length", truncation=True, return_tensors="tf")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Test tokenizer

In [22]:
tokenized_articles = [preprocess_function(row) for row in tqdm(df.iterrows())]

# Multiprocess this


# pool = multiprocessing.Pool()
# result = list(tqdm(pool.map(preprocess_function, df.loc[:200].iterrows()),total_length = df.shape[0]))
# pool.close()
# pool.join()

0it [00:00, ?it/s]

In [23]:
tokenized_articles[1]["input_ids"].numpy().reshape(max_article_length,)

array([    0,  8007, 10845, ..., 15864,   216,     2], dtype=int32)

In [24]:
tokenized_articles[0]["labels"].shape[1]

257

### Make batches

In [28]:
BATCH_SIZE = 2
SHUFFLE_BUFFER_SIZE = 100

In [29]:
articles_tensors = np.array([data['input_ids'].numpy().reshape(max_article_length,) for data in tokenized_articles])
abstract_tensors = np.array([data['labels'].numpy().reshape(max_abstract_length,) for data in tokenized_articles])

In [30]:
# decoder_input

In [None]:
# decoder_output

## Save numpy arrays

In [31]:
with open("../../data/articles_numpy_short.npy","wb") as f:
    np.save(f, articles_tensors)

In [32]:
with open("../../data/abstract_numpy_short.npy","wb") as f:
    np.save(f, abstract_tensors)

## Load numpy arrays

In [33]:
with open("../../data/articles_numpy_short.npy", "rb") as f:
    articles_tensors = np.load(f)

In [34]:
with open("../../data/abstract_numpy_short.npy", "rb") as f:
    abstract_tensors = np.load(f)

In [35]:
decoder_input = abstract_tensors[:,:-1]
decoder_output = abstract_tensors[:, 1:]

In [36]:
dataset = tf.data.Dataset.from_tensor_slices(((articles_tensors, decoder_input),decoder_output))
dataset = dataset.shuffle(SHUFFLE_BUFFER_SIZE)

In [37]:
dataset

<ShuffleDataset element_spec=((TensorSpec(shape=(2048,), dtype=tf.int32, name=None), TensorSpec(shape=(256,), dtype=tf.int32, name=None)), TensorSpec(shape=(256,), dtype=tf.int32, name=None))>

In [38]:
test_dataset = dataset.take(1000) 
train_dataset = dataset.skip(1000)

In [39]:
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [40]:
tokenizer

LEDTokenizerFast(name_or_path='allenai/led-base-16384', vocab_size=50265, model_max_length=16384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

## Modeling

In [41]:
from keras import backend as K
K.clear_session() 

latent_dim= 32
vocab_size = tokenizer.vocab_size - 30000

#### Encoder

In [42]:
inputs = tf.keras.layers.Input(shape=(max_article_length,))
input_embedding = tf.keras.layers.Embedding(vocab_size, latent_dim, trainable=True)(inputs) 

encoder_lstm1 = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) 
# encoder_output , state_h, state_c = encoder_lstm1(input_embedding) 

encoder_output1 , state_h1, state_c1 = encoder_lstm1(input_embedding) 

encoder_lstm2 = tf.keras.layers.LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) 

encoder_lstm3= tf.keras.layers.LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2)


#### Decoder

In [43]:
decoder_inputs = tf.keras.layers.Input(shape=(None,)) 
dec_emb_layer = tf.keras.layers.Embedding(vocab_size, latent_dim,trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs) 

#LSTM using encoder_states as initial state
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c]) 

decoder_dense = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

In [44]:
model = tf.keras.Model([inputs, decoder_inputs], decoder_outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 2048)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 2048, 32)     648480      ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 2048, 32),   8320        ['embedding[0][0]']              
                                 (None, 32),                                                      
                                 (None, 32)]                                                      
                                                                                              

In [45]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [46]:
for (i,j), k in train_dataset.take(1):
    print(i)
    print(j)
    print(k)

tf.Tensor(
[[    0 38696   107 ...   645 40206     2]
 [    0   571  8645 ...  2301  6364     2]], shape=(2, 2048), dtype=int32)
tf.Tensor(
[[    0 34996  1683  9624  1825   251  1186 23140  1743   634 16022  5638
  44650    16   154  1421   787  1178 40051   288    12  9773 11324  9624
   5447   403   787  1178 40051   134  5167  4359 41071  1122  1433  8069
   3092   924   684  1575 34983 11221  4086 45615  5177 47696  3569  1177
  26660  1634  3291  1246  2430  2167  2859  6787   976  4359 41071   787
   1178 40051   176   959 45615  4359 41071  2198   430  5177 47696  3569
     65  5181  4784  3094  5177 47696  3569 17369 21085  4634 16022  5638
  44650 40206 44650 17369  8254   976  9164  4359 41071  4777  5181   882
   2707 42694  1402   923  6731   882  2707   233 16022  5638 44650  4359
   2430  2167  2859   739  3266  6731   882  2707  3335 31858 24414  3187
   4359  3868 11493   787  1178 40051   176 41353   787  1178 40051   176
  19220  3056  1663 16934  4292  4737  3891  5

In [47]:
# train_ds, val_ds = tf.keras.utils.split_dataset(
#     train_dataset, left_size=0.9, right_size=0.1, shuffle=True, seed=42
# )

In [48]:
# for i, j in train_ds.take(1):
#     print(i.numpy().shape)
#     print(j.numpy().shape)

In [1]:
history = model.fit(train_dataset, epochs=50, callbacks=[es])

NameError: name 'model' is not defined

In [46]:
sub_train = [i,j]

In [43]:
for i, j in train_ds.take(1).as_numpy_iterator():
    print(i)
    print(j)

[[    0 25421 34934 ...     1     1     1]
 [    0 42853   910 ...     1     1     1]
 [    0 25870   803 ...     1     1     1]
 ...
 [    0  7822 16546 ...     1     1     1]
 [    0 35248 21861 ...     1     1     1]
 [    0  5079  6533 ...     1     1     1]]
[[    0 23375  2787 ...     1     1     1]
 [    0  3530    12 ...     1     1     1]
 [    0    29 43089 ...     1     1     1]
 ...
 [    0 41045 29515 ...     1     1     1]
 [    0 12406 24343 ...     1     1     1]
 [    0 38375 11747 ...  2397 15180     2]]


In [38]:
i.shape, j.shape

((32, 8192), (32, 512))

In [41]:
tokenizer.convert_ids_to_tokens(i[0]), 
tokenizej[0]

(['<s>',
  'capacity',
  'Ġcharacterization',
  'Ġgeneral',
  'Ġwireless',
  'Ġrelay',
  'Ġnetworks',
  'Ġfundamental',
  'Ġproblem',
  'Ġnetwork',
  'Ġinformation',
  'Ġtheory',
  'Ġhowever',
  'Ġcapacity',
  'Ġfully',
  'Ġcharacterized',
  'Ġeven',
  'Ġsimplest',
  'Ġnetwork',
  'Ġconsisting',
  'Ġsingle',
  'Ġsource',
  'Ġsingle',
  'Ġrelay',
  'Ġsingle',
  'Ġdestination',
  'Ġ@',
  'xc',
  'ite',
  'Ġwireless',
  'Ġenvironments',
  'Ġtransmit',
  'Ġsignal',
  'Ġheard',
  'Ġmultiple',
  'Ġnodes',
  'Ġcall',
  'Ġbroadcast',
  'Ġnature',
  'Ġwireless',
  'Ġcommunication',
  'Ġreceiver',
  'Ġreceive',
  'Ġsuper',
  'position',
  'Ġsimultaneously',
  'Ġtransmitted',
  'Ġsignals',
  'Ġmultiple',
  'Ġnodes',
  'Ġcall',
  'Ġinterference',
  'Ġnature',
  'Ġwireless',
  'Ġcommunication',
  'Ġfurthermore',
  'Ġwireless',
  'Ġchannels',
  'Ġmay',
  'Ġtime',
  'Ġvarying',
  'Ġdue',
  'Ġfading',
  'Ġnoise',
  'Ġreceiver',
  'Ġconsidering',
  'Ġmakes',
  'Ġproblem',
  'Ġvary',
  'Ġhard',
  'Ġhenc

In [87]:
history = model.fit([i,j], j, epochs=2, batch_size=BATCH_SIZE, callbacks=[es])

Epoch 1/2


InvalidArgumentError: Graph execution error:

Detected at node 'gradient_tape/sparse_categorical_crossentropy/clip_by_value/zeros_like' defined at (most recent call last):
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/runpy.py", line 197, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/runpy.py", line 87, in _run_code
      exec(code, run_globals)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 725, in start
      self.io_loop.start()
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/asyncio/base_events.py", line 601, in run_forever
      self._run_once()
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/asyncio/base_events.py", line 1905, in _run_once
      handle._run()
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 409, in dispatch_shell
      await result
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/ipykernel/zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_58/2194197506.py", line 1, in <module>
      history = model.fit([i,j], j, epochs=2, batch_size=BATCH_SIZE, callbacks=[es])
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1650, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function
      return step_function(self, iterator)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step
      outputs = model.train_step(data)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1027, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 526, in minimize
      grads_and_vars = self.compute_gradients(loss, var_list, tape)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/optimizers/optimizer_experimental/optimizer.py", line 259, in compute_gradients
      grads = tape.gradient(loss, var_list)
Node: 'gradient_tape/sparse_categorical_crossentropy/clip_by_value/zeros_like'
Cannot parse tensor from proto: dtype: DT_FLOAT
tensor_shape {
  dim {
    size: 32
  }
  dim {
    size: 512
  }
  dim {
    size: 50265
  }
}

	 [[{{node gradient_tape/sparse_categorical_crossentropy/clip_by_value/zeros_like}}]] [Op:__inference_train_function_100578]