## Imports

In [1]:
import numpy as np
import pandas as pd

import string
from bs4 import BeautifulSoup
import re 
import seaborn as sns

from tqdm.notebook import tqdm
%load_ext autoreload
%autoreload 2

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
import tensorflow as tf
import multiprocessing


In [2]:
print("Num GPUs Available: ", (tf.config.list_physical_devices('GPU')))

Num GPUs Available:  [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [4]:
from transformers import AutoTokenizer

In [5]:
from nltk.corpus import stopwords

## Load DataSet

In [6]:
df = pd.read_csv("../../data/sample_10k_data_for_visualization.csv")
df

Unnamed: 0,abstract,article,section_names
0,b' we use the integral - field spectrograph sa...,b'according to the current galaxy formation pa...,b'introduction\nsummary and conclusion\nacknow...
1,"b"" we report on an effort to study the connect...","b""the interaction between high power plasma je...",b'introduction\ncomputational methods\nmodel d...
2,b' we consider the scenario of a magnetic fiel...,b'the interstellar medium ( ism ) exhibits str...,b'introduction\nfront structure\nphase transit...
3,b' we present a model - independent analysis o...,b'accreting stellar - mass black holes in bina...,b'introduction\nobservations and timing analys...
4,b' we evaluate the feasibility of the implemen...,b'one of the most prominent applications in qu...,b'introduction\nquantum repeater protocol opti...
...,...,...,...
9995,b' clump clusters and chain galaxies in the hu...,"b""galaxies observed with the advanced camera f...",b'introduction\ndata on udf bulges and clumps\...
9996,b' recent advances in early detection and deta...,"b""within the fireball model for gamma - ray bu...",b'introduction\nthe light curve and polarizati...
9997,b' the @xmath0 and @xmath1 meson production in...,b'the alice experiment @xcite scientific progr...,b'introduction\nresults\nconclusions\nreferences'
9998,b' recently there have been experimental resul...,b'the wave nature of light which explains the ...,b'introduction\nbabinet principle: poisson spo...


## Cleaning

In [7]:
df.shape

(10000, 3)

In [8]:
stop_words = set(stopwords.words('english')) 
punc = string.punctuation

df["article"] = df["article"].apply(eval)
df["abstract"] = df["abstract"].apply(eval)

In [9]:
def text_cleaner(text):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text
    tokens = [w for w in newString.split() if not w in stop_words]
    tokens = [char for char in tokens if not char in punc]
    
    return (" ".join(tokens)).strip()

In [10]:
text_cleaner("dark matter haloes e.g. and but happy dog and cat, <h1>")

'dark matter haloes e.g. happy dog cat,'

In [11]:
df["abstract_clean"] = df["abstract"].apply(text_cleaner)

  newString = BeautifulSoup(newString, "lxml").text


In [12]:
df["article_clean"] = df["article"].apply(text_cleaner)

  newString = BeautifulSoup(newString, "lxml").text


## Define Tokenizer

In [13]:
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

max_article_length = 8192
max_abstract_length = 512

In [14]:
def preprocess_function(examples):
    examples = examples[1]
    model_inputs = tokenizer(examples["article_clean"], max_length=max_article_length, 
                             padding= "max_length", truncation=True, return_attention_mask= False, return_token_type_ids= False,
                             return_tensors= "tf")
    
    labels = tokenizer(text_target=examples["abstract_clean"], max_length=max_abstract_length, 
                       padding= "max_length", truncation=True, return_tensors="tf")
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Test tokenizer

In [15]:
tokenized_articles = [preprocess_function(row) for row in tqdm(df.iterrows())]

# Multiprocess this


# pool = multiprocessing.Pool()
# result = list(tqdm(pool.map(preprocess_function, df.loc[:200].iterrows()),total_length = df.shape[0]))
# pool.close()
# pool.join()

0it [00:00, ?it/s]

In [16]:
tokenized_articles[1]["input_ids"].numpy().reshape(8192,)

array([    0,  8007, 10845, ...,     1,     1,     1], dtype=int32)

In [17]:
tokenized_articles[0]["labels"].shape[1]

512

### Make batches

In [18]:
BATCH_SIZE = 32
SHUFFLE_BUFFER_SIZE = 100

In [19]:
articles_tensors = np.array([data['input_ids'].numpy().reshape(8192,) for data in tokenized_articles])
abstract_tensors = np.array([data['labels'].numpy().reshape(512,) for data in tokenized_articles])

In [20]:
dataset = tf.data.Dataset.from_tensor_slices((articles_tensors, abstract_tensors))
dataset = dataset.shuffle(SHUFFLE_BUFFER_SIZE)

In [21]:
dataset

<ShuffleDataset element_spec=(TensorSpec(shape=(8192,), dtype=tf.int32, name=None), TensorSpec(shape=(512,), dtype=tf.int32, name=None))>

In [22]:
for i,j in dataset.take(1):
    print(i.numpy())
    print(j.numpy())

[    0  8007 10845 ...     1     1     1]
[    0  7415  1351   892  7070 11043 27361  3188 22703 29051  7964  3611
   786 12968 10417 35235  9883  2584  7964  2942    92 37920  3552 35235
  4240  2386   203  4271   356   936   678   941 11043  1950   130 43654
  7964  3489 16529 29472  3863  4900  2342  3760 26435 33842  4900 15168
 30970  6292 19157 25771  4761 17156   747  7677  2621   670  4900 17829
  4817  9159  1455  2297 15800  3092 49015 10632  1386   203  2632 45518
  4817  3748 12801  4620   583   253  4900  3315   444   540 16887  6184
 33100 19308 11401 17512  5447  7964  2200 46132   196   157 20231 11416
 39216  2368 33073  3315 35668  2632  6184 17796   611 12179  2839 10662
 34265  1879  5580 35235  9883   817  1202   304  3611 35235  8576 40182
   400   731 10662     2     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1     1     1     1     1     1     1     1     1     1     1     1
     1   

In [23]:
test_dataset = dataset.take(1000) 
train_dataset = dataset.skip(1000)

In [24]:
train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

In [25]:
tokenizer

LEDTokenizerFast(name_or_path='allenai/led-base-16384', vocab_size=50265, model_max_length=16384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)})

## Modeling

In [26]:
from keras import backend as K
K.clear_session() 

latent_dim= 512
vocab_size = tokenizer.vocab_size

#### Encoder

In [27]:
inputs = tf.keras.layers.Input(shape=(max_article_length,))
input_embedding = tf.keras.layers.Embedding(vocab_size, latent_dim, trainable=True)(inputs) 

encoder_lstm1 = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) 
encoder_output1 , state_h1, state_c1 = encoder_lstm1(input_embedding) 

encoder_lstm2 = tf.keras.layers.LSTM(latent_dim,return_sequences=True,return_state=True) 
encoder_output2, state_h2, state_c2 = encoder_lstm2(encoder_output1) 

#LSTM 3 
encoder_lstm3= tf.keras.layers.LSTM(latent_dim, return_state=True, return_sequences=True) 
encoder_outputs, state_h, state_c= encoder_lstm3(encoder_output2) 

#### Decoder

In [28]:
decoder_inputs = tf.keras.layers.Input(shape=(None,)) 
dec_emb_layer = tf.keras.layers.Embedding(vocab_size, latent_dim,trainable=True) 
dec_emb = dec_emb_layer(decoder_inputs) 

#LSTM using encoder_states as initial state
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) 
decoder_outputs,decoder_fwd_state, decoder_back_state = decoder_lstm(dec_emb, initial_state=[state_h, state_c]) 

decoder_dense = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(decoder_outputs)

In [29]:
model = tf.keras.Model([inputs, decoder_inputs], decoder_outputs)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 8192)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 8192, 512)    25735680    ['input_1[0][0]']                
                                                                                                  
 lstm (LSTM)                    [(None, 8192, 512),  2099200     ['embedding[0][0]']              
                                 (None, 512),                                                     
                                 (None, 512)]                                                     
                                                                                              

In [30]:
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2)

In [31]:
train_ds, val_ds = tf.keras.utils.split_dataset(
    train_dataset, left_size=0.9, right_size=0.1, shuffle=True, seed=42
)

In [32]:
for i, j in train_ds.take(1):
    print(i.numpy().shape)
    print(j.numpy().shape)

(32, 8192)
(32, 512)


In [33]:
history = model.fit(train_ds, epochs=50, callbacks=[es], validation_data=val_ds)

Epoch 1/50


ValueError: in user code:

    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/rootroot/miniconda3/envs/tf/lib/python3.9/site-packages/keras/engine/input_spec.py", line 216, in assert_input_compatibility
        raise ValueError(

    ValueError: Layer "model" expects 2 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 8192) dtype=int32>]


In [34]:
type(train_ds)

tensorflow.python.data.ops.dataset_ops.PrefetchDataset