In [1]:
import os
from tensorflow.keras.utils import text_dataset_from_directory
import codecs
import tempfile
import json
import pandas as pd
import numpy as np

from keras.layers import Input, Dense, Activation, TimeDistributed, Softmax, TextVectorization, Reshape, RepeatVector, Conv1D, Bidirectional, AveragePooling1D, UpSampling1D, Embedding, Concatenate, GlobalAveragePooling1D, LSTM, Multiply, MultiHeadAttention
from keras.models import Model
import tensorflow as tf
import keras

from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Root directory of the scraped dataset; its sub-folders are scanned for JSON files.
DATASET_DIR = "./WebscrapData/"

In [3]:
# Collect the 'content' field of every JSON file found one level below DATASET_DIR.
content_list = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the batches built from it) reproducible across runs and machines.
for folder_name in sorted(os.listdir(DATASET_DIR)):
    folder_path = os.path.join(DATASET_DIR, folder_name)

    # Only sub-folders are scanned; plain files in the root directory are skipped.
    if not os.path.isdir(folder_path):
        continue

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(folder_path, filename)

        # Decode explicitly as UTF-8: with the platform default encoding,
        # non-ASCII text (e.g. curly quotes) is garbled or rejected on systems
        # whose default is not UTF-8.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # A file without a 'content' key raises KeyError here.
        content_list.append(data['content'])

# One row per JSON file, in a single 'content' column.
df = pd.DataFrame({'content': content_list})

In [4]:
# Show the loaded DataFrame as this cell's output.
df

Unnamed: 0,content
0,Because we can’t truly advance groundbreaking ...
1,"December 20, 2022 Summarization using automat..."
2,"April 13, 2023 From a young age, people expre..."
3,"December 13, 2022 Many recent breakthroughs i..."
4,"April 17, 2023 Ocorreu um erroEstamos tendo pr..."
...,...
544,Adam Geitgey Follow -- 72 Listen Share Update:...
545,Adam Geitgey Follow -- 27 Listen Share Update:...
546,Adam Geitgey Follow -- 18 Listen Share This ar...
547,Adam Geitgey Follow -- 263 Listen Share Update...


In [5]:
# Wrap the article texts in a tf.data pipeline and group them into mini-batches.
batch_size = 16
dataset = tf.data.Dataset.from_tensor_slices(df['content'].values).batch(batch_size)

In [6]:
# Input, TextVectorization and Model are already imported in the notebook's
# import cell, so they are not re-imported here.

# The vocabulary is capped at vocab_size entries and every text is padded or
# truncated to exactly seq_len token ids.
vocab_size = 1000
seq_len = 10
vectorize_layer = TextVectorization(max_tokens=vocab_size, output_sequence_length=seq_len)

# Build the vocabulary from the batched text dataset.
vectorize_layer.adapt(dataset)

In [7]:
def predict_word(seq_len, latent_dim, vocab_size):
    """Build the next-word predictor and a companion model exposing its latent code.

    Args:
        seq_len: Length of a full token sequence; the network consumes
            ``seq_len - 1`` token ids.
        latent_dim: Size of the embedding vectors.
        vocab_size: Number of token ids; sets both the embedding table size
            and the width of the output distribution.

    Returns:
        A ``(predictor, latent)`` tuple of Keras models that share the same
        input and layers. ``predictor`` ends in a softmax over ``vocab_size``
        entries; ``latent`` stops at the concatenation of the two single-unit
        LSTM outputs.
    """
    tokens_in = Input(shape=(seq_len - 1,))

    # Token ids -> dense vectors, with id 0 masked as padding.
    embedded = Embedding(vocab_size, latent_dim, name='embedding', mask_zero=True)(tokens_in)

    # Self-attention: the embedded sequence is used as both query and value.
    attended = MultiHeadAttention(num_heads=3, key_dim=2)(embedded, value=embedded)

    # Two independent one-unit LSTMs read the sequence in opposite directions.
    forward_state = LSTM(1)(attended)
    backward_state = LSTM(1, go_backwards=True)(attended)

    # Their outputs side by side form the latent representation.
    latent_rep = Concatenate()([forward_state, backward_state])

    # Project the latent code onto the vocabulary and normalise to probabilities.
    scores = Dense(vocab_size)(latent_rep)
    probabilities = Softmax()(scores)

    predictor_model = Model(tokens_in, probabilities)
    latent_model = Model(tokens_in, latent_rep)
    return predictor_model, latent_model

# Create the predictor and latent models. seq_len is the same value the
# vectorizer was configured with, so the model's input width follows the data
# instead of relying on a second hard-coded 10.
predictor, latent = predict_word(seq_len, 32, vocab_size)

# print model summary
predictor.summary()

# configure optimizer and loss function
#opt = keras.optimizers.SGD(learning_rate=1, momentum=0.9)
# NOTE(review): 0.1 is far above Nadam's documented default of 0.001 — confirm
# this is intentional.
opt = keras.optimizers.Nadam(learning_rate=0.1)
# NOTE(review): ignore_class=1 presumably skips the out-of-vocabulary token id
# produced by TextVectorization — confirm against the vocabulary.
loss_fn = keras.losses.SparseCategoricalCrossentropy(
    ignore_class=1,
    name="sparse_categorical_crossentropy",
)

# compile the model
predictor.compile(loss=loss_fn, optimizer=opt, metrics=["accuracy"])

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 9)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 9, 32)        32000       ['input_1[0][0]']                
                                                                                                  
 multi_head_attention (MultiHea  (None, 9, 32)       818         ['embedding[0][0]',              
 dAttention)                                                      'embedding[0][0]']              
                                                                                                  
 lstm (LSTM)                    (None, 1)            136         ['multi_head_attention[0][0]'

In [8]:
def separar_ultimo_token(x):
    """Split a batch of texts into model inputs and next-token targets.

    Args:
        x: Batch of raw strings accepted by ``vectorize_layer``.

    Returns:
        ``(inputs, target)`` where ``inputs`` holds every token id of each
        vectorized sequence except the last one, and ``target`` holds that
        last token id, kept as a trailing axis of size 1.
    """
    tokens = vectorize_layer(x)
    # Slice both parts from the full sequence. Taking the target from the
    # already-truncated inputs would make the label equal to the last input
    # token and never use the real final token.
    inputs = tokens[:, :-1]
    target = tokens[:, -1:]
    return inputs, target

In [9]:
# Preview the mapped dataset (its repr shows the element spec); the result is not stored.
dataset.map(separar_ultimo_token)

<_MapDataset element_spec=(TensorSpec(shape=(None, 9), dtype=tf.int64, name=None), TensorSpec(shape=(None, 1), dtype=tf.int64, name=None))>

In [10]:
# Stop training once the monitored 'loss' has not improved for 5 consecutive
# epochs, then restore the best weights seen.
early_stopping = EarlyStopping(
    monitor='loss',
    patience=5,
    restore_best_weights=True,
)

In [11]:
# Train on (inputs, target) pairs derived on the fly from the text batches.
history = predictor.fit(
    dataset.map(separar_ultimo_token),
    epochs=60,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


In [12]:
def predizer2(entrada, numero_de_predicoes, modelo, vectorize_layer, temperature=0):
    """Extend a prompt word by word by sampling among the model's top predictions.

    Args:
        entrada: Prompt text to continue.
        numero_de_predicoes: Number of words to append.
        modelo: Object whose ``predict`` returns a ``(1, vocab)`` score array
            for a ``(1, window)`` batch of token ids.
        vectorize_layer: Callable turning a list of strings into a 2-D batch of
            token ids; must also provide ``get_vocabulary()``.
        temperature: Accepted for backward compatibility; currently unused.

    Returns:
        The prompt followed by the generated words, separated by single spaces.
        Each generated word is also printed as it is chosen.
    """
    # The vocabulary does not change while generating, so fetch it once
    # instead of rebuilding the list on every retry.
    vocabulario = vectorize_layer.get_vocabulary()

    frase = entrada
    contexto = frase  # sliding context fed to the model

    for _ in range(numero_de_predicoes):
        tokens = vectorize_layer([contexto])[:, :-1]
        janela = int(tokens.shape[1])  # number of token ids the model consumes
        # Copy into a writable NumPy array: used scores are zeroed in place below.
        pred = np.array(modelo.predict(tokens))

        while True:
            # Ten highest-scoring token ids; the stable sort breaks ties in
            # favour of the lower index.
            candidatos = np.argsort(-pred[0], kind="stable")[:10]
            idx = np.random.choice(candidatos)
            word = vocabulario[idx]
            if word not in frase.split():
                break
            # Do not repeat words: zero the score so another candidate can
            # enter the top ten on the next attempt.
            pred[0][idx] = 0

        frase = frase + " " + word
        # Keep only the newest `janela` words. The vectorizer keeps the first
        # tokens of an over-long text, so a longer context would silently drop
        # the most recent words and freeze the model input.
        contexto = ' '.join(frase.split()[-janela:])
        print(word)
    return frase

# Append 10 generated words to the prompt using the trained predictor.
predizer2("machine learning is", 10, predictor, vectorize_layer)

tools
article
for
models
with
life
using
about
so
in


'machine learning is tools article for models with life using about so in'