In [1]:
import numpy as np
import pandas as pd 
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the dataset from the local file
file_path = "/Users/owendolan/Desktop/nlp-email-client/data/article_summaries(small).csv"
data = pd.read_csv(file_path)

# Extract documents (Content) and summaries
articles = data["Content"].astype(str).tolist()
summaries = data["Summary"].astype(str).tolist()

In [None]:
# Hyperparameter
max_vocab_size = 2000
max_seq_length = 20

# Create and configure tokenizers with special tokens
input_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<unk>")
target_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<unk>")

# Add special tokens to target texts
target_texts_with_tokens = ['<start> ' + text + ' <end>' for text in summaries]

# Fit tokenizers
input_tokenizer.fit_on_texts(articles)
target_tokenizer.fit_on_texts(target_texts_with_tokens)

# Manually add special tokens if they're not in the vocabulary
special_tokens = ['<start>', '<end>', '<unk>']
current_index = len(target_tokenizer.word_index) + 1
for token in special_tokens:
    if token not in target_tokenizer.word_index:
        target_tokenizer.word_index[token] = current_index
        target_tokenizer.index_word[current_index] = token
        current_index += 1

# Tokenize texts
input_sequences = input_tokenizer.texts_to_sequences(articles)
target_sequences = target_tokenizer.texts_to_sequences(target_texts_with_tokens)

# Pad sequences
encoder_input_data = pad_sequences(input_sequences, maxlen=max_seq_length, padding='post')
decoder_input_data = pad_sequences(target_sequences, maxlen=max_seq_length, padding='post')

# Prepare decoder target data (shifted by one timestep)
decoder_target_data = np.zeros_like(decoder_input_data)
decoder_target_data[:, :-1] = decoder_input_data[:, 1:]

# Update max_vocab_size to account for special tokens
max_vocab_size = max(len(input_tokenizer.word_index) + 1, len(target_tokenizer.word_index) + 1)

In [4]:
# Model architecture
embedding_dim = 256
lstm_units = 256

# Encoder
encoder_inputs = Input(shape=(max_seq_length,))
encoder_embedding = Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = LSTM(lstm_units, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(max_seq_length,))
decoder_embedding = Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, mask_zero=True)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(max_vocab_size, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Combined Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

2024-11-15 16:25:59.240156: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2024-11-15 16:25:59.240186: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-11-15 16:25:59.240195: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-11-15 16:25:59.240215: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-11-15 16:25:59.240229: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# compile the model 

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [6]:
# train the model 

model.fit(
    [encoder_input_data, decoder_input_data],
    np.expand_dims(decoder_target_data, -1),
    batch_size=32,
    epochs=50,
    validation_split=0.2
)

Epoch 1/50


2024-11-15 16:26:00.596838: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 256ms/step - accuracy: 0.1735 - loss: 10.0380 - val_accuracy: 0.2175 - val_loss: 6.5481
Epoch 2/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 209ms/step - accuracy: 0.2112 - loss: 5.7711 - val_accuracy: 0.2271 - val_loss: 5.5520
Epoch 3/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 207ms/step - accuracy: 0.2198 - loss: 5.1458 - val_accuracy: 0.2271 - val_loss: 5.4617
Epoch 4/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 209ms/step - accuracy: 0.2196 - loss: 5.0522 - val_accuracy: 0.2271 - val_loss: 5.4066
Epoch 5/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 207ms/step - accuracy: 0.2201 - loss: 5.0029 - val_accuracy: 0.2271 - val_loss: 5.3706
Epoch 6/50
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 207ms/step - accuracy: 0.2187 - loss: 5.0015 - val_accuracy: 0.2271 - val_loss: 5.3333
Epoch 7/50
[1m20/20[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x15bc35150>

In [7]:
# Encoder Model
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder Model
decoder_state_input_h = Input(shape=(lstm_units,))
decoder_state_input_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_lstm_outputs, state_h, state_c = decoder_lstm(
    decoder_embedding, initial_state=decoder_states_inputs
)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_lstm_outputs)

decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states
)


In [8]:
def decode_sequence(input_seq):
    # Encode the input sequence
    states_value = encoder_model.predict(input_seq)
    
    # Verify that '<start>' is in the vocabulary
    if '<start>' not in target_tokenizer.word_index:
        raise KeyError("'<start>' token not found in vocabulary!")
        
    # Generate empty target sequence with <start> token
    target_seq = np.zeros((1, max_seq_length))
    target_seq[0, 0] = target_tokenizer.word_index['<start>']
    
    stop_condition = False
    decoded_sentence = []
    iteration = 0  # for debugging loop iterations
    
    while not stop_condition:
        # Predict the next word
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        # Get the predicted word
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = target_tokenizer.index_word.get(sampled_token_index, '<unk>')
        
        # Debugging prints
        print(f"Iteration {iteration}: Predicted token index = {sampled_token_index}, word = '{sampled_word}'")
        
        # Append the word to the decoded sentence
        if sampled_word != '<end>':
            decoded_sentence.append(sampled_word)
        
        # Exit condition
        if sampled_word == '<end>' or len(decoded_sentence) >= max_seq_length:
            stop_condition = True
        
        # Update the target sequence (next input to the decoder)
        target_seq = np.zeros((1, max_seq_length))
        target_seq[0, 0] = sampled_token_index
        
        # Update states
        states_value = [h, c]
        
        iteration += 1  # increment iteration counter
        
    return ' '.join(decoded_sentence)


In [9]:
# Test the model
test_input = "There are a lot of airplanes in the sky all flying around. Many people sit in the airplanes. The airplanes fly very fast."
test_seq = pad_sequences(input_tokenizer.texts_to_sequences([test_input]), maxlen=max_seq_length, padding='post')

# Print vocabulary info for debugging
print("Target tokenizer vocabulary size:", len(target_tokenizer.word_index))
print("Special tokens in vocabulary:", {token: target_tokenizer.word_index.get(token, 'Not found') for token in ['<start>', '<end>', '<unk>']})

summary = decode_sequence(test_seq)
print("Generated Summary:", summary)

Target tokenizer vocabulary size: 9521
Special tokens in vocabulary: {'<start>': 9520, '<end>': 9521, '<unk>': 1}
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 721ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 438ms/step
Iteration 0: Predicted token index = 1, word = '<unk>'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Iteration 1: Predicted token index = 8, word = 'end'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Iteration 2: Predicted token index = 8, word = 'end'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Iteration 3: Predicted token index = 8, word = 'end'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Iteration 4: Predicted token index = 8, word = 'end'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
Iteration 5: Predicted token index = 8, word = 'end'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Iteration 6: Predicted token index = 8, word = 'end'
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Iteration 7: Predicted token index = 8, word = 'en