<a href="https://colab.research.google.com/github/lucascheng24/COMP4432ML-DataProduct-A_Million_News_Headlines/blob/main/implement/rNN_headlineGeneration_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
url = 'https://raw.githubusercontent.com/lucascheng24/COMP4432ML-DataProduct-A_Million_News_Headlines/main/raw_data/abcnews-date-text.csv'

# Load the headlines dataset
df = pd.read_csv(url)
df.head(10)

# Sample size
sampleSize = len(df) // 50   # 5%
analyze_random_state = 4432
sample_df = df.sample(n = 10000, random_state = analyze_random_state)
sample_df.shape



(10000, 2)

In [4]:
# Preprocess the data
max_words = 5000  # reduce the vocabulary size
max_len = 10  # reduce the sequence length
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(sample_df['headline_text'])
sequences = tokenizer.texts_to_sequences(sample_df['headline_text'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
target = tf.keras.utils.to_categorical(padded_sequences, num_classes=max_words)

# Define a generator function to yield batches of data
def data_generator(data, target, max_len, batch_size):
    while True:
        indices = np.random.choice(len(data), batch_size)
        batch = data[indices]
        batch_target = target[indices]
        yield batch, batch_target

# Create tf.data dataset
batch_size = 16
dataset = tf.data.Dataset.from_generator(
    data_generator,
    args=[padded_sequences, target, max_len, batch_size],
    output_types=(tf.int32, tf.int32),
    output_shapes=([None, max_len], [None, max_words])
)

# Define the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=max_words, output_dim=32, input_length=max_len),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(max_words, activation='softmax')
])


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [7]:
# Generate new headlines based on input keywords
def generate_headline(model, tokenizer, keywords, max_len, num_words=1):
    # Convert the keywords to a sequence of integers
    keyword_seq = tokenizer.texts_to_sequences([keywords])[0]
    # Pad the sequence to match the maximum length of the training data
    padded_seq = pad_sequences([keyword_seq], maxlen=max_len, padding='post')
    # Generate the next words in the sequence using the model
    for i in range(num_words):
        next_word_index = np.argmax(model.predict(padded_seq)[0])
        # Convert the integer back to a word using the tokenizer
        next_word = tokenizer.index_word[next_word_index]
        # Append the next word to the sequence
        padded_seq = np.append(padded_seq[:, 1:], [[next_word_index]], axis=1)  # remove the first word from the sequence
    # Convert the sequence of integers back to a sentence using the tokenizer
    generated_headline = tokenizer.sequences_to_texts(padded_seq)[0]
    # Return the generated headline
    return generated_headline

In [10]:
# Generate a new headline based on list of input keywords
generated_headline = generate_headline(model, tokenizer, ['man', 'car'], max_len, num_words=5)
print(generated_headline)

super herald aust aust motorbike
