In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
url = 'https://raw.githubusercontent.com/lucascheng24/COMP4432ML-DataProduct-A_Million_News_Headlines/main/raw_data/abcnews-date-text.csv'

# Load the headlines dataset
df = pd.read_csv(url)

In [3]:
df.head(10)

Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting lic...
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers
5,20030219,ambitious olsson wins triple jump
6,20030219,antic delighted with record breaking barca
7,20030219,aussie qualifier stosur wastes four memphis match
8,20030219,aust addresses un security council over iraq
9,20030219,australia is locked into war timetable opp


In [5]:
# Sample size
sampleSize = len(df) // 20   # 5%
analyze_random_state = 4432

print("sampleSize: ", sampleSize)

sampleSize:  62209


In [6]:
sample_df = df.sample(n = sampleSize, random_state = analyze_random_state)

In [7]:
sample_df.shape

(62209, 2)

In [8]:
# Preprocess the data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sample_df['headline_text'])
sequences = tokenizer.texts_to_sequences(sample_df['headline_text'])
max_len = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

In [9]:
padded_sequences

array([[   26,    83,     1, ...,     0,     0,     0],
       [10630,  1657,  2370, ...,     0,     0,     0],
       [ 6035,  1270,   115, ...,     0,     0,     0],
       ...,
       [ 8856,   121,   275, ...,     0,     0,     0],
       [ 3690,  3127,   209, ...,     0,     0,     0],
       [ 1163,   207,  5954, ...,     0,     0,     0]])

In [11]:
padded_sequences.shape

(62209, 15)

In [13]:
# Define the RNN model
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64, input_length=max_len),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(len(tokenizer.word_index)+1, activation='softmax')
])

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

one_hot_targets = tf.keras.utils.to_categorical(padded_sequences, num_classes=len(tokenizer.word_index)+1)
integer_targets = padded_sequences.astype('int32')
# Train the model
# model.fit(padded_sequences, one_hot_targets, epochs=10)
model.fit(padded_sequences, integer_targets, epochs=10)

MemoryError: Unable to allocate 110. GiB for an array with shape (933135, 31685) and data type float32

In [None]:
# Generate new headlines based on input keywords
def generate_headline(model, tokenizer, keywords):
    # Convert the keywords to a sequence of integers
    keyword_seq = tokenizer.texts_to_sequences([keywords])[0]
    # Pad the sequence to match the maximum length of the training data
    padded_seq = pad_sequences([keyword_seq], maxlen=max_len, padding='post')
    # Predict the next word in the sequence using the model
    next_word_index = np.argmax(model.predict(padded_seq)[0])
    # Convert the integer back to a word using the tokenizer
    next_word = tokenizer.index_word[next_word_index]
    # Return the generated headline
    return keywords + ' ' + next_word

In [None]:
# Generate a new headline based on the input keyword "fire"
generated_headline = generate_headline(model, tokenizer, 'fire')
print(generated_headline)