In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle

In [None]:
from google.colab import files

uploaded = files.upload()


Saving features.npy to features.npy
Saving labels.npy to labels.npy


In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
df = pd.read_csv('/content/drive/MyDrive/bbc_news.csv', encoding_errors='replace')

df=df[['title','description']].dropna()

In [4]:
df.head()

Unnamed: 0,title,description
0,Ukraine: Angry Zelensky vows to punish Russian...,The Ukrainian president says the country will ...
1,War in Ukraine: Taking cover in a town under a...,"Jeremy Bowen was on the frontline in Irpin, as..."
2,Ukraine war 'catastrophic for global food',One of the world's biggest fertiliser firms sa...
3,Manchester Arena bombing: Saffie Roussos's par...,The parents of the Manchester Arena bombing's ...
4,Ukraine conflict: Oil price soars to highest l...,Consumers are feeling the impact of higher ene...


In [5]:
def clean_text(text):
    text=text.lower()
    text=text.replace('\n',' ').replace('/r',' ')
    return ' '.join(text.split())

df['title']=df['title'].apply(clean_text)
df['description']=df['description'].apply(clean_text)

In [None]:
df.head()

In [6]:
df['text']=df['title']+ ' '+df['description']
corpus=df['text'].tolist()
corpus[:5]

['ukraine: angry zelensky vows to punish russian atrocities the ukrainian president says the country will not forgive or forget those who murder its civilians.',
 'war in ukraine: taking cover in a town under attack jeremy bowen was on the frontline in irpin, as residents came under russian fire while trying to flee.',
 "ukraine war 'catastrophic for global food' one of the world's biggest fertiliser firms says the conflict could deliver a shock to food supplies.",
 "manchester arena bombing: saffie roussos's parents on hearing the truth the parents of the manchester arena bombing's youngest victim speak about their life since she died.",
 'ukraine conflict: oil price soars to highest level since 2008 consumers are feeling the impact of higher energy costs as fuel prices and household bills jump.']

In [7]:
vectorizer = tf.keras.layers.TextVectorization()
vectorizer.adapt(corpus)

In [None]:
total_words = len(vectorizer.get_vocabulary())
total_words

In [None]:
for i,j in enumerate(vectorizer.get_vocabulary()):
    print (f'{i} : {j}')

In [None]:
input_sequences = []

for line in corpus:
    tokens = vectorizer(tf.constant([line]))[0].numpy()
    for i in range(1, len(tokens)):
        n_gram_seq = tokens[:i+1]
        input_sequences.append(n_gram_seq.tolist())



In [None]:
max_sequence_len = max(len(seq) for seq in input_sequences)
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(input_sequences, maxlen=max_sequence_len,padding='pre')

In [None]:
features = padded_sequences[:,:-1]
labels = padded_sequences[:,-1]
labels = tf.convert_to_tensor(labels, dtype=tf.int64)

In [None]:
np.save("features.npy", features)
np.save("labels.npy", labels)
with open("metadata.pkl", "wb") as f:
    pickle.dump({
        "max_sequence_len": max_sequence_len,
        "total_words": total_words
    }, f)

In [9]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint

import pickle
import os
import time

In [10]:
labels = np.load("/content/drive/MyDrive/labels.npy")

In [11]:
features = np.load("/content/drive/MyDrive/features.npy")

In [12]:
features.shape

(1099041, 57)

In [13]:
with open("/content/drive/MyDrive/metadata.pkl", "rb") as f:
    meta = pickle.load(f)
    max_sequence_len = meta["max_sequence_len"]
    total_words = meta["total_words"]

In [14]:
dataset=tf.data.Dataset.from_tensor_slices((features,labels))
dataset = dataset.cache().shuffle(10000).batch(1024).prefetch(tf.data.AUTOTUNE)

In [17]:
features.shape

(1099041, 57)

In [18]:
checkpoint_dir = '/content/drive/MyDrive/my_model_weights'
os.makedirs(checkpoint_dir, exist_ok=True)

In [19]:
EMBEDDING_DIM = 64

In [20]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=total_words, output_dim=EMBEDDING_DIM),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(total_words,activation='softmax')
])

In [21]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [22]:
checkpoint_callback = ModelCheckpoint(
    filepath='/content/drive/MyDrive/my_model_weights/epoch_{epoch:04d}.weights.h5',
    save_weights_only=True,
    save_freq='epoch',
    verbose=1
)


In [23]:
model.build(input_shape=(None, max_sequence_len - 1))


In [24]:
model.load_weights('/content/drive/MyDrive/my_model_weights/epoch_0067.weights.h5')


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
history = model.fit(
    dataset,
    epochs=20,
    callbacks=[checkpoint_callback],
  batch_size=256
)

Epoch 1/20
[1m1074/1074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391ms/step - accuracy: 0.7486 - loss: 1.0340
Epoch 1: saving model to /content/drive/MyDrive/my_model_weights/epoch_0001.weights.h5
[1m1074/1074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m421s[0m 392ms/step - accuracy: 0.7486 - loss: 1.0339
Epoch 2/20
[1m1074/1074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391ms/step - accuracy: 0.7507 - loss: 1.0253
Epoch 2: saving model to /content/drive/MyDrive/my_model_weights/epoch_0002.weights.h5
[1m1074/1074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m421s[0m 392ms/step - accuracy: 0.7507 - loss: 1.0253
Epoch 3/20
[1m1074/1074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391ms/step - accuracy: 0.7489 - loss: 1.0279
Epoch 3: saving model to /content/drive/MyDrive/my_model_weights/epoch_0003.weights.h5
[1m1074/1074[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m421s[0m 391ms/step - accuracy: 0.7490 - loss: 1.0278
Epoch 4/20
[1m1074/10

In [25]:
def generate_text(seed_text, next_words=30):
    for _ in range(next_words):

        token_list = vectorizer([seed_text])[0].numpy()
        token_list = tf.keras.preprocessing.sequence.pad_sequences(
            [token_list], maxlen=max_sequence_len - 1, padding='pre')


        predicted_probs = model.predict(token_list, verbose=0)
        predicted_id = np.argmax(predicted_probs, axis=-1)[0]


        predicted_word = vectorizer.get_vocabulary()[predicted_id]


        seed_text += " " + predicted_word

    return seed_text


In [40]:
print(generate_text("zelensky vows to punish russian troops"))

zelensky vows to punish russian troops after attack was in the army have said they will move move quickly punishments is being considered and taken by the military forces officials said its important for to response the russian link to russia says


In [39]:
print(generate_text("redbull star max verstappen secures another pole position"))

redbull star max verstappen secures another pole position time max verstappen believes his provisional lead the miami he said the lap was great but also not the best best race will continue tomorrow after todays race ended fans clapped for him and cheered loudly more


In [42]:
print(generate_text("pep guardiola says the team must remain focused"))

pep guardiola says the team must remain focused on liverpools felt raising himself as they slump to 10 games with a game against arsenal mistakes happen when not focus properly
