In [None]:
import pandas as pd
import numpy as np
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [None]:
import kagglehub

# Fetch the latest published version of the "vstepanenko/disaster-tweets"
# dataset. `path` is the local location of the dataset files (printed below),
# so later cells can locate the CSV relative to it.
path = kagglehub.dataset_download("vstepanenko/disaster-tweets")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'disaster-tweets' dataset.
Path to dataset files: /kaggle/input/disaster-tweets


In [None]:
import os

import pandas as pd

# Locate the CSV inside the directory returned by kagglehub.dataset_download()
# rather than a hardcoded cache path: the download location differs between
# environments (the download cell reported /kaggle/input/disaster-tweets, not
# a /root/.cache/kagglehub/... directory), so a fixed path breaks on re-run.
df = pd.read_csv(os.path.join(path, "tweets.csv"))

In [None]:
# Work on the first 500 tweets only, each coerced to a plain Python string.
data = df['text'].iloc[:500].astype(str).tolist()

In [None]:
def clean_text(text):
    """Normalise one tweet for tokenisation.

    Steps, in order: delete every character listed in ``string.punctuation``,
    lowercase the remainder, then drop any character that is not ASCII.

    Args:
        text: The raw tweet as a ``str``.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    lowered = text.translate(punctuation_table).lower()
    # Round-trip through UTF-8 bytes: the ASCII decoder with errors='ignore'
    # silently discards every byte that belongs to a non-ASCII character.
    utf8_bytes = lowered.encode("utf8")
    return utf8_bytes.decode("ascii", 'ignore')

In [None]:
import string
# Run every selected tweet through clean_text to obtain the training corpus.
corpus = list(map(clean_text, data))

In [None]:
# Fit a word-level vocabulary on the cleaned tweets; `word_index` maps each
# distinct word to an integer id.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One more than the vocabulary size: Keras word ids start at 1, so the extra
# slot covers id 0, which is what padded positions contain.
total_words = len(tokenizer.word_index) + 1

In [None]:
# For every tweet, emit each prefix of its token ids that has at least two
# tokens: the last id of a prefix is the word to predict, the rest its context.
input_sequences = [
    token_ids[:prefix_len]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for prefix_len in range(2, len(token_ids) + 1)
]

In [None]:
# The longest n-gram fixes the common width; shorter ones are left-padded so
# the target word always sits in the last column.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [None]:
# Context = every column but the last; target = the last column, one-hot
# encoded over the vocabulary to match the softmax output of the model.
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(
    input_sequences[:, -1],
    num_classes=total_words,
)

In [None]:
# Next-word model: embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = Sequential([
    # Declare the input shape with an explicit Input layer. Embedding's
    # `input_length` argument is deprecated in Keras 3 (it only triggers a
    # warning there), and without a known input shape the model is not built,
    # so summary() cannot report output shapes or parameter counts.
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, 100),
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(150, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    # One output unit per vocabulary id (including the padding id 0).
    Dense(total_words, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [None]:
print("Starting training on Disaster Tweets...")
# Bind the returned History object: it keeps the per-epoch loss/accuracy
# available for later inspection and stops its repr from being echoed as the
# cell's output.
history = model.fit(X, y, epochs=50, verbose=1)

Starting training on Disaster Tweets...
Epoch 1/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 79ms/step - accuracy: 0.0460 - loss: 6.7486
Epoch 2/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 87ms/step - accuracy: 0.0461 - loss: 6.6335
Epoch 3/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 81ms/step - accuracy: 0.0486 - loss: 6.5316
Epoch 4/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 86ms/step - accuracy: 0.0552 - loss: 6.4483
Epoch 5/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 82ms/step - accuracy: 0.0539 - loss: 6.3061
Epoch 6/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 80ms/step - accuracy: 0.0565 - loss: 6.2298
Epoch 7/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 85ms/step - accuracy: 0.0622 - loss: 6.1012
Epoch 8/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 80ms/step - accuracy: 0.0645

<keras.src.callbacks.history.History at 0x7ed3c06007a0>

In [None]:
def generate_tweet_text(seed_text, next_words=3):
    """Greedily extend ``seed_text`` with the model's most likely next words.

    Each step tokenises the text generated so far with the notebook-level
    ``tokenizer``, left-pads it to the model's input width, and appends the
    word whose id has the highest predicted probability.

    Args:
        seed_text: Starting text to continue.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by up to ``next_words`` predicted words, each
        preceded by a single space. Fewer words are appended if the model
        predicts an id that has no word (the padding id).
    """
    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([seed_text])[0]
        model_input = pad_sequences([token_ids], maxlen=max_sequence_len-1, padding='pre')

        predicted_probs = model.predict(model_input, verbose=0)
        predicted_index = int(np.argmax(predicted_probs, axis=1)[0])

        # index_word is the tokenizer's own id -> word mapping, so the lookup
        # is a single dict access instead of a scan over word_index.
        output_word = tokenizer.index_word.get(predicted_index)
        if not output_word:
            # No word for this id (e.g. the padding id 0). Appending nothing
            # would leave the tokenised input unchanged, so every remaining
            # step would repeat the same prediction and only add stray spaces.
            break
        seed_text += " " + output_word
    return seed_text


In [None]:
print("\n--- Model Predictions ---")
# Each pair is (seed text, number of words to generate after it).
for seed, word_count in (("Forest fire", 5), ("Emergency", 4)):
    print(generate_tweet_text(seed, next_words=word_count))


--- Model Predictions ---
Forest fire in gauteng people go to
Emergency sets cars ablaze at
