# **Using an LSTM to understand news articles and complete sentences**

#### **Author: Partha Seetala**

**Video Tutorial: https://www.youtube.com/watch?v=IVTZ-v4qURY**

In [None]:
!pip install requests readability-lxml beautifulsoup4

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

import argparse
import requests
from readability import Document
from bs4 import BeautifulSoup
import numpy as np
from nltk.tokenize import sent_tokenize
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

Collecting readability-lxml
  Downloading readability_lxml-0.8.4.1-py3-none-any.whl.metadata (4.0 kB)
Collecting cssselect (from readability-lxml)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting lxml_html_clean (from lxml[html_clean]->readability-lxml)
  Downloading lxml_html_clean-0.4.2-py3-none-any.whl.metadata (2.4 kB)
Downloading readability_lxml-0.8.4.1-py3-none-any.whl (19 kB)
Downloading cssselect-1.3.0-py3-none-any.whl (18 kB)
Downloading lxml_html_clean-0.4.2-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean, cssselect, readability-lxml
Successfully installed cssselect-1.3.0 lxml_html_clean-0.4.2 readability-lxml-0.8.4.1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


## Download a news article from the internet (our training dataset)

In [None]:
def fetch_article_text(url, timeout=30):
    """Download a web page and return the plain text of its main article.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The article text as a single string with one non-empty line per text
        fragment. Every run of whitespace inside a line (including
        non-breaking spaces, ``\\xa0``) is collapsed to a single space so
        that downstream tokenization does not produce tokens such as
        ``'\\xa0the'``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``raise_for_status``).
    """
    # 1. Download the page
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()

    # 2. Use readability to isolate the main article HTML
    doc = Document(resp.text)
    article_html = doc.summary()

    # 3. Parse with BeautifulSoup and remove unwanted tags
    soup = BeautifulSoup(article_html, 'html.parser')
    for tag in soup(['script', 'style', 'header', 'footer', 'nav', 'aside']):
        tag.decompose()

    # 4. Get plain text, normalize whitespace, and drop blank lines.
    #    str.split() with no argument splits on any Unicode whitespace, so
    #    non-breaking spaces are treated like ordinary spaces.
    text = soup.get_text(separator='\n')
    normalized_lines = (' '.join(line.split()) for line in text.splitlines())
    return '\n'.join(line for line in normalized_lines if line)


url = 'https://www.whitehouse.gov/briefings-statements/2025/02/united-states-india-joint-leaders-statement/'
training_text = fetch_article_text(url)

# Show only the beginning of the article: printing the whole text floods the
# cell output and buries the rest of the notebook.
PREVIEW_CHARS = 1000
print(training_text[:PREVIEW_CHARS])
print(f"... ({len(training_text)} characters in total)")

United States-India Joint Leaders’ Statement
The President of the United States of America, The Honorable Donald J. Trump hosted the Prime Minister of India, Shri Narendra Modi for an Official Working Visit in Washington, DC on February 13, 2025.
As the leaders of sovereign and vibrant democracies that value freedom, the rule of law, human rights, and pluralism, President Trump and Prime Minister Modi reaffirmed the strength of the India-U.S. Comprehensive Global Strategic Partnership, anchored in mutual trust, shared interests, goodwill and robust engagement of their citizens.
Today, President Trump and Prime Minister Modi launched a new initiative – the
“U.S.-India COMPACT (Catalyzing Opportunities for Military Partnership, Accelerated Commerce & Technology) for the 21st Century”
– to drive transformative change across key pillars of cooperation. Under this initiative, they committed to a results-driven agenda with initial outcomes this year to demonstrate the level of trust for a mu

## Break the long article into individual sentences and tokenize the text

In [None]:
sentences = sent_tokenize(training_text)

# Fit the tokenizer first so sentence lengths can be measured in the same
# units the model will see (token ids), not in whitespace-separated words.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([training_text])
# word_index ids start at 1; id 0 is the value used for padding.
total_words = len(tokenizer.word_index) + 1

# The tokenizer can split one whitespace-delimited word into several tokens
# (e.g. "U.S." becomes 'u' and 's'), so counting sentence.split() would
# underestimate the length and let pad_sequences silently truncate the start
# of long sentences. 30 is kept as the minimum sequence length.
MAX_SENTENCE_LENGTH = 30
for sentence_token_ids in tokenizer.texts_to_sequences(sentences):
    MAX_SENTENCE_LENGTH = max(MAX_SENTENCE_LENGTH, len(sentence_token_ids))

print("Max sentence length: ", MAX_SENTENCE_LENGTH)
print("Total number of unique tokens: ", total_words)
print("Tokens: ", tokenizer.word_index)

Max sentence length:  68
Total number of unique tokens:  1050
Tokens:  {'and': 1, 'the': 2, 'to': 3, 'of': 4, 'in': 5, 'leaders': 6, 's': 7, 'u': 8, 'india': 9, 'for': 10, 'a': 11, '\xa0\xa0': 12, '\xa0the': 13, 'as': 14, 'indian': 15, 'that': 16, 'defense': 17, 'cooperation': 18, 'this': 19, 'energy': 20, 'new': 21, 'with': 22, 'trade': 23, 'on': 24, 'also': 25, 'security': 26, 'partnership': 27, 'committed': 28, 'their': 29, 'by': 30, 'critical': 31, 'including': 32, 'will': 33, 'space': 34, 'between': 35, 'technology': 36, 'systems': 37, 'technologies': 38, 'united': 39, 'states': 40, 'strategic': 41, 'year': 42, 'announced': 43, 'welcomed': 44, 'bilateral': 45, 'work': 46, 'investments': 47, 'both': 48, 'collaboration': 49, 'global': 50, 'initiative': 51, '–': 52, 'across': 53, 'ocean': 54, 'advanced': 55, 'countries': 56, 'they': 57, 'advance': 58, 'strengthen': 59, 'maritime': 60, 'enhance': 61, 'supply': 62, 'importance': 63, 'industry': 64, 'resolved': 65, 'nuclear': 66, 'trump

## Generate sequences from the sentences

In [None]:
def generate_sequences_from_text_data(tokenizer, sentences):
    """Turn sentences into next-token training pairs.

    Each sentence is converted to token ids; every proper prefix of length
    1..len-1 becomes one input, and the token that follows the prefix becomes
    its target. Sentences with fewer than two tokens contribute nothing.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        sentences: Iterable of sentence strings.

    Returns:
        A tuple ``(x, ytrue)``: ``x`` holds the prefixes left-padded to
        ``MAX_SENTENCE_LENGTH`` columns, and ``ytrue`` holds the targets
        one-hot encoded over ``len(tokenizer.word_index) + 1`` classes.
    """
    encoded_sentences = tokenizer.texts_to_sequences(sentences)

    # One (prefix, next-token) pair per cut position inside each sentence.
    training_pairs = [
        (token_ids[:cut], token_ids[cut])
        for token_ids in encoded_sentences
        for cut in range(1, len(token_ids))
    ]
    prefixes = [prefix for prefix, _ in training_pairs]
    next_tokens = [target for _, target in training_pairs]

    # Left-pad so the most recent tokens always sit at the end of the row.
    padded_prefixes = keras.preprocessing.sequence.pad_sequences(
        prefixes,
        maxlen=MAX_SENTENCE_LENGTH,
        padding='pre',
    )

    # One class per vocabulary id, plus id 0.
    num_classes = len(tokenizer.word_index) + 1
    one_hot_targets = to_categorical(next_tokens, num_classes=num_classes)

    return padded_prefixes, one_hot_targets

# Build the training set: padded token-id prefixes (x) and one-hot next-token targets (ytrue).
x, ytrue = generate_sequences_from_text_data(tokenizer, sentences)

In [None]:
# Spot-check one training pair: the padded input prefix and its one-hot target row.
print(x[1])
print(ytrue[1])

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 39 40]
[0. 0. 0. ... 0. 0. 0.]


## Build the LSTM Neural Network Model

In [None]:
EMBEDDING_DIM = 300
LSTM_UNITS = 150

lstm = Sequential()
# Declare the input shape with an explicit Input layer rather than the
# Embedding layer's `input_length` argument, which Keras 3 deprecates. With a
# known input shape the model is built immediately and the summary can list
# output shapes and parameter counts.
lstm.add(keras.Input(shape=(MAX_SENTENCE_LENGTH,)))
lstm.add(Embedding(total_words, EMBEDDING_DIM))
lstm.add(LSTM(LSTM_UNITS))
# One softmax score per vocabulary id: the model predicts the next token.
lstm.add(Dense(total_words, activation='softmax'))
lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
lstm.summary()


None


In [None]:
# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected later; assigning it also keeps its repr out of the cell output.
# NOTE(review): the saved output starts at ~0.97 accuracy in epoch 1, which
# suggests this cell was re-run on an already trained model. Rebuild the
# model cell first for a from-scratch run.
history = lstm.fit(x, ytrue, epochs=100, batch_size=30, verbose=1)

Epoch 1/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9697 - loss: 0.0776
Epoch 2/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9719 - loss: 0.0835
Epoch 3/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9687 - loss: 0.0753
Epoch 4/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.9703 - loss: 0.0729
Epoch 5/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9609 - loss: 0.0906
Epoch 6/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9615 - loss: 0.0890
Epoch 7/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9666 - loss: 0.0865
Epoch 8/100
[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9646 - loss: 0.0888
Epoch 9/100
[1m103/103[0m [32

<keras.src.callbacks.history.History at 0x78a714cf3490>

In [None]:
# Display the model's layer-by-layer summary.
lstm.summary()

In [None]:
def complete_sentence(model, tokenizer, seed_text, num_tokens=3):
    """Greedily extend ``seed_text`` with predicted tokens and print the result.

    The seed is printed unchanged, followed by each predicted word in bold
    (ANSI escape codes) and a final newline. At every step the token with the
    highest predicted score is chosen and fed back as context for the next
    step. Nothing is returned.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of
            per-token scores for each input sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        seed_text: Text to start the completion from.
        num_tokens: Number of tokens to generate.
    """
    # Build the id -> word lookup once, instead of scanning word_index
    # linearly for every generated token.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # STEP 1: Same as during training (convert the seed_text into a sequence)
    sequence = tokenizer.texts_to_sequences([seed_text])[0]

    print(seed_text, end="")

    for _ in range(num_tokens):
        # STEP 2: Pad the sequence
        padded_seq = keras.preprocessing.sequence.pad_sequences(
            [sequence], maxlen=MAX_SENTENCE_LENGTH, padding='pre')

        # STEP 3: Predict next token probabilities
        ypred = model.predict(padded_seq, verbose=0)

        # STEP 4: Get the most likely token (as a plain int, not a NumPy scalar)
        tokenid = int(np.argmax(ypred[0]))

        # STEP 5: Convert token to word; ids without a word (e.g. the padding
        # id 0) print nothing.
        word = index_to_word.get(tokenid)
        if word is not None:
            print(" " + '\033[1m' + word + '\033[0m', end="")

        # STEP 6: Update the sequence for the next prediction
        sequence.append(tokenid)
        sequence = sequence[-MAX_SENTENCE_LENGTH:]  # Keep only last maxlen tokens
    print()

# Start from a single seed word and let the model extend it by 20 tokens.
prompt = "modi"
complete_sentence(lstm, tokenizer, prompt, num_tokens=20)

modi [1malso[0m [1mexpressed[0m [1mappreciation[0m [1mfor[0m [1mu[0m [1ms[0m [1mmeasures[0m [1mtaken[0m [1mto[0m [1menhance[0m [1mexports[0m [1mof[0m [1mindian[0m [1mmangoes[0m [1mand[0m [1mpomegranates[0m [1mto[0m [1mthe[0m [1munited[0m [1mstates[0m


In [None]:
# Same seed word, but generate only 10 tokens.
complete_sentence(lstm, tokenizer, "modi", num_tokens=10)

modi [1malso[0m [1mexpressed[0m [1mappreciation[0m [1mfor[0m [1mu[0m [1ms[0m [1mmeasures[0m [1mtaken[0m [1mto[0m [1menhance[0m


In [None]:
# Try a different seed word.
complete_sentence(lstm, tokenizer, "trump", num_tokens=20)


trump [1mand[0m [1mprime[0m [1mminister[0m [1mmodi[0m [1mpledged[0m [1mto[0m [1msustain[0m [1mhigh[0m [1mlevel[0m [1mengagement[0m [1mbetween[0m [1mour[0m [1mgovernments[0m [1mindustries[0m [1mand[0m [1macademic[0m [1minstitutions[0m [1mand[0m [1mrealize[0m [1mtheir[0m
