# PROBLEM 1: Topic Models

In [1]:
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import os
import re

In [2]:
# -------------------------------
# Data Loading and Preprocessing
# -------------------------------

# Fetch both the train and test splits (subset='all') of 20 Newsgroups.
# Headers, footers and quoted replies are stripped via `remove=...`, so only
# the message bodies are kept.
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
raw_documents = newsgroups_data.data         # Raw text, one entry per post
document_labels = newsgroups_data.target       # Newsgroup labels; not referenced by the cells visible here

# Print the total number of documents retrieved
print(f"Number of documents: {len(raw_documents)}")

# English stop words, stored as a set so the `not in stop_words_set` test in
# preprocess_document is a constant-time lookup.
# NOTE: stopwords.words() needs the NLTK 'stopwords' corpus to be installed locally.
stop_words_set = set(stopwords.words('english'))

def preprocess_document(document):
    """
    Normalize one raw document into a space-separated string of content words.

    The text is lowercased and split with NLTK's ``word_tokenize``. A token is
    kept only when it is purely alphabetic and is not in ``stop_words_set``.

    Parameters:
        document (str): Raw text of a single document.

    Returns:
        str: The surviving tokens, in their original order, joined by single spaces.
    """
    kept_words = []
    for word in word_tokenize(document.lower()):
        # Reject numbers/punctuation first, then common English stop words.
        if word.isalpha() and word not in stop_words_set:
            kept_words.append(word)
    return ' '.join(kept_words)

# Run preprocess_document over every raw post; the result keeps the same order
# and length as raw_documents (a post may become an empty string).
preprocessed_documents = [preprocess_document(doc) for doc in raw_documents]

# Print the number of preprocessed documents (should match the raw count)
print(f"Number of preprocessed documents: {len(preprocessed_documents)}")

# -------------------------------
# Feature Extraction
# -------------------------------

# Build a bag-of-words count matrix (rows = documents, columns = vocabulary terms).
# CountVectorizer is used with its defaults, so no min_df/max_df pruning is applied
# and every token that survives preprocessing becomes a column.
vectorizer = CountVectorizer()
document_term_matrix = vectorizer.fit_transform(preprocessed_documents)
# Column index -> word lookup, used when printing the top words of each topic
feature_names = vectorizer.get_feature_names_out()

# -------------------------------
# Topic Modeling Display Function
# -------------------------------

def display_top_words(topic_model, feature_names, n_top_words):
    """
    Print the highest-weighted words of every topic in a fitted topic model.

    For each row of ``topic_model.components_`` a "Topic #k:" header is printed,
    followed by one "word: weight" line per selected word (weight shown with two
    decimals, taken directly from ``components_``), followed by a blank line.

    Parameters:
        topic_model: A fitted model exposing a 2D ``components_`` array
            (topics x vocabulary), e.g. LatentDirichletAllocation or NMF.
        feature_names: Sequence mapping a column index of ``components_`` to its word.
        n_top_words (int): Number of words to print per topic. ``0`` prints only
            the topic headers; values larger than the vocabulary print every word.

    Returns:
        None. Output is written to stdout.
    """
    for topic_idx, topic in enumerate(topic_model.components_):
        print(f"Topic #{topic_idx + 1}:")
        # Reverse to descending order first and truncate afterwards. Slicing the
        # ascending order with [-n_top_words:] would return the *entire*
        # vocabulary when n_top_words == 0, because -0 == 0.
        top_word_indices = topic.argsort()[::-1][:n_top_words]
        # Print each word and its corresponding weight
        for idx in top_word_indices:
            word = feature_names[idx]
            weight = topic[idx]
            print(f"{word}: {weight:.2f}")
        print()  # Blank line between topics

Number of documents: 18846
Number of preprocessed documents: 18846


In [3]:
# -------------------------------
# LDA Topic Modeling
# -------------------------------

# Number of words printed per topic, and the topic counts (K) to compare.
# Each K is fitted in its own cell; this cell handles topic_counts[0].
number_of_top_words = 20
topic_counts = [10, 20, 50]

# Fit LDA on the count matrix. random_state is fixed so repeated runs on the
# same matrix give the same topics. The weights printed by display_top_words are
# entries of `components_`, not normalized probabilities.

print(f"\nLatent Dirichlet Allocation (LDA) with {topic_counts[0]} topics")
lda_model = LatentDirichletAllocation(n_components=topic_counts[0], random_state=0)
lda_model.fit(document_term_matrix)
display_top_words(lda_model, feature_names, number_of_top_words)


Latent Dirichlet Allocation (LDA) with 10 topics
Topic #1:
god: 2056.11
jesus: 1176.80
christ: 676.90
one: 555.39
church: 538.18
lord: 516.38
sin: 410.32
bible: 406.49
also: 366.23
father: 338.83
said: 320.87
faith: 314.09
man: 304.59
john: 303.49
spirit: 293.81
son: 284.34
us: 270.29
may: 268.53
shall: 262.98
holy: 250.27

Topic #2:
people: 900.55
armenian: 839.90
israel: 749.32
government: 740.28
turkish: 658.68
war: 633.73
armenians: 633.54
state: 620.78
states: 614.71
jews: 613.22
gun: 555.22
university: 469.74
new: 466.34
national: 465.42
united: 461.91
one: 461.78
israeli: 457.16
said: 436.01
world: 409.99
american: 399.45

Topic #3:
would: 1372.58
use: 1339.06
also: 1299.70
information: 1243.69
available: 1226.36
key: 1221.85
space: 1217.70
data: 1129.48
one: 1057.93
system: 1049.26
may: 958.74
send: 825.21
list: 819.82
used: 768.41
get: 760.79
new: 743.05
mail: 720.49
could: 716.93
please: 712.11
like: 681.15

Topic #4:
new: 475.04
san: 252.88
april: 238.56
adl: 201.69
art: 18

In [4]:
# Refit LDA on document_term_matrix with the second topic count (topic_counts[1]).
# Rebinding `lda_model` discards the model fitted in the previous cell.
print(f"\nLatent Dirichlet Allocation (LDA) with {topic_counts[1]} topics")
lda_model = LatentDirichletAllocation(n_components=topic_counts[1], random_state=0)
lda_model.fit(document_term_matrix)
display_top_words(lda_model, feature_names, number_of_top_words)


Latent Dirichlet Allocation (LDA) with 20 topics
Topic #1:
station: 96.63
said: 88.75
april: 86.81
space: 85.38
could: 71.34
turkey: 63.76
redesign: 55.79
may: 48.91
three: 47.85
command: 47.84
one: 46.44
bob: 46.12
visual: 45.20
option: 44.73
also: 43.46
status: 41.80
university: 41.09
following: 40.03
options: 39.16
article: 36.47

Topic #2:
armenian: 883.30
armenians: 676.88
turkish: 658.99
people: 630.60
jews: 607.67
war: 571.04
government: 465.04
world: 367.19
history: 361.14
muslim: 343.86
russian: 339.33
genocide: 338.05
muslims: 337.31
armenia: 336.99
university: 330.93
population: 327.48
state: 322.89
turkey: 320.74
turks: 312.05
states: 310.95

Topic #3:
available: 1151.26
key: 1112.40
information: 1107.56
use: 956.60
also: 889.25
list: 851.88
send: 828.13
get: 700.49
ftp: 694.05
system: 691.38
mail: 682.64
would: 670.07
please: 650.71
may: 648.31
data: 620.17
number: 612.80
file: 596.93
message: 567.17
files: 535.83
code: 534.98

Topic #4:
government: 625.77
new: 412.64
enc

In [5]:
# Refit LDA on document_term_matrix with the third topic count (topic_counts[2]).
# Rebinding `lda_model` discards the model fitted in the previous cell.
print(f"\nLatent Dirichlet Allocation (LDA) with {topic_counts[2]} topics")
lda_model = LatentDirichletAllocation(n_components=topic_counts[2], random_state=0)
lda_model.fit(document_term_matrix)
display_top_words(lda_model, feature_names, number_of_top_words)


Latent Dirichlet Allocation (LDA) with 50 topics
Topic #1:
ra: 116.56
slave: 114.01
master: 94.47
drive: 90.37
jumper: 57.30
votes: 41.57
vote: 41.47
led: 37.03
pin: 35.43
water: 30.74
dreams: 29.86
heads: 28.22
jumpers: 27.92
wright: 26.24
single: 25.36
conner: 25.18
swap: 25.04
tom: 24.15
type: 23.71
set: 21.39

Topic #2:
turkish: 238.42
jews: 238.27
muslims: 235.10
genocide: 230.45
armenian: 223.07
muslim: 192.74
armenians: 156.71
nazi: 154.47
nazis: 147.93
german: 129.84
government: 102.19
serdar: 97.36
million: 91.64
argic: 89.82
history: 89.08
book: 84.81
germany: 80.54
apr: 80.28
people: 79.34
war: 78.05

Topic #3:
key: 1007.41
message: 322.87
des: 290.49
keys: 265.17
one: 226.99
number: 208.95
chip: 199.95
algorithm: 199.63
used: 196.18
encryption: 186.03
use: 178.09
could: 178.01
would: 163.92
pgp: 160.22
block: 154.26
rsa: 151.57
two: 149.66
bits: 148.63
encrypted: 147.52
public: 139.53

Topic #4:
dos: 143.08
oil: 113.14
money: 93.86
cd: 82.75
russia: 80.70
official: 66.91
l

In [6]:
# Fit NMF with topic_counts[0] components on the same term-count matrix used for
# LDA (no TF-IDF weighting is applied in this cell), using NNDSVD initialization.
# NOTE(review): Topic #1 in the saved output (max, bhj, giz, ...) looks like
# encoded/binary residue in the corpus; presumably a min_df/max_df filter on the
# vectorizer would suppress it - TODO confirm.
print(f"\nNMF with K={topic_counts[0]} Topics")
nmf = NMF(n_components=topic_counts[0], random_state=0, init='nndsvd')
nmf.fit(document_term_matrix)
display_top_words(nmf, feature_names, number_of_top_words)


NMF with K=10 Topics
Topic #1:
max: 39.16
bhj: 3.12
giz: 3.00
gk: 1.51
bj: 1.31
wm: 1.17
qax: 1.05
kn: 0.92
ax: 0.89
nrhj: 0.83
ql: 0.78
lj: 0.76
uy: 0.69
biz: 0.69
mr: 0.66
qq: 0.62
ghj: 0.61
km: 0.58
nuy: 0.58
lg: 0.55

Topic #2:
available: 7.17
data: 5.82
system: 5.79
also: 5.60
use: 5.04
software: 4.97
image: 4.65
ftp: 4.36
version: 4.30
server: 4.03
graphics: 3.76
information: 3.62
get: 3.61
window: 3.28
files: 3.27
program: 2.91
set: 2.90
sun: 2.89
display: 2.81
windows: 2.74

Topic #3:
db: 22.38
mov: 5.58
cs: 3.50
bh: 2.97
byte: 2.45
al: 1.96
si: 1.96
di: 1.71
bl: 1.55
bits: 1.51
cx: 1.47
push: 1.14
pop: 1.06
one: 1.05
inc: 0.98
offset: 0.90
ptr: 0.77
loop: 0.69
assembled: 0.69
bx: 0.65

Topic #4:
people: 6.33
one: 6.30
said: 5.02
would: 4.92
us: 4.72
know: 4.15
could: 4.10
like: 3.08
went: 2.90
even: 2.62
go: 2.62
say: 2.59
time: 2.51
armenians: 2.45
see: 2.31
came: 2.30
something: 2.25
started: 2.21
going: 2.13
think: 2.08

Topic #5:
jpeg: 12.95
image: 7.30
gif: 5.21
file: 5.

In [7]:
# Refit NMF on document_term_matrix with topic_counts[1] components.
# Rebinding `nmf` discards the model fitted in the previous cell.
print(f"\nNMF with K={topic_counts[1]} Topics")
nmf = NMF(n_components=topic_counts[1], random_state=0, init='nndsvd')
nmf.fit(document_term_matrix)
display_top_words(nmf, feature_names, number_of_top_words)


NMF with K=20 Topics
Topic #1:
max: 39.16
bhj: 3.12
giz: 3.00
gk: 1.51
bj: 1.31
wm: 1.17
qax: 1.05
kn: 0.92
ax: 0.89
nrhj: 0.83
ql: 0.78
lj: 0.76
uy: 0.69
biz: 0.69
mr: 0.66
qq: 0.62
ghj: 0.61
km: 0.58
nuy: 0.58
lg: 0.55

Topic #2:
use: 7.80
window: 7.57
widget: 6.74
available: 6.46
also: 6.42
get: 6.15
subject: 5.80
openwindows: 5.34
server: 4.96
set: 4.43
sun: 4.43
application: 4.21
version: 4.16
motif: 4.10
file: 3.88
xt: 3.54
using: 3.46
look: 3.40
information: 3.36
may: 3.35

Topic #3:
db: 22.39
mov: 5.58
cs: 3.50
bh: 2.97
byte: 2.45
al: 1.96
si: 1.96
di: 1.71
bl: 1.55
bits: 1.51
cx: 1.47
push: 1.14
pop: 1.06
one: 1.05
inc: 0.98
offset: 0.90
ptr: 0.77
loop: 0.69
assembled: 0.69
bx: 0.65

Topic #4:
god: 9.27
one: 4.72
jesus: 4.60
would: 4.25
people: 3.96
many: 2.60
believe: 2.43
even: 2.19
atheists: 1.98
good: 1.92
say: 1.91
see: 1.90
may: 1.89
also: 1.85
bible: 1.84
must: 1.79
christian: 1.74
like: 1.73
matthew: 1.69
way: 1.68

Topic #5:
jpeg: 13.17
image: 6.77
file: 5.46
gif: 5.

In [8]:
# Refit NMF on document_term_matrix with topic_counts[2] components.
# Rebinding `nmf` discards the model fitted in the previous cell.
print(f"\nNMF with K={topic_counts[2]} Topics")
nmf = NMF(n_components=topic_counts[2], random_state=0, init='nndsvd')
nmf.fit(document_term_matrix)
display_top_words(nmf, feature_names, number_of_top_words)


NMF with K=50 Topics
Topic #1:
max: 39.48
bhj: 2.12
giz: 1.60
bj: 1.03
gk: 1.00
qax: 0.90
kn: 0.68
qq: 0.53
nrhj: 0.51
km: 0.48
lj: 0.45
uy: 0.43
biz: 0.41
ghj: 0.40
yd: 0.40
ax: 0.40
bhjn: 0.38
nuy: 0.33
tg: 0.31
wwiz: 0.28

Topic #2:
data: 19.83
available: 15.71
ftp: 13.31
contact: 8.06
also: 7.44
package: 6.90
graphics: 6.45
sgi: 6.23
image: 6.08
software: 5.96
research: 5.26
fax: 5.10
grass: 4.90
ibm: 4.78
systems: 4.76
format: 4.75
anonymous: 4.72
information: 4.71
sun: 4.48
visualization: 4.37

Topic #3:
db: 22.40
mov: 5.58
cs: 3.51
bh: 2.94
byte: 2.45
al: 1.96
si: 1.96
di: 1.71
bl: 1.55
bits: 1.51
cx: 1.47
push: 1.14
pop: 1.06
one: 1.04
inc: 0.98
offset: 0.90
ptr: 0.77
loop: 0.69
assembled: 0.69
bx: 0.65

Topic #4:
said: 7.50
one: 6.48
us: 6.37
know: 4.86
could: 4.69
went: 4.48
people: 4.31
came: 3.45
started: 3.40
apartment: 3.38
go: 3.38
armenians: 2.95
mamma: 2.94
something: 2.81
told: 2.74
going: 2.74
says: 2.65
saw: 2.64
say: 2.59
like: 2.53

Topic #5:
jpeg: 12.70
image: 6

In [10]:
# Directory holding the DUC 2001 document files; a relative path, so it is
# resolved against the notebook's current working directory.
documents_directory = 'DUC2001/'

def extract_text_from_document(file_path):
    """
    Return the concatenated contents of every <text> element in a markup file.

    The file is read as UTF-8 and parsed with BeautifulSoup's 'html.parser'.
    The text of each <text> element is collected in document order and the
    pieces are joined with a single space.

    Parameters:
        file_path (str): Path of the file to read.

    Returns:
        str: Text of all <text> elements; an empty string when there are none.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        soup = BeautifulSoup(handle.read(), 'html.parser')

    pieces = []
    for node in soup.find_all('text'):
        pieces.append(node.get_text())

    return ' '.join(pieces)

# List to hold the extracted text from each document
document_texts = []

# Iterate over the directory in sorted order. os.listdir() returns entries in
# arbitrary, platform-dependent order; sorting pins which file becomes which row
# of the term-frequency matrix, so the topic models fitted on it are reproducible.
for file_name in sorted(os.listdir(documents_directory)):
    file_path = os.path.join(documents_directory, file_name)
    # Check if the path is a file (skip directories)
    if os.path.isfile(file_path):
        # Extract the <text> content using the helper function
        text = extract_text_from_document(file_path)
        document_texts.append(text)

# Preprocess each document with preprocess_document (defined in the data-loading cell).
# NOTE: this rebinds `preprocessed_documents`, replacing the 20 Newsgroups version.
preprocessed_documents = [preprocess_document(text) for text in document_texts]

# Initialize a CountVectorizer to convert text documents into a term frequency matrix
count_vectorizer = CountVectorizer()

# Fit the CountVectorizer to the preprocessed documents and transform them into a matrix
term_frequency_matrix = count_vectorizer.fit_transform(preprocessed_documents)

In [13]:
# Same display size and topic counts as in the 20 Newsgroups LDA cell, repeated here.
number_of_top_words = 20
topic_counts = [10, 20, 50]

# Fit LDA with topic_counts[0] topics on the DUC 2001 term-frequency matrix and
# print the top words using the DUC vectorizer's vocabulary.
print(f"\nLDA with K={topic_counts[0]} Topics for DUC 2001")
lda_duc = LatentDirichletAllocation(n_components=topic_counts[0], random_state=0)
lda_duc.fit(term_frequency_matrix)
display_top_words(lda_duc, count_vectorizer.get_feature_names_out(), number_of_top_words)


LDA with K=10 Topics for DUC 2001
Topic #1:
said: 315.58
mr: 170.05
slovenia: 162.10
bank: 145.43
shining: 122.98
world: 114.08
path: 112.81
president: 109.74
new: 99.28
would: 94.30
year: 92.50
government: 92.23
last: 91.88
also: 89.24
nafta: 89.10
people: 88.13
yugoslavia: 81.96
says: 80.70
one: 74.63
countries: 72.40

Topic #2:
us: 27.18
french: 24.19
drought: 23.87
caribbean: 21.08
said: 20.19
france: 19.52
year: 17.74
farmers: 17.58
farm: 16.30
would: 13.75
prices: 13.71
could: 13.23
region: 12.95
house: 11.95
says: 11.91
lebanon: 11.10
land: 10.84
president: 10.60
basin: 9.10
exports: 9.10

Topic #3:
fire: 314.40
said: 308.93
eclipse: 146.10
forest: 137.93
fires: 126.69
national: 105.00
firefighters: 95.67
acres: 87.96
people: 80.36
area: 75.53
service: 71.27
one: 70.56
officials: 70.45
sun: 68.56
tornado: 68.10
jackson: 67.93
park: 61.18
california: 59.26
police: 56.36
department: 53.91

Topic #4:
said: 280.20
police: 198.91
tunnel: 93.73
time: 79.07
taylor: 77.10
right: 76.66


In [14]:
# Refit LDA on the DUC 2001 matrix with topic_counts[1] topics.
# Rebinding `lda_duc` discards the model fitted in the previous cell.
print(f"\nLDA with K={topic_counts[1]} Topics for DUC 2001")
lda_duc = LatentDirichletAllocation(n_components=topic_counts[1], random_state=0)
lda_duc.fit(term_frequency_matrix)
display_top_words(lda_duc, count_vectorizer.get_feature_names_out(), number_of_top_words)


LDA with K=20 Topics for DUC 2001
Topic #1:
said: 260.31
johnson: 167.85
bank: 136.38
world: 114.54
mr: 113.20
would: 77.85
new: 71.20
says: 70.82
countries: 70.59
president: 65.60
government: 64.59
year: 63.61
last: 61.80
people: 56.44
lewis: 54.94
also: 54.67
debt: 53.91
ben: 47.54
two: 46.11
disease: 45.98

Topic #2:
us: 23.56
caribbean: 20.95
french: 16.82
said: 14.53
region: 11.85
lebanon: 11.05
terrorist: 9.72
basin: 9.05
france: 8.97
johnson: 8.16
washington: 7.93
attack: 7.78
mexico: 7.09
uta: 7.05
parity: 7.05
bomb: 7.05
nafta: 6.92
explosion: 6.80
canada: 6.64
wednesday: 6.41

Topic #3:
said: 86.98
eclipse: 41.74
fire: 40.30
jackson: 33.24
people: 33.21
police: 31.20
area: 25.19
one: 20.58
disease: 19.47
cjd: 19.05
tuesday: 18.23
national: 18.09
long: 17.98
saturday: 17.72
acres: 17.62
says: 17.44
spokesman: 17.10
beach: 17.07
coca: 17.05
officials: 17.04

Topic #4:
said: 102.29
nra: 92.05
gun: 87.79
right: 70.82
police: 62.53
would: 53.95
arms: 49.87
amendment: 48.24
second

In [15]:
# Refit LDA on the DUC 2001 matrix with topic_counts[2] topics.
# Rebinding `lda_duc` discards the model fitted in the previous cell.
print(f"\nLDA with K={topic_counts[2]} Topics for DUC 2001")
lda_duc = LatentDirichletAllocation(n_components=topic_counts[2], random_state=0)
lda_duc.fit(term_frequency_matrix)
display_top_words(lda_duc, count_vectorizer.get_feature_names_out(), number_of_top_words)


LDA with K=50 Topics for DUC 2001
Topic #1:
said: 98.33
president: 40.32
military: 34.70
government: 33.81
two: 27.29
mr: 25.99
people: 24.23
baker: 22.56
party: 22.50
federal: 21.74
house: 21.46
pizarro: 21.02
police: 20.05
may: 20.00
forces: 19.56
officials: 18.72
morote: 18.02
presidential: 17.45
political: 17.42
country: 17.37

Topic #2:
french: 20.00
france: 11.98
lebanon: 11.02
terrorist: 7.98
attack: 7.96
uta: 7.02
washington: 6.65
officials: 5.80
wednesday: 5.57
hoffman: 5.02
intelligence: 5.01
explosion: 4.91
plane: 4.53
forces: 4.48
airport: 4.40
said: 4.23
shiite: 4.02
chad: 4.02
bomb: 4.02
obeid: 4.02

Topic #3:
said: 58.87
eclipse: 47.03
fire: 23.07
shining: 21.20
people: 20.70
sun: 20.53
guerrillas: 19.54
path: 18.69
police: 18.32
coca: 17.02
area: 16.99
army: 15.99
percent: 15.28
officials: 14.26
valley: 14.00
huallaga: 13.02
upper: 12.83
rebels: 12.63
forest: 12.59
two: 12.24

Topic #4:
gun: 25.72
yellowstone: 22.31
park: 22.29
police: 20.10
fires: 16.14
bill: 15.50
al

# PROBLEM 2: Extractive Summarization

In [18]:
import os
import numpy as np
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize

# Directory containing the DUC 2001 document files (relative to the working directory).
# Holds the same path string as `documents_directory` from the topic-modeling section.
DOCUMENT_DIRECTORY = 'DUC2001/'

def extract_sentences_from_file(file_path):
    """
    Read a markup file and return the lowercased sentences found in its <text> elements.

    The file is read as UTF-8 and parsed with BeautifulSoup's 'html.parser'. The
    text of all <text> elements is joined with single spaces and lowercased, and
    the result is split with NLTK's ``sent_tokenize``.

    Parameters:
        file_path (str): The path to the file.

    Returns:
        list: Sentences (str) in document order; empty when no text was found.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()

    text_nodes = BeautifulSoup(markup, 'html.parser').find_all('text')

    # NOTE(review): lowercasing happens *before* sentence splitting; presumably
    # this removes capitalization cues the tokenizer could use - confirm.
    lowered_text = ' '.join([node.get_text() for node in text_nodes]).lower()

    return sent_tokenize(lowered_text)

def calculate_word_frequencies(sentences):
    """
    Build a unigram distribution over the tokens of the given sentences.

    The sentences are joined with spaces and split with NLTK's ``word_tokenize``.
    Every token it returns is counted (nothing is filtered out), and each count
    is divided by the total number of tokens.

    Parameters:
        sentences (list): Sentences (str) to analyse.

    Returns:
        dict: token -> relative frequency; an empty dict when there are no tokens.
    """
    tokens = word_tokenize(' '.join(sentences))

    # Tally how often each distinct token occurs
    tallies = {}
    for token in tokens:
        if token in tallies:
            tallies[token] += 1
        else:
            tallies[token] = 1

    # Every token was tallied exactly once, so the tallies sum to len(tokens)
    token_total = len(tokens)

    frequencies = {}
    for token, tally in tallies.items():
        frequencies[token] = tally / token_total
    return frequencies

def calculate_kl_divergence(summary_freq, document_freq):
    """
    Compute sum_w S(w) * log(S(w) / D(w)) over the words shared by both distributions.

    S is the summary distribution and D the document distribution. Words that
    occur in the summary but are missing from ``document_freq`` are skipped, so
    the result is a true KL divergence only when every summary word is a key of
    ``document_freq``; otherwise it is a partial sum and may be negative.

    Parameters:
        summary_freq (dict): word -> probability for the summary.
        document_freq (dict): word -> probability for the full document.

    Returns:
        float: The divergence score. ``float('inf')`` when a summary word with
        positive probability has zero probability in ``document_freq``.
    """
    divergence = 0.0
    for word, freq in summary_freq.items():
        # Convention 0 * log(0) = 0: a zero-probability summary word adds nothing
        # (evaluating it directly would yield NaN).
        if freq == 0 or word not in document_freq:
            continue
        document_prob = document_freq[word]
        if document_prob == 0:
            # S(w) > 0 with D(w) = 0 makes the divergence infinite; return that
            # instead of raising ZeroDivisionError on the ratio below.
            return float('inf')
        divergence += freq * np.log(freq / document_prob)
    return divergence

def choose_best_sentence(sentences, document_freq, current_summary_sentences):
    """
    Pick the unused sentence whose addition gives the summary the lowest divergence.

    Each sentence not already in the summary is tentatively appended. The word
    frequencies of that trial summary are computed with
    ``calculate_word_frequencies`` and scored against ``document_freq`` with
    ``calculate_kl_divergence``. The sentence with the lowest score wins; on a
    tie the earliest sentence is kept.

    Parameters:
        sentences (list): All sentences of the document.
        document_freq (dict): Word distribution of the entire document.
        current_summary_sentences (list): Sentences already selected for the summary.

    Returns:
        str or None: The selected sentence. None when every sentence is already
        in the summary, or when no candidate scores below +infinity.
    """
    lowest_divergence = float('inf')
    winner = None

    for candidate in sentences:
        if candidate not in current_summary_sentences:
            # Score the summary as it would look with this candidate appended
            trial_freq = calculate_word_frequencies(current_summary_sentences + [candidate])
            trial_divergence = calculate_kl_divergence(trial_freq, document_freq)

            # Strict '<' keeps the first sentence seen among equally good ones
            if trial_divergence < lowest_divergence:
                lowest_divergence, winner = trial_divergence, candidate

    return winner

def generate_kl_summary(document_sentences, num_summary_sentences):
    """
    Build an extractive summary by greedy KL-divergence minimisation.

    The document's own word frequencies serve as the reference distribution.
    Sentences are added one at a time, each chosen by ``choose_best_sentence``,
    until the requested number is reached or no sentence can be selected.

    Parameters:
        document_sentences (list): All sentences of the document.
        num_summary_sentences (int): Maximum number of sentences in the summary.

    Returns:
        str: The selected sentences, in selection order, joined by single spaces.
    """
    reference_freq = calculate_word_frequencies(document_sentences)
    chosen = []

    while len(chosen) < num_summary_sentences:
        next_sentence = choose_best_sentence(document_sentences, reference_freq, chosen)
        if not next_sentence:
            # Nothing selectable is left (e.g. every sentence is already used)
            break
        chosen.append(next_sentence)

    return ' '.join(chosen)


In [19]:
# Process all documents in the directory: read files and extract sentences from each.
# The result maps file name -> list of sentences.
documents = {}

# os.listdir() returns entries in arbitrary order; sorting fixes the insertion
# order of `documents`, which later cells rely on to line document keys up with
# rows of the document-term matrix, and makes the fitted models reproducible.
for filename in sorted(os.listdir(DOCUMENT_DIRECTORY)):
    file_path = os.path.join(DOCUMENT_DIRECTORY, filename)
    # Skip sub-directories; only regular files are parsed
    if os.path.isfile(file_path):
        documents[filename] = extract_sentences_from_file(file_path)

In [20]:
# Keys (file names in DOCUMENT_DIRECTORY) of the documents to summarize.
# A key that is missing from `documents` raises KeyError in the loop below.
test_document_keys = ["AP880816-0234", "AP890714-0129", "AP900424-0035"]

# Maps each test document key to its KL-based summary string
kl_summary_output = {}

# Generate a 5-sentence KL summary for each test document, store it, and print it.
for doc_key in test_document_keys:
    # documents[doc_key] is the sentence list produced by extract_sentences_from_file
    summary = generate_kl_summary(documents[doc_key], 5)
    kl_summary_output[doc_key] = summary
    print(f"Summary of {doc_key}:\n{summary}\n")


Summary of AP880816-0234:
the rodrigo franco command, which has vowed to kill a shining
path member or sympathizer for every person slain by guerrillas,
issued the threat against district attorney carlos escobar on
monday, according to his office in andean city of ayacucho. officials said the rebel raids occurred sunday, at a police post
and telephone relay station near the jungle city of pucallpa, 325
miles northeast of lima. escobar is investigating charges that troops rounded up dozens
of peasants, accused them of being shining path members and killed
them. the
government says more than 15,000 people have been killed and puts
the property damage at $10 billion. 
   a death squad opposed to the shining path
guerrillas has threatened to kill a district attorney if he
investigates charges that soldiers massacred dozens of peasants,
his office said tuesday.

Summary of AP890714-0129:
firefighters on thursday said they had contained the 2,000-acre
livermore fire west of fort collins, col

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Rebuild one string per document by joining its sentences. The order follows
# `documents.values()`, i.e. the insertion order of the `documents` dict.
full_documents = [' '.join(sentences) for sentences in documents.values()]

# Convert the documents into a term-frequency matrix, removing scikit-learn's
# built-in English stop words.
# NOTE: `count_vectorizer` and `document_term_matrix` are rebound here; earlier
# cells used the same names for other corpora/vectorizers.
count_vectorizer = CountVectorizer(stop_words='english')
document_term_matrix = count_vectorizer.fit_transform(full_documents)

# Train an LDA model with 10 topics. The random state is set for reproducibility.
# NOTE: `lda_model` previously referred to the 20 Newsgroups model.
num_topics = 10
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)
lda_model.fit(document_term_matrix)

# Vocabulary of this vectorizer (column index -> word); also rebinds `feature_names`.
feature_names = count_vectorizer.get_feature_names_out()

# Compute the topic distribution for each document.
# Each row represents a document and each column corresponds to a topic.
document_topic_distribution = lda_model.transform(document_term_matrix)

# Normalize each topic's row of `components_` so that it sums to 1.
# `components_` holds the fitted topic-word weights (pseudo-counts, not raw word
# counts); dividing by the row sum turns each topic into a word distribution.
topic_word_distribution = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]

## Generating LDA-based Word Distributions

In [None]:
def generate_lda_word_distribution(doc_topic_dist, topic_word_dist, vocabulary):
    """
    Mix the topic-word distributions according to a document's topic weights.

    The per-word probability is the dot product of the document's topic vector
    with the topic-word matrix, i.e. sum over topics of
    P(topic | document) * P(word | topic).

    Args:
        doc_topic_dist (array-like): 1D topic probabilities for the document.
        topic_word_dist (2D array): One row per topic, holding that topic's word distribution.
        vocabulary (array-like): Words corresponding to the columns of ``topic_word_dist``.

    Returns:
        dict: word -> probability, in vocabulary order.
    """
    mixed_probabilities = np.dot(doc_topic_dist, topic_word_dist)

    # Pair each vocabulary entry with the probability at the same column index
    return dict(zip(vocabulary, mixed_probabilities))


def generate_lda_summary(doc_index, num_summary_sentences, doc_topic_distributions, topic_word_distributions,
                         vocabulary, all_document_sentences):
    """
    Build an extractive summary whose word distribution tracks the document's LDA-derived one.

    The target distribution comes from ``generate_lda_word_distribution`` for row
    ``doc_index`` of ``doc_topic_distributions``. Sentences are then added greedily
    with ``choose_best_sentence`` until the requested length is reached or no
    sentence can be selected.

    Note: candidate summaries are scored on the tokens produced by
    ``calculate_word_frequencies``. Tokens that are not keys of the LDA
    vocabulary are skipped by ``calculate_kl_divergence``.

    Args:
        doc_index (int): Position of the target document.
        num_summary_sentences (int): Maximum number of sentences in the summary.
        doc_topic_distributions (2D array): One row of topic probabilities per document.
        topic_word_distributions (2D array): One normalized word distribution per topic.
        vocabulary (array-like): Words matching the columns of ``topic_word_distributions``.
        all_document_sentences (dict or list): Sentence lists per document. For a dict,
            the ``doc_index``-th value (in insertion order) is used.

    Returns:
        str: The selected sentences, in selection order, joined by single spaces.
    """
    # Word distribution implied by this document's topic mixture
    target_distribution = generate_lda_word_distribution(
        doc_topic_distributions[doc_index],
        topic_word_distributions,
        vocabulary,
    )

    # Locate the candidate sentences of the same document
    if isinstance(all_document_sentences, dict):
        sentence_pool = list(all_document_sentences.values())[doc_index]
    else:
        sentence_pool = all_document_sentences[doc_index]

    picked = []
    while len(picked) < num_summary_sentences:
        candidate = choose_best_sentence(sentence_pool, target_distribution, picked)
        if not candidate:
            # No further sentence can be selected
            break
        picked.append(candidate)

    return ' '.join(picked)

## Listing All Processed Document Keys and Generating LDA-based Summaries

In [24]:
# Document keys in the insertion order of `documents`. The position of a key in
# this list is used as the row index into document_topic_distribution, which is
# valid because the matrix rows were built from `documents.values()` in that order.
document_keys = list(documents.keys())

# Dictionary to store the generated LDA-based summaries.
lda_summaries = {}

# List of test document keys for which the LDA summary will be generated.
test_document_keys = ["AP880816-0234", "AP890714-0129", "AP900424-0035"]

# Loop through each test document key to generate and display its summary.
for doc_key in test_document_keys:
    # Find the index of the document using its key (ValueError if the key is absent).
    doc_index = document_keys.index(doc_key)
    
    # Generate an LDA-based summary consisting of 5 sentences.
    # Arguments passed to generate_lda_summary:
    #   - doc_index: the index of the target document,
    #   - num_summary_sentences: target number of summary sentences (5),
    #   - doc_topic_distributions: the document-topic matrix (document_topic_distribution),
    #   - topic_word_distributions: the normalized topic-word matrix (topic_word_distribution),
    #   - vocabulary: feature names from the vectorizer (feature_names), and
    #   - all_document_sentences: the `documents` dict (file name -> list of sentences).
    summary = generate_lda_summary(doc_index, 5, 
                                   document_topic_distribution, 
                                   topic_word_distribution, 
                                   feature_names, 
                                   documents)
    
    # Save the summary in the dictionary with the document key.
    lda_summaries[doc_key] = summary
    
    # Print the summary for the document.
    print(f"Summary of {doc_key}:\n{summary}\n")


Summary of AP880816-0234:
he is suspected of being the
shining path second-in-command and is in jail on terrorism charges. the
government says more than 15,000 people have been killed and puts
the property damage at $10 billion. it became known in july when it claimed responsibility for
killing the lawyer for osman morote. the rodrigo franco group is named for an official of the
government party killed the shining path killed last year. police said members of shining path, a maoist group, killed two
policemen and wounded three in jungle raids.

Summary of AP890714-0129:
``for now, anyway, we are
getting some relief.'' ``my experience is that we'd see green grass again if we get
enough moisture, say a half-inch of rain,'' he said. the fire was contained monday and should
be controlled within four days, ms. garcia said. the storms
dampened a fire in bridger-teton national forest that had burned
nearly 3,500 acres. seven 20-person crews fighting the fire were to be reduced to
three crews,

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Build one full-text string per document by joining its sentences with spaces.
# `documents` is used here as a mapping (its .values() are iterated and each value is joined),
# so each value is expected to be a list of sentence strings. Row i of the resulting matrix
# corresponds to the i-th value of `documents` in iteration order.
full_documents = [' '.join(sentences) for sentences in documents.values()]

# Instantiate a CountVectorizer to transform the documents into a document-term frequency matrix,
# while also removing common English stop words.
vectorizer = CountVectorizer(stop_words='english')
document_term_matrix = vectorizer.fit_transform(full_documents)

# Set the desired number of topics.
num_topics = 10

# Initialize the LDA model with the specified number of topics and a fixed random state for reproducibility.
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)

# Fit the LDA model on the document-term matrix.
lda_model.fit(document_term_matrix)

# Retrieve the feature (word) names from the vectorizer; position j names column j of the matrix.
feature_names = vectorizer.get_feature_names_out()


In [27]:
# Per-document topic mixture: one row per document of `document_term_matrix`,
# one column per topic of the fitted model.
document_topic_distribution = lda_model.transform(document_term_matrix)

# Per-topic word distribution. `lda_model.components_` holds unnormalized
# topic-word weights (one row per topic), so each row is divided by its own
# total to make it sum to 1. `keepdims=True` keeps the row totals as a column
# vector so the division broadcasts across every word of the topic.
topic_word_weights = lda_model.components_
topic_word_distribution = topic_word_weights / topic_word_weights.sum(axis=1, keepdims=True)


In [30]:
# Snapshot the keys of `documents` as a list so a document key can be turned into a
# positional index (the same iteration order is used when a matrix is built from documents.values()).
document_keys = list(documents.keys())

# Dictionary to store the generated LDA-based summaries, keyed by document key.
lda_summaries = {}

# List of test document keys for which to generate summaries.
test_document_keys = ["AP880816-0234", "AP890714-0129", "AP900424-0035"]

# Loop over each document key, generate the summary, and display it.
for doc_key in test_document_keys:
    # Retrieve the positional index of the document by key.
    # list.index raises ValueError if the key is not present in `documents`.
    doc_index = document_keys.index(doc_key)
    
    # Generate an LDA-based summary; the literal 5 is the requested number of sentences.
    # generate_lda_summary is defined outside this cell and is called positionally with:
    #   (doc_index, 5, document_topic_distribution, topic_word_distribution,
    #    feature_names, documents)
    # NOTE(review): parameter meanings are inferred from the argument names - confirm against its definition.
    summary = generate_lda_summary(doc_index, 5, 
                                   document_topic_distribution, 
                                   topic_word_distribution, 
                                   feature_names, 
                                   documents)
    
    # Store the summary in the dictionary with the document key.
    lda_summaries[doc_key] = summary
    
    # Print the summary for the document.
    print(f"Summary of {doc_key}:\n{summary}\n")


Summary of AP880816-0234:
he is suspected of being the
shining path second-in-command and is in jail on terrorism charges. the
government says more than 15,000 people have been killed and puts
the property damage at $10 billion. it became known in july when it claimed responsibility for
killing the lawyer for osman morote. the rodrigo franco group is named for an official of the
government party killed the shining path killed last year. police said members of shining path, a maoist group, killed two
policemen and wounded three in jungle raids.

Summary of AP890714-0129:
``for now, anyway, we are
getting some relief.'' ``my experience is that we'd see green grass again if we get
enough moisture, say a half-inch of rain,'' he said. the fire was contained monday and should
be controlled within four days, ms. garcia said. the storms
dampened a fire in bridger-teton national forest that had burned
nearly 3,500 acres. seven 20-person crews fighting the fire were to be reduced to
three crews,

In [31]:
import os
import re

# Directory that contains the gold summary files.
# The path is relative, so it is resolved against the notebook's current working directory.
summaries_dir = r'DUC2001/Summaries'

# Extracted gold summaries. The loading loop in this cell fills it with
# upper-cased file stem -> lower-cased abstract text.
gold_summaries = {}

def extract_abstract(content):
    """
    Pull the abstract out of the text of a summary file.

    The abstract is whatever lies between the first 'Abstract:' marker and the
    nearest following 'Introduction:' marker (newlines included). Surrounding
    whitespace is stripped and the text is lower-cased.

    Parameters:
        content (str): Full text of one file.

    Returns:
        str or None: The lower-cased abstract, or None when the two markers
        do not both occur in that order.
    """
    # Non-greedy capture so the abstract stops at the first 'Introduction:';
    # DOTALL lets the abstract span several lines.
    found = re.search(r'Abstract:(.*?)Introduction:', content, re.DOTALL)
    if found is None:
        return None
    return found.group(1).strip().lower()

# Walk every entry of the summaries directory and harvest its abstract.
for filename in os.listdir(summaries_dir):
    file_path = os.path.join(summaries_dir, filename)

    # Only regular files carry summary text; skip subdirectories and the like.
    if not os.path.isfile(file_path):
        continue

    # Read the whole file as UTF-8 text.
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # A file without an abstract (None) or with an empty one contributes nothing.
    abstract_text = extract_abstract(content)
    if not abstract_text:
        continue

    # The document identifier is the file name without its extension, upper-cased.
    doc_id = os.path.splitext(filename)[0].upper()
    gold_summaries[doc_id] = abstract_text


In [33]:
from rouge_score import rouge_scorer

def calculate_rouge_scores(system_summaries, gold_summaries):
    """
    Score system summaries against their gold references with ROUGE.

    Parameters:
        system_summaries (dict): Document ID -> system-generated summary text.
        gold_summaries (dict): Document ID -> reference (gold) summary text.

    Returns:
        list: (document ID, score dict) tuples in the iteration order of
              `system_summaries`. Each score dict is what the scorer returns for
              the requested metrics 'rouge1', 'rouge2' and 'rougeL'. Documents
              without a gold summary are reported on stdout and left out.
    """
    # One scorer for all documents; stemming is enabled for token matching.
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    results = []

    for doc_id, candidate in system_summaries.items():
        # No reference to compare with: say so and move on.
        if doc_id not in gold_summaries:
            print(f"Gold summary not found for document {doc_id}. Skipping evaluation.")
            continue

        # The gold summary is passed first (target), the system summary second (prediction).
        reference = gold_summaries[doc_id]
        results.append((doc_id, rouge.score(reference, candidate)))

    return results

# Calculate ROUGE scores for summaries generated by two different methods.
# `kl_summary_output` is defined outside this cell (presumably the KL-based summaries
# keyed by document ID - verify); `lda_summaries` holds the LDA-based summaries.
rouge_scores_kla = calculate_rouge_scores(kl_summary_output, gold_summaries)
rouge_scores_lda = calculate_rouge_scores(lda_summaries, gold_summaries)


In [34]:
def print_rouge_scores(scores, summary_type):
    """
    Print a column-aligned table of ROUGE scores, one row per document.

    Column widths are derived from the content (longest document ID, length of
    each column header) instead of being hard-coded, so values always start in
    the same column as their header even for long document IDs.

    Parameters:
        scores (iterable): (document ID, score dict) pairs. The score dict must
                           have the keys 'rouge1', 'rouge2' and 'rougeL', each
                           mapping to an object with numeric `fmeasure`,
                           `precision` and `recall` attributes.
        summary_type (str): Label of the summaries being evaluated (e.g. "KLA", "LDA").

    Returns:
        None. The table is written to stdout.
    """
    # (score-dict key, header prefix), in left-to-right order within each group.
    metrics = (("rouge1", "ROUGE-1"), ("rouge2", "ROUGE-2"), ("rougeL", "ROUGE-L"))
    # (attribute name, header suffix); the table shows all F1 columns first,
    # then all precision columns, then all recall columns.
    statistics = (("fmeasure", "F1"), ("precision", "Precision"), ("recall", "Recall"))
    # Lower bound that keeps short columns readable; longer content widens the column.
    min_width = 12

    # Materialize once: rows are needed for the width pass and the print pass,
    # and `scores` may be a one-shot iterator.
    rows = [(str(doc_id), score) for doc_id, score in scores]

    columns = [
        (metric_key, attribute, f"{metric_label} {stat_label}")
        for attribute, stat_label in statistics
        for metric_key, metric_label in metrics
    ]

    id_header = "Document ID"
    id_width = max([min_width, len(id_header)] + [len(doc_id) for doc_id, _ in rows])
    widths = [max(min_width, len(label)) for _, _, label in columns]

    # Title line naming the summary type.
    print(f"ROUGE scores for {summary_type} summaries:")

    # Header row.
    header_cells = [f"{id_header:<{id_width}}"]
    header_cells += [f"{label:<{width}}" for (_, _, label), width in zip(columns, widths)]
    print(" ".join(header_cells))

    # One row per document, values shown with four decimals.
    for doc_id, score in rows:
        cells = [f"{doc_id:<{id_width}}"]
        cells += [
            f"{getattr(score[metric_key], attribute):<{width}.4f}"
            for (metric_key, attribute, _), width in zip(columns, widths)
        ]
        print(" ".join(cells))


# Print one ROUGE table per summarization method; the second argument is only the
# label shown in the table's title line.
print_rouge_scores(rouge_scores_kla, "KLA")
print_rouge_scores(rouge_scores_lda, "LDA")


ROUGE scores for KLA summaries:
Document ID  ROUGE-1 F1   ROUGE-2 F1   ROUGE-L F1   ROUGE-1 Precision ROUGE-2 Precision ROUGE-L Precision ROUGE-1 Recall ROUGE-2 Recall ROUGE-L Recall
AP880816-0234 0.6017       0.2991       0.3729       0.5221            0.2593            0.3235            0.7100        0.3535        0.4400       
AP890714-0129 0.5469       0.3128       0.3429       0.4589            0.2621            0.2877            0.6768        0.3878        0.4242       
AP900424-0035 0.4255       0.1459       0.1532       0.3846            0.1318            0.1385            0.4762        0.1635        0.1714       
ROUGE scores for LDA summaries:
Document ID  ROUGE-1 F1   ROUGE-2 F1   ROUGE-L F1   ROUGE-1 Precision ROUGE-2 Precision ROUGE-L Precision ROUGE-1 Recall ROUGE-2 Recall ROUGE-L Recall
AP880816-0234 0.5340       0.1905       0.2199       0.5604            0.2000            0.2308            0.5100        0.1818        0.2100       
AP890714-0129 0.2584       0.0114     

In [35]:
from sklearn.datasets import fetch_20newsgroups

# Load the complete 20 newsgroups dataset while removing headers, footers, and quotes.
# This ensures that only the main content of each newsgroup post is retained.
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Extract the list of document texts from the fetched data.
# NOTE(review): this rebinds `documents`. Earlier cells call documents.values() and
# documents.keys(), i.e. they treat it as a mapping; after this line it is whatever
# `newsgroups.data` is (a sequence of raw texts), so those earlier cells cannot be
# re-run without first restoring the original object. A distinct name would avoid this.
documents = newsgroups.data

# Print the total number of documents in the dataset.
print("Number of documents:", len(documents))


Number of documents: 18846


In [36]:
from nltk.tokenize import sent_tokenize

def preprocess_documents(documents):
    """
    Lower-case every document and split it into sentences.

    Parameters:
        documents (list of str): Raw document texts.

    Returns:
        dict: Maps each document's position in `documents` (0-based int) to the
              list of sentences produced by `sent_tokenize` on the lower-cased text.
    """
    # Lower-casing happens before tokenization, so the tokenizer sees lower-case text.
    return {
        position: sent_tokenize(text.lower())
        for position, text in enumerate(documents)
    }

# Preprocess the 20 Newsgroups documents: the result maps each document's position
# in `documents` to its list of lower-cased sentences.
docs_sentences_20ng = preprocess_documents(documents)

# Print the total number of preprocessed documents (one entry per input document).
print("Number of preprocessed documents:", len(docs_sentences_20ng))


Number of preprocessed documents: 18846


In [37]:
# Iterate over the first five preprocessed documents (represented as lists of sentences).
for doc_index in range(5):
    # Retrieve the preprocessed sentences for the current document.
    document_sentences = docs_sentences_20ng[doc_index]
    
    # Request a 5-sentence summary from generate_kl_summary (defined outside this cell).
    # NOTE(review): presumably it selects sentences by KL divergence - confirm against its definition.
    summary = generate_kl_summary(document_sentences, 5)
    
    # Print the summary along with a header indicating the document index.
    print(f"Summary for Document {doc_index}:\n{summary}\n")


Summary for Document 0:
bowman should let jagr have a lot of
fun in the next couple of games since the pens are going to beat the pulp out of jersey anyway. however, i am going to put an end
to non-pittsburghers' relief with a bit of praise for the pens. 

i am sure some bashers of pens fans are pretty confused about the lack
of any kind of posts about the recent pens massacre of the devils. jagr just showed you why
he is much better than his regular season stats. i was very disappointed not to see the islanders lose the final
regular season game.

Summary for Document 1:
does anyone have suggestions/ideas on:

  - diamond stealth pro local bus

  - orchid farenheit 1280

  - ati graphics ultra pro

  - any other high-performance vlb card


please post or email. my brother is in the market for a high-performance video card that supports
vesa local bus with 1-2mb ram. thank you! - matt

Summary for Document 2:
shall the azeri women and children going to pay the price with
						    ****

In [38]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Convert each document (list of sentences) into a single full-text string.
# docs_sentences_20ng maps document indices to lists of sentences; row i of the
# matrix built below corresponds to the i-th value in its iteration order.
documents_full_text = [' '.join(sentences) for sentences in docs_sentences_20ng.values()]

# Instantiate a CountVectorizer to convert documents into a document-term matrix, 
# automatically removing common English stop words.
# NOTE(review): `vectorizer`, `document_term_matrix`, `num_topics` and `lda_model` are
# also assigned by an earlier cell; from here on they refer to the 20 Newsgroups objects,
# so any earlier cell that reads them must not be re-run out of order.
vectorizer = CountVectorizer(stop_words='english')
document_term_matrix = vectorizer.fit_transform(documents_full_text)

# Initialize the LDA model with 10 topics and a fixed random state for reproducibility.
num_topics = 10
lda_model = LatentDirichletAllocation(n_components=num_topics, random_state=0)
lda_model.fit(document_term_matrix)

# Obtain the topic distribution for each document (one row per document).
doc_topic_distributions_20ng = lda_model.transform(document_term_matrix)

# Compute the normalized topic-word distribution matrix.
# Each row corresponds to a topic and is divided by its own total, so it sums to 1.
topic_word_distributions_20ng = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]

# Retrieve the vocabulary (feature names) from the vectorizer.
vocabulary = vectorizer.get_feature_names_out()

# Generate and print LDA-based summaries for the first 5 documents.
for i in range(5):
    # Request a 5-sentence summary of the i-th document.
    # generate_lda_summary is defined outside this cell.
    # NOTE(review): presumably it ranks sentences using the LDA distributions passed in - confirm.
    summary = generate_lda_summary(i, 5,
                                   doc_topic_distributions_20ng,
                                   topic_word_distributions_20ng,
                                   vocabulary,
                                   docs_sentences_20ng)
    
    # Print a header with the document index and the summary.
    print(f"Summary for Document {i}:\n{summary}\n")


Summary for Document 0:
! however, i am going to put an end
to non-pittsburghers' relief with a bit of praise for the pens. jagr just showed you why
he is much better than his regular season stats. man, they
are killing those devils worse than i thought. he is also a lot
fo fun to watch in the playoffs.

Summary for Document 1:
my brother is in the market for a high-performance video card that supports
vesa local bus with 1-2mb ram. thank you! does anyone have suggestions/ideas on:

  - diamond stealth pro local bus

  - orchid farenheit 1280

  - ati graphics ultra pro

  - any other high-performance vlb card


please post or email. - matt

Summary for Document 2:
****************
						    ******************
			    ***************


	nothing of the mentioned is true, but let say it's true. if there is one that's confused then that's you! you don't know what you are talking about. shoot down with what? mediterranean????

Summary for Document 3:
it's the scsi card doing the dma transfe

In [39]:
import numpy as np

def compute_topic_frequencies(topic_distribution: np.ndarray) -> np.ndarray:
    """
    Rescale a vector of topic weights so that its entries sum to 1.

    Parameters:
        topic_distribution (np.ndarray): Unnormalized topic weights.

    Returns:
        np.ndarray: The weights divided by their total. When the total is zero
        the result is an all-zero array with the input's shape and dtype, which
        avoids a division by zero.
    """
    weight_sum = np.sum(topic_distribution)
    # Guard against 0/0: a vector with no mass stays all zeros.
    if weight_sum != 0:
        return topic_distribution / weight_sum
    return np.zeros_like(topic_distribution)

def kl_divergence_topics(summary_dist: np.ndarray, doc_dist: np.ndarray) -> float:
    """
    Kullback-Leibler divergence of the summary's topic distribution from the document's.

    The sum runs over the positions of `summary_dist`; a position contributes
    summary * log(summary / doc) only when both values are strictly positive,
    every other position is skipped.

    Parameters:
        summary_dist (np.ndarray): Topic distribution of the summary.
        doc_dist (np.ndarray): Topic distribution of the document; indexed at
                               every position of `summary_dist`.

    Returns:
        float: The accumulated divergence (0.0 when no position contributes).
    """
    contributions = (
        summary_dist[topic] * np.log(summary_dist[topic] / doc_dist[topic])
        for topic in range(len(summary_dist))
        if summary_dist[topic] > 0 and doc_dist[topic] > 0
    )
    # Start from 0.0 so an empty sum is still a float.
    return sum(contributions, 0.0)

def select_sentence_based_on_topic(doc_sentences: list,
                                   doc_dist: np.ndarray,
                                   summary_sentences: list,
                                   doc_index: int,
                                   doc_topic_distributions: np.ndarray,
                                   sentence_topic_distributions: dict = None) -> str:
    """
    Pick the sentence whose addition to the current summary gives the lowest KL
    divergence between the summary's topic distribution and the document's.

    The topic distribution of a candidate summary is the mean of the topic
    vectors of its sentences, renormalized to sum to 1.

    Parameters:
        doc_sentences (list): Sentences of the document.
        doc_dist (np.ndarray): Normalized topic distribution of the document.
        summary_sentences (list): Sentences already selected; not modified.
        doc_index (int): Row of the document in `doc_topic_distributions`.
        doc_topic_distributions (np.ndarray): Topic distributions of all documents,
            one row per document.
        sentence_topic_distributions (dict, optional): Maps a sentence string to
            its own topic vector (same length as a row of `doc_topic_distributions`).
            Sentences missing from the mapping fall back to the document's vector.
            When omitted, every sentence is represented by the document's vector;
            all candidates then receive the same divergence and the earliest
            not-yet-selected sentence wins.

    Returns:
        str or None: The chosen sentence, or None when every sentence is already
        in the summary (or no candidate has a divergence below infinity).
    """
    document_vector = doc_topic_distributions[doc_index]

    def topic_vector(sentence):
        # Prefer the sentence's own topic vector; fall back to the document's.
        if sentence_topic_distributions is None:
            return document_vector
        return sentence_topic_distributions.get(sentence, document_vector)

    min_divergence = float('inf')
    selected_sentence = None

    for sentence in doc_sentences:
        if sentence in summary_sentences:
            continue  # Skip sentences already in the summary

        # Candidate summary = current summary plus this sentence (new list, no mutation).
        temp_summary = summary_sentences + [sentence]

        # Average the topic vectors of the candidate summary's sentences, then renormalize.
        temp_summary_topics = np.mean([topic_vector(s) for s in temp_summary], axis=0)
        temp_summary_topics = compute_topic_frequencies(temp_summary_topics)

        divergence = kl_divergence_topics(temp_summary_topics, doc_dist)

        # Strict '<' keeps the earliest sentence when several candidates tie.
        if divergence < min_divergence:
            min_divergence = divergence
            selected_sentence = sentence

    return selected_sentence

def kl_summary_20NG(doc_index: int,
                    num_sentences: int,
                    doc_topic_distributions: np.ndarray,
                    docs_sentences: dict,
                    sentence_topic_distributions: dict = None) -> str:
    """
    Build an extractive summary of one document by greedily adding the sentence
    that minimizes the KL divergence between the summary's and the document's
    topic distributions.

    Parameters:
        doc_index (int): Position of the target document, both as a row of
            `doc_topic_distributions` and among the values of `docs_sentences`.
        num_sentences (int): Maximum number of sentences in the summary.
        doc_topic_distributions (np.ndarray): Topic distributions of all documents.
        docs_sentences (dict): Maps document keys to lists of sentences; the
            document is looked up by position in the mapping's iteration order.
        sentence_topic_distributions (dict, optional): Maps a sentence string to
            its topic vector, e.g.
            dict(zip(sentences, lda_model.transform(vectorizer.transform(sentences)))).
            Without it every sentence is scored with the document's own topic
            vector, all candidates tie, and the summary is simply the first
            `num_sentences` distinct sentences of the document.

    Returns:
        str: The selected sentences joined by single spaces, in selection order.
    """
    # Retrieve and normalize the document's topic distribution.
    doc_dist = compute_topic_frequencies(doc_topic_distributions[doc_index])

    summary_sentences = []
    # Positional lookup among the mapping's values.
    doc_sentences = list(docs_sentences.values())[doc_index]

    # Grow the summary until it is long enough or no candidate sentence is left.
    while len(summary_sentences) < num_sentences:
        sentence = select_sentence_based_on_topic(doc_sentences, doc_dist, summary_sentences,
                                                  doc_index, doc_topic_distributions,
                                                  sentence_topic_distributions)
        # Compare against None explicitly: an empty-string sentence is a valid
        # selection and must not end the summary early.
        if sentence is None:
            break
        summary_sentences.append(sentence)

    return ' '.join(summary_sentences)


In [41]:
# List of document indices for which to generate summaries.
selected_doc_indices = [8, 3, 5]

# Iterate over each selected document index.
for doc_index in selected_doc_indices:
    # Generate a summary of at most 5 sentences for the current document.
    # 'kl_summary_20NG' receives the document-topic matrix and the sentence mapping;
    # the index selects both the matrix row and the document's sentences.
    summary = kl_summary_20NG(doc_index, 5, doc_topic_distributions_20ng, docs_sentences_20ng)
    
    # Print information about the current document.
    print(f"Summary for Document Index {doc_index}:\n")
    
    # Print the full list of preprocessed sentences for context, so the summary
    # can be compared with the original text.
    print("Original Document Sentences:")
    for sentence in docs_sentences_20ng[doc_index]:
        print(f" - {sentence}")
    
    # Print the generated summary, followed by an 80-character separator line.
    print("\nGenerated Summary:")
    print(summary)
    print("\n" + "="*80 + "\n")


Summary for Document Index 8:

Original Document Sentences:
 - 


yeah, it's the second one.
 - and i believe that price too.
 - i've been trying
to get a good look at it on the bruin-sabre telecasts, and wow!
 - does it ever
look good.
 - whoever did that paint job knew what they were doing.
 - and given
fuhr's play since he got it, i bet the bruins are wishing he didn't have it:)

Generated Summary:



yeah, it's the second one. and i believe that price too. i've been trying
to get a good look at it on the bruin-sabre telecasts, and wow! does it ever
look good. whoever did that paint job knew what they were doing.


Summary for Document Index 3:

Original Document Sentences:
 - 
think!
 - it's the scsi card doing the dma transfers not the disks...

the scsi card can do dma transfers containing data from any of the scsi devices
it is attached when it wants to.
 - an important feature of scsi is the ability to detach a device.
 - this frees the
scsi bus for other devices.
 - this is ty