In [1]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
################## RSW

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

class Sentence:
    """Container for one tokenized sentence and the values computed for it.

    Every constructor argument is stored unchanged on the instance under the
    attribute of the same name.
    """

    def __init__(
        self,
        words,
        weight=0,
        topics=None,
        position=None,
        topics_lda_swsw=None,
        topics_lda_isw=None,
        topics_lda_rsw=None,
    ):
        # Tokens of the sentence and its importance score.
        self.words = words
        self.weight = weight

        # Optional topic identifiers and position; both default to None.
        self.topics = topics
        self.position = position

        # Topic data slots, one per LDA variant named in the attribute.
        self.topics_lda_swsw = topics_lda_swsw
        self.topics_lda_isw = topics_lda_isw
        self.topics_lda_rsw = topics_lda_rsw

def preprocess_document(document):
    """Convert a newline-delimited document into a list of ``Sentence`` objects.

    Each line of ``document`` is treated as one sentence. Lines that are empty
    after stripping surrounding whitespace are skipped; the remaining lines are
    split on whitespace into tokens.

    Args:
        document: The raw text, one sentence per line.

    Returns:
        A list of ``Sentence`` instances in the order the lines appear.
    """
    processed_document = []
    for raw_line in document.split('\n'):
        stripped_line = raw_line.strip()
        if not stripped_line:
            # Blank (or whitespace-only) line: nothing to tokenize.
            continue
        processed_document.append(Sentence(stripped_line.split()))
    return processed_document

def compute_topic_words(document, num_topics, num_iterations, num_top_words=10):
    """Fit an LDA model on the sentences and return the top words of each topic.

    Each sentence is treated as one LDA "document": its tokens are re-joined
    with spaces and vectorized with a default ``CountVectorizer``.

    Args:
        document: Iterable of objects exposing a ``words`` token list.
        num_topics: Number of LDA topics (``n_components``).
        num_iterations: Maximum number of LDA passes (``max_iter``). Previously
            this argument was accepted but never forwarded to the estimator.
            ``None`` keeps the estimator's own default.
        num_top_words: How many of the highest-weighted words to return per
            topic. Defaults to 10, the value that used to be hard-coded.

    Returns:
        A list with one entry per topic; each entry is a list of vocabulary
        words ordered from highest to lowest topic weight.
    """
    sentences = [' '.join(sentence.words) for sentence in document]
    vectorizer = CountVectorizer()
    document_vectors = vectorizer.fit_transform(sentences)

    lda_options = {'n_components': num_topics, 'random_state': 42, 'n_jobs': -1}
    if num_iterations is not None:
        lda_options['max_iter'] = num_iterations
    lda = LatentDirichletAllocation(**lda_options)
    lda.fit(document_vectors)

    feature_names = vectorizer.get_feature_names_out()
    # argsort is ascending, so reverse it before taking the leading entries.
    return [
        [feature_names[i] for i in topic.argsort()[::-1][:num_top_words]]
        for topic in lda.components_
    ]

def compute_sentence_topic_word_frequency(sentences, topic_words):
    """Count, per sentence, how many of its tokens belong to each topic.

    For every sentence the ``topics_lda_rsw`` attribute is overwritten with a
    list holding one count per entry of ``topic_words``. A token that occurs
    several times in the sentence is counted each time, and a token listed
    under several topics contributes to each of them.

    Args:
        sentences: Iterable of objects exposing ``words``.
        topic_words: One collection of words per topic.

    Returns:
        None. The sentences are updated in place.
    """
    for sent in sentences:
        sent.topics_lda_rsw = [
            sum(1 for token in sent.words if token in vocabulary)
            for vocabulary in topic_words
        ]

def compute_sentence_weights(sentences):
    """Score each sentence by its total topic-word count (LDA-RSW weight).

    The raw score of a sentence is the sum of its ``topics_lda_rsw`` counts.
    Each sentence's ``weight`` attribute is set to its raw score divided by
    the sum of all raw scores, or to 0 when that sum is not positive.

    The earlier version divided the sentence's existing ``weight`` (0 by
    default) instead of its raw score, so every normalized weight came out 0.

    Args:
        sentences: Sequence of objects exposing ``topics_lda_rsw`` (a list of
            per-topic counts) and a writable ``weight``.

    Returns:
        The list of raw (un-normalized) scores, in sentence order.
    """
    weights = [sum(sentence.topics_lda_rsw) for sentence in sentences]
    total_weight = sum(weights)
    for sentence, raw_weight in zip(sentences, weights):
        sentence.weight = raw_weight / total_weight if total_weight > 0 else 0
    return weights

def compute_redundancy_rate(selected_sentences):
    """Return the share of word occurrences that repeat an earlier word.

    Computed as ``(total - distinct) / total`` over all tokens of the given
    sentences. Returns 0 when the sentences contain no tokens at all.

    Args:
        selected_sentences: List of objects exposing ``words``.

    Returns:
        The redundancy rate as a float, or 0 for token-free input.
    """
    all_tokens = [token for sent in selected_sentences for token in sent.words]
    token_count = len(all_tokens)
    if token_count <= 0:
        return 0
    distinct_count = len(set(all_tokens))
    return (token_count - distinct_count) / token_count

def compute_inclusive_topic_diversity(selected_sentences):
    """Return the number of distinct topics per selected sentence.

    The distinct topic identifiers found in the sentences' ``topics``
    attributes are counted and divided by the number of sentences. Sentences
    whose ``topics`` is ``None`` or empty contribute no topics but still count
    in the denominator.

    Args:
        selected_sentences: List of objects exposing ``topics``.

    Returns:
        The ratio as a float; 0.0 for an empty selection (previously a
        ZeroDivisionError).
    """
    if not selected_sentences:
        return 0.0
    topic_indices = set()
    for sentence in selected_sentences:
        if sentence.topics:
            topic_indices.update(sentence.topics)
    return len(topic_indices) / len(selected_sentences)


def compute_exclusive_topic_diversity(selected_sentences):
    """Return the average fraction of sentences that touch each topic.

    For every topic index, the sentences whose ``topics_lda_rsw`` count at that
    index is positive are counted. The mean of those per-topic counts is then
    divided by the number of sentences. The number of topics is taken from the
    first sentence's ``topics_lda_rsw``.

    Args:
        selected_sentences: List of objects exposing ``topics_lda_rsw`` (a list
            of per-topic counts).

    Returns:
        The ratio as a float. Returns 0.0 for an empty selection (previously an
        IndexError) and when there are zero topics (previously NaN).
    """
    if not selected_sentences:
        return 0.0
    topic_total = len(selected_sentences[0].topics_lda_rsw)
    if topic_total == 0:
        # np.mean of an empty list would yield NaN and a RuntimeWarning.
        return 0.0

    topic_counts = [0] * topic_total
    for sentence in selected_sentences:
        for topic_idx, topic_count in enumerate(sentence.topics_lda_rsw):
            if topic_count > 0:
                topic_counts[topic_idx] += 1
    return np.mean(topic_counts) / len(selected_sentences)

def select_sentences(sentences, weights, compression_ratio, redundancy_threshold, inclusive_topic_diversity_threshold, exclusive_topic_diversity_threshold):
    """Pick the highest-weighted sentences and return them in original order.

    Args:
        sentences: Indexable collection of sentence objects.
        weights: One score per sentence; presumably aligned index-for-index
            with ``sentences`` (verify against caller).
        compression_ratio: Fraction of sentences to keep; at least one
            sentence is always requested.
        redundancy_threshold: Accepted but not referenced in this function.
        inclusive_topic_diversity_threshold: Lower bound used in the refill
            loop's rejection test.
        exclusive_topic_diversity_threshold: Upper bound used in the refill
            loop's rejection test.

    Returns:
        The list of selected sentence objects.
    """
    # Number of sentences to keep: truncated ratio * count, never less than 1.
    num_selected_sentences = max(1, int(compression_ratio * len(sentences)))
    # argsort is ascending, so the tail holds the indices of the largest weights.
    selected_indices = np.argsort(weights)[-num_selected_sentences:]
    # Sorting the indices restores the sentences' original order.
    selected_indices.sort()
    selected_sentences = [sentences[idx] for idx in selected_indices]

    # Candidate indices from highest to lowest weight.
    sorted_indices = np.argsort(weights)
    sorted_indices = sorted_indices.tolist()  # Convert the NumPy array to a Python list
    sorted_indices.reverse()

    # The slice above already yields min(num_selected_sentences, len(weights))
    # items, so this loop body only runs when more sentences were requested
    # than are available (and at least one exists).
    # NOTE(review): on that path `selected_indices` is still the ndarray
    # returned by np.argsort, which has no `.append` method -- confirm whether
    # this branch is meant to be reachable.
    while len(selected_sentences) < num_selected_sentences and sorted_indices:
        idx = sorted_indices.pop(0)
        selected_indices.append(idx)
        selected_sentences.append(sentences[idx])

        # Apply redundancy, inclusive topic diversity, and exclusive topic diversity checks
        if len(selected_sentences) > 1:
            # NOTE(review): `redundant` is the raw redundancy rate, so any
            # non-zero rate is truthy below; `redundancy_threshold` is never
            # compared against it. The earlier variant is kept for reference:
            # redundant = check_redundancy(selected_sentences, redundancy_threshold)
            redundant = compute_redundancy_rate(selected_sentences)
            inclusive_topic_diversity = compute_inclusive_topic_diversity(selected_sentences)
            exclusive_topic_diversity = compute_exclusive_topic_diversity(selected_sentences)

            if redundant or inclusive_topic_diversity < inclusive_topic_diversity_threshold or exclusive_topic_diversity > exclusive_topic_diversity_threshold:
                # NOTE(review): pop(0) drops the first entry of each list, not
                # the candidate appended just above -- confirm this is intended.
                selected_indices.pop(0)
                selected_sentences.pop(0)

    return selected_sentences


def calculate_retention_ratio(selected_sentences, document):
    """Return the share of the document's vocabulary kept by the selection.

    Both inputs are reduced to their sets of distinct tokens; the result is
    the size of the selection's set divided by the size of the document's set.

    Args:
        selected_sentences: Iterable of objects exposing ``words``.
        document: Iterable of objects exposing ``words``.

    Returns:
        The ratio as a float; 0.0 when the document contains no words
        (previously a ZeroDivisionError).
    """
    selected_words = {word for sentence in selected_sentences for word in sentence.words}
    total_words = {word for sentence in document for word in sentence.words}
    if not total_words:
        return 0.0
    return len(selected_words) / len(total_words)

def calculate_gist_diversity(selected_sentences):
    """Return the coefficient of variation of the sentences' lengths.

    Lengths are measured in tokens. The result is the root-mean-square
    deviation of the lengths from their mean, divided by that mean.

    Args:
        selected_sentences: Iterable of objects exposing ``words``.

    Returns:
        The ratio as a float. Returns 0.0 when there are no sentences or when
        the mean length is zero (both previously produced NaN together with
        NumPy runtime warnings).
    """
    # Explicit float array: avoids relying on list-minus-scalar coercion.
    sentence_lengths = np.array(
        [len(sentence.words) for sentence in selected_sentences], dtype=float
    )
    if sentence_lengths.size == 0:
        return 0.0
    avg_length = np.mean(sentence_lengths)
    if avg_length == 0:
        return 0.0
    deviation = np.sqrt(np.mean((sentence_lengths - avg_length) ** 2))
    return deviation / avg_length

def generate_summary(sentences):
    """Render the given sentences as one space-separated string.

    Each sentence's tokens are joined with single spaces, and the resulting
    sentence strings are in turn joined with single spaces.

    Args:
        sentences: Iterable of objects exposing ``words``.

    Returns:
        The concatenated summary text ('' for no sentences).
    """
    rendered_sentences = []
    for sent in sentences:
        rendered_sentences.append(' '.join(sent.words))
    return ' '.join(rendered_sentences)

def smooth_summary(selected_sentences, num_iterations, num_topics=None):
    """Iteratively re-score and re-select the sentences, accumulating each round.

    Each round appends the current selection to the running ``summary`` list,
    refits the topic words on everything accumulated so far, recomputes the
    topic-word counts and weights of the current selection, and re-selects
    with a compression ratio of 1.0 and thresholds of 0.0, 0.0 and 1.0.

    Args:
        selected_sentences: The starting selection of sentence objects.
        num_iterations: Number of smoothing rounds. It is also forwarded to
            ``compute_topic_words`` as its ``num_iterations`` argument.
        num_topics: Number of topics to request from ``compute_topic_words``.
            When omitted, the module-level ``num_topics`` variable is used,
            which is what this function previously always relied on.

    Returns:
        The accumulated list of sentence objects. A sentence appears once for
        every round in which it was part of the selection.

    Raises:
        NameError: If ``num_topics`` is omitted and no module-level
            ``num_topics`` exists when the first round starts.
    """
    summary = []
    for _ in range(num_iterations):
        if num_topics is None:
            # Resolved lazily so a zero-round call never needs the global.
            try:
                num_topics = globals()['num_topics']
            except KeyError:
                raise NameError("name 'num_topics' is not defined") from None
        summary.extend(selected_sentences)
        topic_words = compute_topic_words(summary, num_topics, num_iterations)
        compute_sentence_topic_word_frequency(selected_sentences, topic_words)
        sentence_weights = compute_sentence_weights(selected_sentences)
        selected_sentences = select_sentences(selected_sentences, sentence_weights, 1.0, 0.0, 0.0, 1.0)
    return summary

def summarize_document(document, num_topics, num_iterations, compression_ratio,
                       redundancy_threshold, inclusive_topic_diversity_threshold,
                       exclusive_topic_diversity_threshold, smoothing_iterations):
    """Run the full summarization pipeline on a raw document string.

    Steps, in order: split the text into sentences, derive topic words,
    count topic words per sentence, weight the sentences, select a subset,
    render the summary text, run the smoothing pass, and compute the two
    quality metrics on the selection.

    Returns:
        A 4-tuple ``(summary, smoothed_summary, gist_diversity,
        retention_ratio)`` where ``summary`` is the rendered text of the
        selection and ``smoothed_summary`` is whatever ``smooth_summary``
        returns for that selection.
    """
    doc_sentences = preprocess_document(document)

    # Topic modelling and per-sentence scoring (sentences are updated in place).
    compute_sentence_topic_word_frequency(
        doc_sentences,
        compute_topic_words(doc_sentences, num_topics, num_iterations),
    )
    weights = compute_sentence_weights(doc_sentences)

    chosen = select_sentences(
        doc_sentences,
        weights,
        compression_ratio,
        redundancy_threshold,
        inclusive_topic_diversity_threshold,
        exclusive_topic_diversity_threshold,
    )

    # Render the text before smoothing, matching the established call order.
    summary = generate_summary(chosen)
    smoothed_summary = smooth_summary(chosen, smoothing_iterations)

    return (
        summary,
        smoothed_summary,
        calculate_gist_diversity(chosen),
        calculate_retention_ratio(chosen, doc_sentences),
    )

# Example Usage

# Sample input. preprocess_document splits on newlines, so every non-blank
# line below -- chapter headings included -- becomes one sentence.
document = """
Chapter 1
Introduction
This is the first chapter of the document. It provides an overview of the topic and introduces key concepts.

Chapter 2
Literature Review
In this chapter, we review existing literature on the topic. We discuss various studies and their findings.

Chapter 3
Methodology
This chapter describes the methodology used in the research. It explains the data collection process and the analytical techniques employed.

Chapter 4
Results and Analysis
Here, we present the results of our research and analyze them in detail. We discuss the implications and draw conclusions based on the findings.

Chapter 5
Conclusion
The final chapter summarizes the key points discussed in the document and offers recommendations for future research.
"""

# Pipeline settings, passed positionally to summarize_document below.
num_topics = 5  # LDA topic count; smooth_summary also looks this name up at module level
num_iterations = 100  # forwarded to compute_topic_words
compression_ratio = 0.3  # select_sentences keeps max(1, int(ratio * sentence count)) sentences
redundancy_threshold = 0.2  # accepted by select_sentences, which does not reference it
inclusive_topic_diversity_threshold = 0.6
exclusive_topic_diversity_threshold = 0.4
smoothing_iterations = 5  # number of rounds run by smooth_summary

summary, smoothed_summary, gist_diversity, retention_ratio = summarize_document(document, num_topics, num_iterations,
                                                                                 compression_ratio, redundancy_threshold,
                                                                                 inclusive_topic_diversity_threshold,
                                                                                 exclusive_topic_diversity_threshold,
                                                                                 smoothing_iterations)

# Report the rendered summary and its metrics (smoothed_summary is not printed).
print("Summary:")
print(summary)

print("\nGist Diversity:", gist_diversity)
print("Retention Ratio:", retention_ratio)

from rouge import Rouge

# ROUGE-1 precision, recall, and F-score of a summary against a reference
def calculate_rouge_metrics(summary, reference):
    """Score ``summary`` against ``reference`` and return the ROUGE-1 figures.

    Args:
        summary: The candidate text, passed as the first argument to
            ``Rouge.get_scores``.
        reference: The reference text, passed as the second argument.

    Returns:
        A tuple ``(precision, recall, f_score)`` read from the ``'p'``,
        ``'r'`` and ``'f'`` entries under ``'rouge-1'`` in the first element
        of the scorer's result.
    """
    unigram_scores = Rouge().get_scores(summary, reference)[0]['rouge-1']
    return unigram_scores['p'], unigram_scores['r'], unigram_scores['f']

# Hand-written reference summary that the generated summary is scored against.
reference = """
Chapter 1 provides an introduction and overview of the topic, while
Chapter 2 focuses on reviewing existing literature.
Chapter 3 describes the methodology used in the research, including data collection and analytical techniques.
Chapter 4 presents the research results and analysis, discussing implications and conclusions. Finally,
Chapter 5 summarizes the key points and offers recommendations for future research.
"""
# Score the generated `summary` (from the pipeline run above) with ROUGE-1.
precision, recall, f_score = calculate_rouge_metrics(summary, reference)

# Print the three ROUGE-1 figures, preceded by a blank separator line.
print()
print("Precision:", precision)
print("Recall:", recall)
print("F-score:", f_score)

Summary:
This is the first chapter of the document. It provides an overview of the topic and introduces key concepts. This chapter describes the methodology used in the research. It explains the data collection process and the analytical techniques employed. Here, we present the results of our research and analyze them in detail. We discuss the implications and draw conclusions based on the findings. The final chapter summarizes the key points discussed in the document and offers recommendations for future research.

Gist Diversity: 0.1274754878398196
Retention Ratio: 0.7051282051282052

Precision: 0.49056603773584906
Recall: 0.5652173913043478
F-score: 0.5252525202775227
