In [2]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [3]:
################# ISW


import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer


class Sentence:
    def __init__(self, words, weight=0, topics=None, position=None,
                 topics_lda_swsw=None, topics_lda_isw=None, topics_lda_rsw=None):
        self.words = words
        self.weight = weight
        self.topics = topics
        self.position = position
        self.topics_lda_swsw = topics_lda_swsw
        self.topics_lda_isw = topics_lda_isw
        self.topics_lda_rsw = topics_lda_rsw

def preprocess_document(document):
    # Split the document into sentences
    sentences = document.split('\n')
    # Remove empty lines and leading/trailing whitespaces
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    # Tokenize each sentence into words
    processed_document = [Sentence(sentence.split()) for sentence in sentences]
    return processed_document

def compute_topic_words(document, num_topics, num_iterations):
    # Compute the topic words using LDA
    sentences = [' '.join(sentence.words) for sentence in document]
    vectorizer = CountVectorizer()
    document_vectors = vectorizer.fit_transform(sentences)
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42, n_jobs=-1)
    lda.fit(document_vectors)
    feature_names = vectorizer.get_feature_names_out()
    topic_words = []
    for topic_idx, topic in enumerate(lda.components_):
        top_words = [feature_names[i] for i in topic.argsort()[:-11:-1]]
        topic_words.append(top_words)
    return topic_words


def compute_sentence_weights(sentences, topic_words):
    # Compute the LDA-ISW weights for each sentence
    weights = []
    total_topic_word_frequency = sum(len(words) for words in topic_words)  # Total frequency of all topic words
    for sentence in sentences:
        sentence_weight = sum(word in topic_words for word in sentence.words) / total_topic_word_frequency
        weights.append(sentence_weight)
    return weights

def compute_redundancy_rate(selected_sentences):
    # Compute the redundancy rate of the selected sentences
    unique_sentences = set([sentence.words for sentence in selected_sentences])
    return 1 - (len(unique_sentences) / len(selected_sentences))


def compute_inclusive_topic_diversity(selected_sentences):
    # Compute the inclusive topic diversity of the selected sentences
    unique_topics = set([topic for sentence in selected_sentences for topic in sentence.topics])
    return len(unique_topics) / len(selected_sentences)


def compute_exclusive_topic_diversity(selected_sentences):
     # Compute the exclusive topic diversity of the selected sentences
    unique_topics = set([topic for sentence in selected_sentences for topic in sentence.topics])
    total_topics = set([topic for sentence in selected_sentences for topic in sentence.topics_lda_isw])
    return len(unique_topics) / len(total_topics)

def select_sentences(processed_document, sentence_weights, compression_ratio, redundancy_threshold,
                     inclusive_topic_diversity_threshold, exclusive_topic_diversity_threshold):

    num_selected_sentences = max(1, int(compression_ratio * len(processed_document)))
    selected_indices = np.argsort(sentence_weights)[-num_selected_sentences:]
    selected_indices.sort()
    selected_sentences = [processed_document[idx] for idx in selected_indices]

    num_sentences = len(processed_document)
    sorted_indices = np.argsort(sentence_weights)[::-1]
    # selected_sentences = []
    cumulative_length = 0
    for index in sorted_indices:
        sentence = processed_document[index]
        if (cumulative_length + len(sentence.words)) / num_sentences > compression_ratio:
            break
        if compute_redundancy_rate(selected_sentences + [sentence]) <= redundancy_threshold \
                and compute_inclusive_topic_diversity(selected_sentences + [sentence]) >= inclusive_topic_diversity_threshold \
                and compute_exclusive_topic_diversity(selected_sentences + [sentence]) <= exclusive_topic_diversity_threshold:
            selected_sentences.append(sentence)
            cumulative_length += len(sentence.words)
    return selected_sentences


def generate_summary(selected_sentences, smoothing_factor):
    # Sort the selected sentences based on their position
    sorted_sentences = sorted(selected_sentences, key=lambda x: x.position if x.position is not None else float('inf'))
    # Apply summary smoothing
    smoothed_sentences = []
    prev_weight = None
    for sentence in sorted_sentences:
        if prev_weight is None or sentence.weight >= smoothing_factor * prev_weight:
            smoothed_sentences.append(sentence)
            prev_weight = sentence.weight
    # Generate the summary by joining the words of the selected sentences
    summary = ' '.join([' '.join(sentence.words) for sentence in smoothed_sentences])
    return summary


def calculate_retention_ratio(selected_sentences, processed_document):
    # Calculate the retention ratio of the selected sentences
    num_selected_words = sum(len(sentence.words) for sentence in selected_sentences)
    num_total_words = sum(len(sentence.words) for sentence in processed_document)
    return num_selected_words / num_total_words


def calculate_gist_diversity(selected_sentences):
    # Calculate the gist diversity of the selected sentences
    num_selected_words = sum(len(sentence.words) for sentence in selected_sentences)
    num_unique_words = len(set([word for sentence in selected_sentences for word in sentence.words]))
    return num_unique_words / num_selected_words


def summarize_document(document, num_topics, num_iterations, compression_ratio, redundancy_threshold,
                       inclusive_topic_diversity_threshold, exclusive_topic_diversity_threshold,
                       summary_smoothing_factor):
    processed_document = preprocess_document(document)
    topic_words = compute_topic_words(processed_document, num_topics, num_iterations)
    sentence_weights = compute_sentence_weights(processed_document, topic_words)
    selected_sentences = select_sentences(processed_document, sentence_weights, compression_ratio,
                                           redundancy_threshold, inclusive_topic_diversity_threshold,
                                           exclusive_topic_diversity_threshold)
    retention_ratio = calculate_retention_ratio(selected_sentences, processed_document)
    gist_diversity = calculate_gist_diversity(selected_sentences)
    summary = generate_summary(selected_sentences, summary_smoothing_factor)
    return summary, selected_sentences, retention_ratio, gist_diversity


# Example Usage

document = """
Chapter 1
Introduction
This is the first chapter of the document. It provides an overview of the topic and introduces key concepts.

Chapter 2
Literature Review
In this chapter, we review existing literature on the topic. We discuss various studies and their findings.

Chapter 3
Methodology
This chapter describes the methodology used in the research. It explains the data collection process and the analytical techniques employed.

Chapter 4
Results and Analysis
Here, we present the results of our research and analyze them in detail. We discuss the implications and draw conclusions based on the findings.

Chapter 5
Conclusion
The final chapter summarizes the key points discussed in the document and offers recommendations for future research.
"""

num_topics = 5
num_iterations = 100
compression_ratio = 0.3
redundancy_threshold = 0.3
inclusive_topic_diversity_threshold = 0.5
exclusive_topic_diversity_threshold = 0.2
summary_smoothing_factor = 0.8

summary, selected_sentences, retention_ratio, gist_diversity = summarize_document(
    document, num_topics, num_iterations, compression_ratio, redundancy_threshold,
    inclusive_topic_diversity_threshold, exclusive_topic_diversity_threshold, summary_smoothing_factor
)

print("Summary:")
print(summary)

print("\nSelected Sentences:")
for sentence in selected_sentences:
    print(sentence.words)

print("\nRetention Ratio:", retention_ratio)
print("Gist Diversity:", gist_diversity)

from rouge import Rouge

def compute_rouge_scores(summary, reference):
    rouge = Rouge()
    scores = rouge.get_scores(summary, reference)[0]
    precision = scores['rouge-1']['p']
    recall = scores['rouge-1']['r']
    f_score = scores['rouge-1']['f']
    return precision, recall, f_score

reference = """
Chapter 1 provides an introduction and overview of the topic, while
Chapter 2 focuses on reviewing existing literature.
Chapter 3 describes the methodology used in the research, including data collection and analytical techniques.
Chapter 4 presents the research results and analysis, discussing implications and conclusions. Finally,
Chapter 5 summarizes the key points and offers recommendations for future research.
"""

precision, recall, f_score = compute_rouge_scores(summary, reference)

print("ROUGE Scores:")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F-score: {f_score}")


Summary:
Here, we present the results of our research and analyze them in detail. We discuss the implications and draw conclusions based on the findings. Chapter 5 Conclusion The final chapter summarizes the key points discussed in the document and offers recommendations for future research.

Selected Sentences:
['Here,', 'we', 'present', 'the', 'results', 'of', 'our', 'research', 'and', 'analyze', 'them', 'in', 'detail.', 'We', 'discuss', 'the', 'implications', 'and', 'draw', 'conclusions', 'based', 'on', 'the', 'findings.']
['Chapter', '5']
['Conclusion']
['The', 'final', 'chapter', 'summarizes', 'the', 'key', 'points', 'discussed', 'in', 'the', 'document', 'and', 'offers', 'recommendations', 'for', 'future', 'research.']

Retention Ratio: 0.3826086956521739
Gist Diversity: 0.8409090909090909
ROUGE Scores:
Precision: 0.5
Recall: 0.391304347826087
F-score: 0.439024385318263
