In [1]:
!pip install rouge

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [6]:
#################### SWSW

# Step 1: Text Preprocessing

def chapter_segmentation(document):
    # Split the document into chapters
    chapters = document.split('\n\nChapter ')[1:]
    chapters = ['Chapter ' + chapter for chapter in chapters]
    return chapters

def sentence_segmentation(chapter):
    # Split the chapter into sentences and store their positions
    sentences = chapter.split('. ')
    sentences = [(sentence + '.', i) for i, sentence in enumerate(sentences)]
    return sentences

def word_tokenization(sentence):
    # Tokenize the sentence into words
    words = sentence.split()
    return words

def remove_punctuation(words):
    # Remove punctuation from words
    punctuation = ['.', ',', ';', ':', '!', '?', '"', "'"]
    words = [word for word in words if word not in punctuation]
    return words

def remove_stopwords(words):
    # Remove stopwords from words
    stopwords = ['the', 'a', 'an', 'in', 'on', 'of', 'to', 'for', 'by', 'with']
    words = [word for word in words if word.lower() not in stopwords]
    return words

def perform_stemming(words):
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in words]
    return stemmed_words

def preprocess_document(document):
    chapters = chapter_segmentation(document)
    processed_sentences = []

    for chapter in chapters:
        sentences = sentence_segmentation(chapter)

        for sentence, position in sentences:
            words = word_tokenization(sentence)
            words = remove_punctuation(words)
            words = remove_stopwords(words)
            stemmed_words = perform_stemming(words)

            processed_sentences.append(Sentence(stemmed_words, position=position))

    return processed_sentences

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

class Sentence:
    def __init__(self, words, weight=0, topics=None, position=None,
                 topics_lda_swsw=None, topics_lda_isw=None, topics_lda_rsw=None):
        self.words = words
        self.weight = weight
        self.topics = topics
        self.position = position
        self.topics_lda_swsw = topics_lda_swsw
        self.topics_lda_isw = topics_lda_isw
        self.topics_lda_rsw = topics_lda_rsw

def compute_sentence_weights(sentences, window_size):
  # Compute the LDA-SWSW weights for each sentence
    corpus = [' '.join(sentence.words) for sentence in sentences]
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    words = vectorizer.get_feature_names_out()

    lda = LatentDirichletAllocation(n_components=1)
    lda.fit(X)
    topic_word_distribution = lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]

    weights = []
    for sentence in sentences:
        num_words = len(sentence.words)
        if num_words <= window_size:
            sentence.weight = 1.0
        else:
            num_windows = num_words - window_size + 1
            window_weights = []
            for i in range(num_windows):
                window = sentence.words[i: i + window_size]
                num_feature_words = len(set(window).intersection(words))
                window_weight = num_feature_words / window_size
                window_weights.append(window_weight)
            sentence.weight = max(window_weights)
        weights.append(sentence.weight)

    # Create a word-to-index dictionary
    word_to_index = {word: index for index, word in enumerate(words)}

    # Assign topics to sentences based on LDA
    for sentence in sentences:
        sentence.topics = [np.argmax(topic_word_distribution[:,word_to_index.get(word, -1)])
                           for word in sentence.words]

    return weights



def calculate_redundancy_rate(selected_sentences):
    # Calculate the redundancy rate of the selected sentences
    all_words = []
    unique_words = set()
    for sentence in selected_sentences:
        all_words.extend(sentence.words)
        unique_words.update(sentence.words)
    num_all_words = len(all_words)
    num_unique_words = len(unique_words)
    redundancy_rate = 1.0 - (num_unique_words / num_all_words)
    return redundancy_rate

def calculate_inclusive_topic_diversity(selected_sentences):
    # Calculate the inclusive topic diversity of the selected sentences
    topics = set()
    for sentence in selected_sentences:
        if sentence.topics:
            topics.update(sentence.topics)
    num_selected_sentences = len(selected_sentences)
    num_topics = len(topics)
    if num_selected_sentences == 0:
        return 0.0
    else:
        return num_topics / num_selected_sentences

def calculate_exclusive_topic_diversity(selected_sentences):
    # Calculate the exclusive topic diversity of the selected sentences
    topics = []
    for sentence in selected_sentences:
        if sentence.topics:
            topics.extend(sentence.topics)
    num_selected_sentences = len(selected_sentences)
    num_topics = len(set(topics))
    if num_selected_sentences == 0:
        return 0.0
    else:
        return num_topics / num_selected_sentences

def select_sentences(sentences, weights, compression_ratio, redundancy_threshold,
                     inclusive_topic_diversity_threshold, exclusive_topic_diversity_threshold):
    # Select the top sentences based on the weights and the compression ratio
    num_selected_sentences = max(1, int(compression_ratio * len(sentences)))
    selected_indices = np.argsort(weights)[-num_selected_sentences:]
    selected_indices.sort()
    selected_sentences = [sentences[idx] for idx in selected_indices]

    sorted_indices = np.argsort(weights)[::-1]
    current_summary = []
    current_redundancy_rate = 0.0
    current_inclusive_topic_diversity = 0.0
    current_exclusive_topic_diversity = 0.0

    for idx in sorted_indices:
        sentence = sentences[idx]
        current_summary.append(sentence)
        current_redundancy_rate = calculate_redundancy_rate(current_summary)
        current_inclusive_topic_diversity = calculate_inclusive_topic_diversity(current_summary)
        current_exclusive_topic_diversity = calculate_exclusive_topic_diversity(current_summary)

        if (current_redundancy_rate <= redundancy_threshold and
                current_inclusive_topic_diversity >= inclusive_topic_diversity_threshold and
                current_exclusive_topic_diversity <= exclusive_topic_diversity_threshold):
            selected_indices.append(idx)
            selected_sentences.append(sentence)

        if len(selected_sentences) >= int(compression_ratio * len(sentences)):
            break

    return selected_sentences

def calculate_gist_similarity(sentence1, sentence2):
    # Calculate the Gist similarity between two sentences
    words1 = set(sentence1.words)
    words2 = set(sentence2.words)
    intersection = words1.intersection(words2)
    union = words1.union(words2)
    similarity = len(intersection) / len(union)
    return similarity


def calculate_gist_diversity(selected_sentences):
    # Calculate the gist diversity of the selected sentences
    num_sentences = len(selected_sentences)
    if num_sentences <= 1:
        return 0.0
    else:
        pairwise_gist_scores = []
        for i in range(num_sentences - 1):
            for j in range(i + 1, num_sentences):
                sentence1 = selected_sentences[i]
                sentence2 = selected_sentences[j]
                gist_score = calculate_gist_similarity(sentence1, sentence2)
                pairwise_gist_scores.append(gist_score)
        mean_gist_score = np.mean(pairwise_gist_scores)
        return mean_gist_score

def calculate_retention_ratio(selected_sentences, original_document):
    # Calculate the retention ratio of the selected sentences
    selected_words = set()
    for sentence in selected_sentences:
        selected_words.update(sentence.words)
    original_words = set()
    for sentence in original_document:
        original_words.update(sentence.words)
    num_selected_words = len(selected_words)
    num_original_words = len(original_words)
    if num_original_words == 0:
        return 0.0
    else:
        return num_selected_words / num_original_words

def generate_summary(sentences):
    # Generate the summary by concatenating the selected sentences
    summary = ' '.join([' '.join(sentence.words) for sentence in sentences])
    return summary

def summarize_document(document, window_size, compression_ratio, redundancy_threshold,
                       inclusive_topic_diversity_threshold, exclusive_topic_diversity_threshold):
    processed_document = preprocess_document(document)
    sentence_weights = compute_sentence_weights(processed_document, window_size)
    selected_sentences = select_sentences(processed_document, sentence_weights, compression_ratio,
                                           redundancy_threshold, inclusive_topic_diversity_threshold,
                                           exclusive_topic_diversity_threshold)
    summary = generate_summary(selected_sentences)
    return summary, selected_sentences

# Example Usage

document = """
Chapter 1
Introduction
This is the first chapter of the document. It provides an overview of the topic and introduces key concepts.

Chapter 2
Literature Review
In this chapter, we review existing literature on the topic. We discuss various studies and their findings.

Chapter 3
Methodology
This chapter describes the methodology used in the research. It explains the data collection process and the analytical techniques employed.

Chapter 4
Results and Analysis
Here, we present the results of our research and analyze them in detail. We discuss the implications and draw conclusions based on the findings.

Chapter 5
Conclusion
The final chapter summarizes the key points discussed in the document and offers recommendations for future research.
"""

reference = """
Chapter 1 provides an introduction and overview of the topic, while
Chapter 2 focuses on reviewing existing literature.
Chapter 3 describes the methodology used in the research, including data collection and analytical techniques.
Chapter 4 presents the research results and analysis, discussing implications and conclusions. Finally,
Chapter 5 summarizes the key points and offers recommendations for future research.
"""

window_size = 8
compression_ratio = 0.3
redundancy_threshold = 0.3
inclusive_topic_diversity_threshold = 0.4
exclusive_topic_diversity_threshold = 0.2

summary, selected_sentences = summarize_document(document, window_size, compression_ratio,
                                                 redundancy_threshold, inclusive_topic_diversity_threshold,
                                                 exclusive_topic_diversity_threshold)

print("Summary:")
print(summary)

print("\nSelected Sentences:")
for sentence in selected_sentences:
    print(sentence.words)

from rouge import Rouge

def calculate_rouge_scores(summary, reference):
    rouge = Rouge()
    scores = rouge.get_scores(summary, reference)[0]
    precision = scores['rouge-1']['p']
    recall = scores['rouge-1']['r']
    f_score = scores['rouge-1']['f']
    return precision, recall, f_score


precision, recall, f_score = calculate_rouge_scores(summary, reference)
gist_diversity = calculate_gist_diversity(selected_sentences)
retention_ratio = calculate_retention_ratio(selected_sentences, preprocess_document(document))

print("\nROUGE Scores:")
print("Precision:", precision)
print("Recall:", recall)
print("F-score:", f_score)

print("Gist Diversity:", gist_diversity)
print("Retention Ratio:", retention_ratio)


Summary:
we discuss implic and draw conclus base findings.. chapter 5 conclus final chapter summar key point discuss document and offer recommend futur research.

Selected Sentences:
['we', 'discuss', 'implic', 'and', 'draw', 'conclus', 'base', 'findings..']
['chapter', '5', 'conclus', 'final', 'chapter', 'summar', 'key', 'point', 'discuss', 'document', 'and', 'offer', 'recommend', 'futur', 'research.']

ROUGE Scores:
Precision: 0.21052631578947367
Recall: 0.08695652173913043
F-score: 0.12307691893964512
Gist Diversity: 0.15789473684210525
Retention Ratio: 0.37254901960784315
