In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from heapq import nlargest

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# summarize text word frequency
def summarize_text_frequency(text, context_window_limit):
    
    # divide text into sentences
    sentences = sent_tokenize(text)  
    # divide text into words
    words = word_tokenize(text)  
    words = [word.lower() for word in words if word.lower() not in stopwords.words('english') and word.isalnum()]
# calculate word frequency
    word_freq = FreqDist(words)  

    sentence_scores = {}
    for sentence in sentences:
        score = sum(word_freq[word] for word in word_tokenize(sentence.lower()) if word in word_freq)
        sentence_scores[sentence] = score

    num_sentences = context_window_limit // 20
    top_sentences = nlargest(num_sentences, sentence_scores, key=sentence_scores.get)

    summary = ' '.join(top_sentences)
    summary_words = word_tokenize(summary)
    if len(summary_words) > context_window_limit:
        summary = ' '.join(summary_words[:context_window_limit]) 

    return summary

# calculate similarity matrix for sentences
def compute_similarity_matrix(sentences):
    vectorizer = CountVectorizer().fit_transform(sentences)
    return cosine_similarity(vectorizer)

# summarize text using cosine similarity
def summarize_text_cosine_similarity(text, context_window_limit):
     # Split text into sentences
    sentences = sent_tokenize(text) 
    length = 0
    summary = ""

    similarity_matrix = compute_similarity_matrix(sentences)  

    while length < context_window_limit and length < len(sentences):
        most_similar_index = similarity_matrix[length].argsort()[-1]
        sentence = sentences[most_similar_index]
        summary =summary + sentence + " "
        length = length + len(word_tokenize(sentence))
        # avoid repetition
        similarity_matrix[:, most_similar_index] = 0  

    return summary.strip()




In [24]:
def hierarchical_summarization(text, context_window_limit, method="frequency"):
    sentences = sent_tokenize(text)
    slice_size = max(1, len(sentences) // 5)
    summaries = []
    for start in range(0, len(sentences), slice_size):
        end = min(start + slice_size, len(sentences))
        slice_text = ' '.join(sentences[start:end])
        summary = perform_summarization(slice_text, context_window_limit, method)
        summaries.append(summary)

    final_summary = ' '.join(summaries)
    while len(word_tokenize(final_summary)) > context_window_limit:
        final_summary = perform_summarization(final_summary, context_window_limit, method)

    return final_summary

In [25]:
def perform_summarization(content, context_window_limit, method="frequency"):
    if method == "frequency":
        return summarize_text_frequency(content, context_window_limit)
    elif method == "cosine":
        return summarize_text_cosine_similarity(content, context_window_limit)
    else:
        raise ValueError("Unknown summarization method")

def process_documents(doc1, doc2, context_window_limit, method="frequency"):
    doc1_length = len(word_tokenize(doc1))
    doc2_length = len(word_tokenize(doc2))
    total_length = doc1_length + doc2_length
    doc1_target_length = int((doc1_length / total_length) * context_window_limit)
    doc2_target_length = context_window_limit - doc1_target_length

    summary1 = hierarchical_summarization(doc1, doc1_target_length, method)
    summary2 = hierarchical_summarization(doc2, doc2_target_length, method)

    save_summary("summary1.txt", summary1)
    save_summary("summary2.txt", summary2)

    generate_query(summary1, summary2)

def save_summary(filename, summary):
    if summary:
        with open(filename, "w", encoding="utf-8") as file:
            file.write(summary)

def generate_query(summary1, summary2):
    print("Generating query from summaries...")
    query = f"\nDocument 1 summary: {summary1}\n\nDocument 2 summary: {summary2}"
    print(query)

In [27]:

def main():
    input_text1 = """Data preprocessing involves preparing and cleaning text data so that machines can analyze it. Preprocessing puts data in a workable form and highlights features in the text that an algorithm can work with. There are several ways this can be done, including the following:
Tokenization. Tokenization substitutes sensitive information with nonsensitive information, or a token. Tokenization is often used in payment transactions to protect credit card data.
Stop word removal. Common words are removed from the text, so unique words that offer the most information about the text remain.
Lemmatization and stemming. Lemmatization groups together different inflected versions of the same word. For example, the word "walking" would be reduced to its root form, or stem, "walk" to process.
Part-of-speech tagging. Words are tagged based on which part of speech they correspond to -- such as nouns, verbs or adjectives.
"""
    input_text2 = """Most people are not very familiar with the concept of artificial intelligence (AI). As an illustration, when 1,500 senior business leaders in the United States in 2017 were asked about AI, only 17 percent said they were familiar with it. A number of them were not sure what it was or how it would affect their particular companies. They understood there was considerable potential for altering business processes, but were not clear how AI could be deployed within their own organizations. Despite its widespread lack of familiarity, AI is a technology that is transforming every walk of life. It is a wide-ranging tool that enables people to rethink how we integrate information, analyze data, and use the resulting insights to improve decisionmaking. Our hope through this comprehensive overview is to explain AI to an audience of policymakers, opinion leaders, and interested observers, and demonstrate how AI already is
    altering the world and raising important questions for society, the economy, and governance. In this paper, we discuss novel applications in finance, national security, health care, criminal justice, transportation, and smart cities, and address issues such as data access problems, algorithmic bias, AI ethics and transparency, and legal liability for AI decisions. We contrast the regulatory approaches of the U.S. and European Union, and close by making a number of recommendations for getting the most out of AI while still protecting important human values. In order to maximize AI benefits, we recommend nine steps for going forward: Encourage greater data access for researchers without compromising users’ personal privacy, invest more government funding in unclassified AI research, promote new models of digital education and AI workforce development so employees have the skills needed in the 21st-century economy, create a federal 
    AI advisory committee to make policy recommendations, engage with state and local officials so they enact effective policies, regulate broad AI principles rather than specific algorithms, take bias complaints seriously so AI does not replicate historic injustice, unfairness, or discrimination in data or algorithms, maintain mechanisms for human oversight and control, and penalize malicious AI behavior and promote cybersecurity."""

    context_window_limit = 4000

    process_documents(input_text1, input_text2, context_window_limit, method="frequency")

if __name__ == "__main__":
    main()

Generating query from summaries...

Document 1 summary: Data preprocessing involves preparing and cleaning text data so that machines can analyze it. Preprocessing puts data in a workable form and highlights features in the text that an algorithm can work with. Tokenization substitutes sensitive information with nonsensitive information, or a token. There are several ways this can be done, including the following:
Tokenization. Tokenization is often used in payment transactions to protect credit card data. Stop word removal. Common words are removed from the text, so unique words that offer the most information about the text remain. Lemmatization and stemming. For example, the word "walking" would be reduced to its root form, or stem, "walk" to process. Lemmatization groups together different inflected versions of the same word. Words are tagged based on which part of speech they correspond to -- such as nouns, verbs or adjectives. Part-of-speech tagging.

Document 2 summary: As an il