In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from heapq import nlargest


In [2]:
# Fetch the NLTK resources used by the cells below: the Punkt sentence/word
# tokenizer models and the English stop-word list.
nltk.download('punkt')
# NLTK 3.9 and later load the tokenizer models from 'punkt_tab' instead of the
# pickled 'punkt' package; without it sent_tokenize/word_tokenize raise
# LookupError on a fresh environment. Downloading both keeps old and new
# NLTK versions working.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def summarize_text(text, context_window_limit, avg_sentence_length=20):
    """Build an extractive summary of ``text`` from its highest-scoring sentences.

    Every sentence is scored by the mean frequency (over the whole text) of
    its alphanumeric, non-stop-word tokens. The best-scoring sentences are
    joined with spaces, best first, and the result is cut to at most
    ``context_window_limit`` tokens.

    Args:
        text: The text to summarise.
        context_window_limit: Maximum number of ``word_tokenize`` tokens the
            summary may contain. Non-positive values yield an empty summary.
        avg_sentence_length: Assumed number of tokens per sentence, used to
            convert the token budget into a sentence count. Defaults to 20.

    Returns:
        The summary string; empty when no sentence contains a scorable word.

    Raises:
        ValueError: If ``avg_sentence_length`` is not positive.
    """
    if avg_sentence_length <= 0:
        raise ValueError("avg_sentence_length must be positive")
    if context_window_limit <= 0:
        return ''

    # tokenize text into sentences
    sentences = sent_tokenize(text)

    # Build the stop-word set once up front: the list is not re-fetched for
    # every token and membership tests are O(1).
    stop_words = set(stopwords.words('english'))

    # keep lower-cased alphanumeric tokens that are not stop words
    words = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if token.isalnum() and lowered not in stop_words:
            words.append(lowered)

    # calculate the frequency of each word
    word_freq = FreqDist(words)

    # score each sentence by the average frequency of its scorable words;
    # sentences without any scorable word are left out entirely
    sentence_scores = {}
    for sentence in sentences:
        frequencies = [
            word_freq[word]
            for word in word_tokenize(sentence.lower())
            if word in word_freq
        ]
        if frequencies:
            sentence_scores[sentence] = sum(frequencies) / len(frequencies)

    # Always allow at least one sentence, so a budget smaller than one average
    # sentence still produces a (truncated) summary instead of an empty one.
    target_num_sentences = max(1, int(context_window_limit / avg_sentence_length))

    # get the top sentences based on score
    top_sentences = nlargest(target_num_sentences, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(top_sentences)

    # ensure the summary does not exceed the context window limit
    summary_words = word_tokenize(summary)
    if len(summary_words) > context_window_limit:
        summary = ' '.join(summary_words[:context_window_limit])

    return summary

In [4]:
def hieracial_summarization(text, context_window_limit):
    """Summarise ``text`` slice by slice, then condense the combined result.

    The text is cut into consecutive character slices of
    ``context_window_limit // 2`` characters (at least one). Each slice is
    passed to ``summarize_text`` and the slice summaries are joined with
    spaces. The joined text is then re-summarised until ``word_tokenize``
    reports no more than ``context_window_limit`` tokens.

    Args:
        text: The document to summarise.
        context_window_limit: Token budget for the final summary; it is also
            passed unchanged to every ``summarize_text`` call.

    Returns:
        The final summary string.
    """
    # Slices are measured in characters, so a boundary may fall inside a word
    # or sentence. max() keeps the step positive: a zero-sized slice would
    # never advance through the text and the loop would not terminate.
    slice_size = max(1, context_window_limit // 2)

    summaries = [
        summarize_text(text[start:start + slice_size], context_window_limit)
        for start in range(0, len(text), slice_size)
    ]
    final_summary = ' '.join(summaries)

    tokens = word_tokenize(final_summary)
    while len(tokens) > context_window_limit:
        condensed = summarize_text(final_summary, context_window_limit)
        condensed_tokens = word_tokenize(condensed)
        if len(condensed_tokens) >= len(tokens):
            # Another pass made no progress; repeating it would loop forever,
            # so fall back to cutting the current tokens to the budget.
            final_summary = ' '.join(tokens[:max(context_window_limit, 0)])
            break
        final_summary, tokens = condensed, condensed_tokens

    return final_summary

In [8]:
def process_documents(doc1, doc2, context_window_limit):
    """Summarise two documents within a shared token budget and save the results.

    The budget is divided between the documents in proportion to their
    ``word_tokenize`` token counts. Each document is summarised with
    ``hieracial_summarization``, the two summaries are handed to
    ``generate_query``, and they are then written to ``summary1.txt`` and
    ``summary2.txt`` in the current working directory (UTF-8, overwriting
    existing files).

    Args:
        doc1: Text of the first document.
        doc2: Text of the second document.
        context_window_limit: Total token budget shared by both summaries.

    Returns:
        None.
    """
    doc1_length = len(word_tokenize(doc1))
    doc2_length = len(word_tokenize(doc2))
    total_length = doc1_length + doc2_length

    if total_length:
        doc1_target_length = int((doc1_length / total_length) * context_window_limit)
    else:
        # Neither document has any tokens: the proportional split would divide
        # by zero, so share the budget evenly instead.
        doc1_target_length = context_window_limit // 2
    # The second document gets whatever is left, so the two budgets always add
    # up to exactly context_window_limit.
    doc2_target_length = context_window_limit - doc1_target_length

    summary1 = hieracial_summarization(doc1, doc1_target_length)
    summary2 = hieracial_summarization(doc2, doc2_target_length)

    # assuming 'generate_query' is a placeholder for actual query generation logic
    generate_query(summary1, summary2)

    for path, summary in (("summary1.txt", summary1), ("summary2.txt", summary2)):
        with open(path, "w", encoding="utf-8") as file:
            file.write(summary)

In [9]:
def generate_query(summary1, summary2):
    """Stand-in for query generation: announces that it ran and nothing more.

    Both summaries are accepted so the call site already has its final shape,
    but neither is used yet.
    """
    status_message = "Generating query from summaries..."
    print(status_message)



In [10]:
# Example usage:
# First sample input document: a multi-paragraph article excerpt held as a
# triple-quoted string, so the value starts and ends with a newline.
doc1 = """
Once the who has ceased to be a mystery, the how is all we have. On the one hand this has been a classic City kick for the line. After the back-to-back draws against Liverpool and Arsenal as March turned into April, results that even at the time felt a missed opportunity for the challengers to open up clear water, City have won eight in a row, scoring 30 and conceding five.
If there was one moment that summed up that run, it came in the win over Wolves when within a minute of Hwang Hee-chan pulling one back to make it 3-1 and raise the vaguest possibility of a comeback Erling Haaland took a long pass from Phil Foden, stepped inside Max Kilman and bullied a shot into the top corner. It was simple, direct and brutally efficient and it admitted not a glimmer of hope for anybody else: don’t even think about taking this title from us, it said.
The oddity over the past few weeks is that there have been quite a few moments like that. What if Emi Martínez rather than Robin Olsen had played? What if Chris Wood or Murillo had taken their chances? What if City hadn’t been given that questionable early penalty against Wolves? What if Son Heung-min had scored that one-on-one on Tuesday?
Nottingham Forest created a higher xG in the first half against City than any other side had in the league this season; against Spurs, City created a lower xG in the first half than in any other league game this season. It sounds absurd to say it but even after they thrashed Wolves there was a sense City had not been entirely convincing. Haaland’s fourth killed the game and City had an extremely comfortable final half-hour but there were occasions early on when they looked vulnerable.
Most of this is ludicrous. City are unbeaten in 34 games in all competitions. They have won their past eight in the league by at least two goals. The past six weeks have been an emphatic assertion of pre-eminence. Even if they have won only two games against top-six teams, the talk with any other side would be of a crushing surge of form at just the right time.
But this is not any other side: it is City, the perennial champions, and so the discussion around them necessarily descends into nit-picking, just as somebody who dines in only the finest restaurants ends up quibbling about a filigree crack in the pastry or a marginal imbalance in a sauce when all the food is sensational.
But there is something slightly different about this City. The control that once characterised Guardiola sides is not there. They have been more vulnerable to breakaways this season; their counterpress is not functioning as well. Although their possession has remained almost unchanged at 65% (an enormous number in historical terms), they have not seemed as coherent as in past seasons.
Perhaps that’s another issue of familiarity: the tendency is to compare a team with its best self, but City have felt this season more reliant on brilliant individuals than in the past.
"""
# Second sample input document: a multi-paragraph article excerpt held as a
# triple-quoted string, so the value starts and ends with a newline.
doc2 = """
Liverpool's resurgence has been one of the stories of the season. After a slow start, the Reds have found their stride, led by the indomitable Mohamed Salah. His partnership with new signing Darwin Nunez has blossomed, yielding goals aplenty and putting Liverpool back in contention for the top four. Jurgen Klopp’s men have shown resilience, turning deficits into victories and showcasing a never-say-die attitude reminiscent of their title-winning campaign a few seasons ago.
A key moment came in the match against Tottenham Hotspur, where a stunning late goal from Salah secured all three points in a game that seemed destined for a draw. This match was a microcosm of Liverpool’s season – moments of defensive frailty overshadowed by attacking brilliance. The team’s ability to score goals from seemingly impossible situations has been a hallmark of their play this season.
Questions remain about Liverpool’s defense, with injuries to key players like Virgil van Dijk causing instability at the back. However, the emergence of young talents like Curtis Jones and Harvey Elliott has provided much-needed depth and dynamism in midfield. Their performances have been a bright spot, giving fans hope for the future.
The tactical tweaks made by Klopp have also been pivotal. Switching to a more fluid 4-2-3-1 formation has allowed Liverpool to maximize their attacking potential while providing greater defensive solidity. This flexibility in approach has been crucial in navigating a congested fixture schedule, ensuring the team remains competitive on all fronts.
As the season progresses, Liverpool’s ability to maintain this level of performance will be tested. The squad depth, combined with Klopp’s tactical acumen, will be key factors in determining whether they can sustain their challenge for silverware. With key players returning from injury and the January transfer window offering opportunities to strengthen further, Liverpool fans have every reason to be optimistic about the remainder of the season.
"""
# Total token budget shared by both summaries; process_documents divides it
# between the two documents in proportion to their token counts.
context_window_limit = 4000
# Summarises both documents, calls generate_query with the two summaries and
# writes them to summary1.txt and summary2.txt in the working directory.
process_documents(doc1, doc2, context_window_limit)

Generating query from summaries...
