In [19]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from heapq import nlargest


In [20]:
# Fetch the tokenizer models and stop-word list that summarize_text relies on.
# NLTK >= 3.9 loads the 'punkt_tab' resource for sent_tokenize/word_tokenize,
# while older releases load 'punkt', so request both to keep a fresh kernel
# working on either version. Already-installed packages are skipped.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\navid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
def summarize_text(text, context_window_limit, avg_sentence_length=20):
    """Build an extractive summary of ``text`` from its highest-scoring sentences.

    Every sentence is scored by the mean frequency (over the whole text) of
    its content words, i.e. lower-cased alphanumeric tokens that are not
    English stop words. The best-scoring sentences are joined with single
    spaces, highest score first, and the result is cut down to
    ``context_window_limit`` tokens when it is longer than that.

    Args:
        text: The document to summarize.
        context_window_limit: Maximum number of ``word_tokenize`` tokens
            (punctuation marks count as tokens) allowed in the summary.
        avg_sentence_length: Assumed number of tokens per sentence, used to
            convert the token budget into a sentence count. Defaults to 20.

    Returns:
        The summary string. It is ``''`` when ``context_window_limit`` is not
        positive or when no sentence contains a scorable word. If the summary
        had to be truncated, its tokens are re-joined with single spaces, so
        punctuation spacing may differ from the source text.

    Raises:
        ValueError: If ``avg_sentence_length`` is not positive.
    """
    if avg_sentence_length <= 0:
        raise ValueError("avg_sentence_length must be positive")
    if context_window_limit <= 0:
        # No budget at all: nothing can be selected.
        return ''

    # Tokenize the text into sentences.
    sentences = sent_tokenize(text)

    # Fetch the stop-word list once and keep it as a set; asking the corpus
    # reader for it per token repeats the lookup and makes every membership
    # test a linear scan.
    stop_words = set(stopwords.words('english'))

    # Keep lower-cased alphanumeric tokens that are not stop words.
    content_words = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered not in stop_words and token.isalnum():
            content_words.append(lowered)

    # Frequency of each content word across the whole text.
    word_freq = FreqDist(content_words)

    # Score each sentence by the average frequency of the content words it
    # contains; sentences without any content word are left unscored.
    sentence_scores = {}
    for sentence in sentences:
        hits = [
            word_freq[token]
            for token in word_tokenize(sentence.lower())
            if token in word_freq
        ]
        if hits:
            sentence_scores[sentence] = sum(hits) / len(hits)

    # Convert the token budget into a sentence count. Always keep at least
    # one sentence so budgets below avg_sentence_length do not yield an empty
    # summary; the truncation step below still enforces the token limit.
    target_num_sentences = max(1, int(context_window_limit / avg_sentence_length))

    # Pick the top sentences by score (highest first).
    top_sentences = nlargest(target_num_sentences, sentence_scores, key=sentence_scores.get)
    summary = ' '.join(top_sentences)

    # Ensure the summary does not exceed the context window limit.
    summary_words = word_tokenize(summary)
    if len(summary_words) > context_window_limit:
        summary = ' '.join(summary_words[:context_window_limit])

    return summary

In [23]:
# Demo: condense a short encyclopedia-style passage to a 50-token budget.
input_text = """
Geography (from Ancient Greek γεωγραφία geōgraphía; combining gê 'Earth' and gráphō 'write') is the study of the lands, features, inhabitants, and phenomena of Earth.[1] Geography is an all-encompassing discipline that seeks an understanding of Earth and its human and natural complexities—not merely where objects are, but also how they have changed and come to be. While geography is specific to Earth, many concepts can be applied more broadly to other celestial bodies in the field of planetary science.[2] Geography has been called "a bridge between natural science and social science disciplines."""
context_window_limit = 50
summary = summarize_text(
    text=input_text,
    context_window_limit=context_window_limit,
)
print(summary)


[2] Geography has been called "a bridge between natural science and social science disciplines. While geography is specific to Earth, many concepts can be applied more broadly to other celestial bodies in the field of planetary science.
