In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

# Fetch the NLTK data packages this script relies on (tokenizer models and
# the stopword lists); nltk.download reports when a package is already present.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Sample document: the input text for the extractive summarizer below.
# It consists of a few paragraphs about NLP followed by a numbered list of
# steps. The text is tokenized as-is, so editing it changes the word
# frequencies and therefore which sentences end up in the summary.
document = """
Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language. NLP technologies are used to process, analyze, and understand large amounts of natural language data.

One of the primary applications of NLP is sentiment analysis, which determines the sentiment or emotional tone of a piece of text. Sentiment analysis is widely used in social media monitoring, customer feedback analysis, and brand reputation management.

Text summarization is another important NLP task. Extractive summarization involves selecting a subset of sentences from a text to create a shorter version that retains the most critical information. Abstractive summarization, on the other hand, generates a summary by paraphrasing and rephrasing the original text. The extractive summarization method typically involves the following steps:

1. Sentence Tokenization: Divide the text into individual sentences.

2. Text Preprocessing: Remove stopwords and punctuation, and convert words to lowercase.

3. Calculate Sentence Scores: Assign scores to sentences based on their importance.

4. Select Top Sentences: Choose sentences with the highest scores to form the summary.
"""

# Step 1: split the document into sentences.
sentences = sent_tokenize(document)

# Step 2: build the vocabulary -- lowercase tokens that are purely
# alphanumeric (this drops punctuation) and are not English stopwords.
stop_words = set(stopwords.words("english"))
words = word_tokenize(document.lower())
filtered_words = []
for token in words:
    if not token.isalnum() or token in stop_words:
        continue
    filtered_words.append(token)

# Step 3: count how often each retained word occurs in the whole document.
word_freq = FreqDist(filtered_words)

# Step 4: a sentence's score is the sum of the document-wide frequencies of
# its counted words. Sentences without any counted word get no entry at all.
sentence_scores = {}
for sentence in sentences:
    hits = [
        word_freq[token]
        for token in word_tokenize(sentence.lower())
        if token in word_freq
    ]
    if hits:
        # .get() keeps accumulating if the exact same sentence text repeats.
        sentence_scores[sentence] = sentence_scores.get(sentence, 0) + sum(hits)

# Step 5: keep the highest-scoring sentences. The sort is stable, so ties
# stay in the order the sentences were first scored.
num_sentences_in_summary = 3
ranked = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)
summary_sentences = [text for text, _ in ranked[:num_sentences_in_summary]]

# Step 6: print the summary, best-scoring sentence first.
print("Extractive Summary:")
for sentence in summary_sentences:
    print(sentence.strip())


Extractive Summary:
Extractive summarization involves selecting a subset of sentences from a text to create a shorter version that retains the most critical information.
One of the primary applications of NLP is sentiment analysis, which determines the sentiment or emotional tone of a piece of text.
Natural language processing (NLP) is a subfield of artificial intelligence (AI) that focuses on the interaction between computers and humans through natural language.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
