In [3]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import os

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Read the content of an article
def read_article(article_path):
    """Return the entire contents of the text file at ``article_path``.

    The file is decoded as UTF-8; byte sequences that cannot be decoded are
    dropped (``errors='ignore'``) rather than raising an error.
    """
    with open(article_path, mode='r', encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    return contents

# Path of the article to summarize (hard-coded; looks like a Kaggle dataset
# input path -- adjust for other environments).
# NOTE(review): the read below runs at module top level, outside the
# `if __name__ == "__main__"` guard, so the file is opened whenever this
# code is executed or imported.
article_path = "/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/business/001.txt"
article_text = read_article(article_path)

# Query processing module
def preprocess(text):
    """Split ``text`` into sentences and a filtered list of content words.

    Returns:
        A ``(sentences, words)`` tuple. ``sentences`` is the output of
        ``sent_tokenize``. ``words`` holds the lowercased tokens of
        ``word_tokenize`` that are purely alphanumeric and are not English
        stopwords; duplicates are kept, in document order.
    """
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))

    words = []
    for token in tokens:
        # Drop punctuation and mixed tokens (anything not purely alphanumeric).
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        words.append(lowered)
    return sentences, words

# Sentence selection module
def score_sentences(sentences, words):
    """Score each sentence by the summed frequency of the words it contains.

    Args:
        sentences: Iterable of sentence strings to score.
        words: Iterable of word tokens from which the frequency table is
            built (typically the lowercased content words of the document).

    Returns:
        A dict mapping sentence text to an integer score: the sum, over every
        token of the sentence that occurs in ``words``, of that word's
        frequency. Sentences containing no such token are omitted. If the
        same sentence text appears more than once, its scores accumulate
        under the single key.
    """
    # Matching is case-insensitive on both sides. The vocabulary is normally
    # lowercased already, so comparing raw sentence tokens against it would
    # silently skip every capitalised word (sentence-initial words, names).
    word_freq = FreqDist(word.lower() for word in words)
    sentence_scores = {}

    for sentence in sentences:
        matched = [
            word_freq[token]
            for token in (raw.lower() for raw in word_tokenize(sentence))
            if token in word_freq
        ]
        # Every matched word has frequency >= 1, so an empty list means the
        # sentence shares no word with the vocabulary and is left out.
        if matched:
            sentence_scores[sentence] = sentence_scores.get(sentence, 0) + sum(matched)
    return sentence_scores

# Summarizer module
def generate_summary(query, text, num_sentences=3):
    """Build a query-biased extractive summary of ``text``.

    Each sentence's frequency score (from ``score_sentences``) is multiplied
    by the number of query tokens that occur in the lowercased sentence, so
    sentences matching no query token end up with a score of 0. The
    ``num_sentences`` highest-scoring sentences are joined with single spaces,
    highest score first; ties keep their original relative order.

    Args:
        query: Free-text query; it is lowercased and tokenized.
        text: The document to summarize.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The selected sentences joined into one string.
    """
    sentences, words = preprocess(text)
    base_scores = score_sentences(sentences, words)
    query_terms = word_tokenize(query.lower())

    weighted = {}
    for sentence, base in base_scores.items():
        sentence_tokens = word_tokenize(sentence.lower())
        # Repeated query terms count once per occurrence in the query.
        hits = sum(1 for term in query_terms if term in sentence_tokens)
        weighted[sentence] = hits * base

    # sorted() is stable, so equally scored sentences stay in document order.
    ranked = sorted(weighted.items(), key=lambda pair: pair[1], reverse=True)
    chosen = [sentence for sentence, _ in ranked[:num_sentences]]
    return ' '.join(chosen)

# Main program
if __name__ == "__main__":
    # Summarize the module-level article text for a sample query and print
    # the query together with the resulting summary.
    query = "business news"  # Adjust the query as needed
    summary = generate_summary(query=query, text=article_text)
    print("\n")
    for label, value in (("Query:", query), ("Summary:", summary)):
        print(label, value)



Query: business news
Summary: But its own internet business, AOL, had has mixed fortunes. Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier. The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales.
