**TEXT SUMMARIZATION**

> **Extractive summarization**


In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
import re
from google.colab import files  # Import the 'files' module for file downloads

In [2]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
def read_article(file_content):
    sentences = nltk.sent_tokenize(file_content)
    return sentences

def preprocess_sentence(sentence):
    return re.sub(r'[^a-zA-Z]', ' ', sentence).lower()

In [4]:
def sentence_similarity(sent1, sent2):
    sent1 = nltk.word_tokenize(sent1)
    sent2 = nltk.word_tokenize(sent2)
    all_words = list(set(sent1 + sent2))
    vector1 = [sent1.count(word) for word in all_words]
    vector2 = [sent2.count(word) for word in all_words]
    return 1 - cosine_distance(vector1, vector2)

In [5]:
def build_similarity_matrix(sentences):
    num_sentences = len(sentences)
    similarity_matrix = np.zeros((num_sentences, num_sentences))

    for i in range(num_sentences):
        for j in range(num_sentences):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])

    return similarity_matrix

In [6]:
def generate_summary(file_content):
    stop_words = set(stopwords.words("english"))
    sentences = read_article(file_content)

    # Extractive Summarization
    sentence_similarity_matrix = build_similarity_matrix(sentences)
    sentence_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_graph)
    ranked_sentences = sorted(((scores[i], sentence) for i, sentence in enumerate(sentences)), reverse=True)
    top_sentences = [sentence for score, sentence in ranked_sentences[:5]]  # Summarize into 5 sentences

    summarized_text = " ".join(top_sentences)  # Summarize into a paragraph
    return summarized_text

In [7]:
# User Interaction
print("Text Summarization Tool")
input_choice = input("Choose input method (text/file): ").lower()

# Initialize file_content as an empty string
file_content = ""

if input_choice == "text":
    file_content = input("Enter the text for summarization: ")
elif input_choice == "file":
    # Upload your text file to Google Colab
    uploaded_file = files.upload()

    # Read the uploaded file
    file_content = list(uploaded_file.values())[0].decode("utf-8")
else:
    print("Invalid input method. Please choose 'text' or 'file'.")

try:
    # Generate the summary
    summary = generate_summary(file_content)

    # Print the summary
    print("Summarized Text:\n", summary)

    # Save the summary to a temporary file
    with open("summary.txt", "w") as output_file:
        output_file.write(summary)

    # Provide a link for the user to download the summarized text
    files.download("summary.txt")  # Download the summarized text

except Exception as e:
    print(f"An error occurred: {e}")

Text Summarization Tool
Choose input method (text/file): text
Enter the text for summarization: Batman's origin story features him swearing vengeance against criminals after witnessing the murder of his parents Thomas and Martha as a child, a vendetta tempered with the ideal of justice. He trains himself physically and intellectually, crafts a bat-inspired persona, and monitors the Gotham streets at night.
Summarized Text:
 He trains himself physically and intellectually, crafts a bat-inspired persona, and monitors the Gotham streets at night. Batman's origin story features him swearing vengeance against criminals after witnessing the murder of his parents Thomas and Martha as a child, a vendetta tempered with the ideal of justice.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>