# Text summarization using text extraction

### There are two main approaches to automatic text summarization:
- Abstractive
- Extractive 
<br />
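The main difference between them is how information is extracted from the document and how the summary is generated: an extractive summary reuses the most important sentences of the original text verbatim, whereas an abstractive summary is written in new sentences that paraphrase the original. This notebook implements the extractive approach.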

### <i>Let's get started</i>

In [1]:
text = """
In today's fast-paced world, technology plays a crucial role in every aspect of our lives. From communication to transportation, healthcare to entertainment, technology has revolutionized the way we live, work, and interact with the world around us. With the advent of artificial intelligence, machine learning, and big data analytics, we are witnessing unprecedented advancements in various fields. However, along with the benefits come challenges such as cybersecurity threats, privacy concerns, and ethical dilemmas. As we navigate this digital age, it's essential to harness the power of technology responsibly, ensuring that it serves the greater good while respecting individual rights and societal values. By leveraging innovation and collaboration, we can address these challenges and create a brighter future for generations to come. """

In [2]:
# !pip install -U spacy
# !python -m spacy download en_core_web_sm

In [3]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [4]:
stopwords = list(STOP_WORDS)

In [5]:
nlp = spacy.load('en_core_web_sm')

In [6]:
document = nlp(text)

In [7]:
# Collect the raw text of every token spaCy produced and show the list.
tokens = [tok.text for tok in document]
print(tokens)

['\n', 'In', 'today', "'s", 'fast', '-', 'paced', 'world', ',', 'technology', 'plays', 'a', 'crucial', 'role', 'in', 'every', 'aspect', 'of', 'our', 'lives', '.', 'From', 'communication', 'to', 'transportation', ',', 'healthcare', 'to', 'entertainment', ',', 'technology', 'has', 'revolutionized', 'the', 'way', 'we', 'live', ',', 'work', ',', 'and', 'interact', 'with', 'the', 'world', 'around', 'us', '.', 'With', 'the', 'advent', 'of', 'artificial', 'intelligence', ',', 'machine', 'learning', ',', 'and', 'big', 'data', 'analytics', ',', 'we', 'are', 'witnessing', 'unprecedented', 'advancements', 'in', 'various', 'fields', '.', 'However', ',', 'along', 'with', 'the', 'benefits', 'come', 'challenges', 'such', 'as', 'cybersecurity', 'threats', ',', 'privacy', 'concerns', ',', 'and', 'ethical', 'dilemmas', '.', 'As', 'we', 'navigate', 'this', 'digital', 'age', ',', 'it', "'s", 'essential', 'to', 'harness', 'the', 'power', 'of', 'technology', 'responsibly', ',', 'ensuring', 'that', 'it', 'se

In [8]:
# Treat the newline character as punctuation too, so that the '\n' tokens
# spaCy emits are skipped when the words are counted.
punctuation = punctuation + '\n'
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n'

In [9]:
# Count, case-insensitively, how often each token occurs. Stop words and
# anything found in the punctuation string are left out of the tally.
word_frequrncies = {}
for word in document:
    key = word.text.lower()
    if key in stopwords or key in punctuation:
        continue
    word_frequrncies[key] = word_frequrncies.get(key, 0) + 1

In [10]:
word_frequrncies

{'today': 1,
 'fast': 1,
 'paced': 1,
 'world': 2,
 'technology': 3,
 'plays': 1,
 'crucial': 1,
 'role': 1,
 'aspect': 1,
 'lives': 1,
 'communication': 1,
 'transportation': 1,
 'healthcare': 1,
 'entertainment': 1,
 'revolutionized': 1,
 'way': 1,
 'live': 1,
 'work': 1,
 'interact': 1,
 'advent': 1,
 'artificial': 1,
 'intelligence': 1,
 'machine': 1,
 'learning': 1,
 'big': 1,
 'data': 1,
 'analytics': 1,
 'witnessing': 1,
 'unprecedented': 1,
 'advancements': 1,
 'fields': 1,
 'benefits': 1,
 'come': 2,
 'challenges': 2,
 'cybersecurity': 1,
 'threats': 1,
 'privacy': 1,
 'concerns': 1,
 'ethical': 1,
 'dilemmas': 1,
 'navigate': 1,
 'digital': 1,
 'age': 1,
 'essential': 1,
 'harness': 1,
 'power': 1,
 'responsibly': 1,
 'ensuring': 1,
 'serves': 1,
 'greater': 1,
 'good': 1,
 'respecting': 1,
 'individual': 1,
 'rights': 1,
 'societal': 1,
 'values': 1,
 'leveraging': 1,
 'innovation': 1,
 'collaboration': 1,
 'address': 1,
 'create': 1,
 'brighter': 1,
 'future': 1,
 'generati

In [11]:
# Highest raw count reached by any word; used as the divisor when the counts
# are normalised.
max_frequency = max(word_frequrncies.values())
max_frequency

3

In [12]:
# Words whose count equals the highest count, i.e. the most frequent word(s).
# The comparison is on the count, not on the length of the word.
listOfMax = [word for word, count in word_frequrncies.items() if count == max_frequency]
listOfMax

['technology']

In [13]:
# Scale every count by the highest count so that all values fall in (0, 1].
for word, count in word_frequrncies.items():
    word_frequrncies[word] = count / max_frequency
word_frequrncies

{'today': 0.3333333333333333,
 'fast': 0.3333333333333333,
 'paced': 0.3333333333333333,
 'world': 0.6666666666666666,
 'technology': 1.0,
 'plays': 0.3333333333333333,
 'crucial': 0.3333333333333333,
 'role': 0.3333333333333333,
 'aspect': 0.3333333333333333,
 'lives': 0.3333333333333333,
 'communication': 0.3333333333333333,
 'transportation': 0.3333333333333333,
 'healthcare': 0.3333333333333333,
 'entertainment': 0.3333333333333333,
 'revolutionized': 0.3333333333333333,
 'way': 0.3333333333333333,
 'live': 0.3333333333333333,
 'work': 0.3333333333333333,
 'interact': 0.3333333333333333,
 'advent': 0.3333333333333333,
 'artificial': 0.3333333333333333,
 'intelligence': 0.3333333333333333,
 'machine': 0.3333333333333333,
 'learning': 0.3333333333333333,
 'big': 0.3333333333333333,
 'data': 0.3333333333333333,
 'analytics': 0.3333333333333333,
 'witnessing': 0.3333333333333333,
 'unprecedented': 0.3333333333333333,
 'advancements': 0.3333333333333333,
 'fields': 0.3333333333333333,
 

In [14]:
# Materialise the sentence spans detected by spaCy into a list.
sentence_tokens = list(document.sents)
sentence_tokens

[
 In today's fast-paced world, technology plays a crucial role in every aspect of our lives.,
 From communication to transportation, healthcare to entertainment, technology has revolutionized the way we live, work, and interact with the world around us.,
 With the advent of artificial intelligence, machine learning, and big data analytics, we are witnessing unprecedented advancements in various fields.,
 However, along with the benefits come challenges such as cybersecurity threats, privacy concerns, and ethical dilemmas.,
 As we navigate this digital age, it's essential to harness the power of technology responsibly, ensuring that it serves the greater good while respecting individual rights and societal values.,
 By leveraging innovation and collaboration, we can address these challenges and create a brighter future for generations to come.]

In [15]:
# Score each sentence as the sum of the normalised frequencies of its words.
# Tokens without a frequency entry (stop words, punctuation) contribute
# nothing, and a sentence with no scored token gets no entry at all.
sentence_scores = {}
for snt in sentence_tokens:
    for word in snt:
        weight = word_frequrncies.get(word.text.lower())
        if weight is None:
            continue
        sentence_scores[snt] = sentence_scores.get(snt, 0) + weight

In [16]:
sentence_scores

{
 In today's fast-paced world, technology plays a crucial role in every aspect of our lives.: 4.333333333333333,
 From communication to transportation, healthcare to entertainment, technology has revolutionized the way we live, work, and interact with the world around us.: 4.666666666666667,
 With the advent of artificial intelligence, machine learning, and big data analytics, we are witnessing unprecedented advancements in various fields.: 4.0,
 However, along with the benefits come challenges such as cybersecurity threats, privacy concerns, and ethical dilemmas.: 3.666666666666667,
 As we navigate this digital age, it's essential to harness the power of technology responsibly, ensuring that it serves the greater good while respecting individual rights and societal values.: 6.333333333333331,
 By leveraging innovation and collaboration, we can address these challenges and create a brighter future for generations to come.: 4.000000000000001}

In [17]:
from heapq import nlargest

In [18]:
select_length = 3

In [19]:
# Keep the `select_length` highest-scoring sentences, best score first.
summary = nlargest(select_length, sentence_scores, key = sentence_scores.get)

In [20]:
summary

[As we navigate this digital age, it's essential to harness the power of technology responsibly, ensuring that it serves the greater good while respecting individual rights and societal values.,
 From communication to transportation, healthcare to entertainment, technology has revolutionized the way we live, work, and interact with the world around us.,
 
 In today's fast-paced world, technology plays a crucial role in every aspect of our lives.]

## All in one big function 

In [21]:
from heapq import nlargest
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

def Summarize(texto: str, target: int) -> str:
    """Return an extractive summary of ``texto``.

    Words are counted case-insensitively, ignoring stop words, punctuation
    and whitespace tokens. The counts are divided by the highest count, each
    sentence is scored as the sum of the normalised frequencies of its words,
    and the ``target`` best-scoring sentences are joined with single spaces.

    Args:
        texto: The text to summarize.
        target: Maximum number of sentences to keep.

    Returns:
        The selected sentences joined by spaces, highest score first (not in
        document order). An empty string is returned when ``texto`` contains
        no countable word or when ``target`` is not positive.
    """
    # Loading the spaCy pipeline is slow, so it is done once and cached on
    # the function object for later calls (e.g. repeated button clicks).
    nlp = getattr(Summarize, "_nlp", None)
    if nlp is None:
        nlp = Summarize._nlp = spacy.load('en_core_web_sm')

    # A set gives O(1) membership tests.
    stopwords = frozenset(STOP_WORDS)
    # Checked with ``in`` on a string, i.e. as a substring test; this filters
    # the ASCII symbols ('$', '+', '<', ...) that ``is_punct`` does not cover.
    excluded_chars = punctuation + '\n'

    document = nlp(texto)

    # Raw, case-insensitive count of every meaningful token.
    word_frequencies = {}
    for token in document:
        # Whitespace runs ("\n\n", "  ") and punctuation missing from
        # ``string.punctuation`` (curly quotes, "...", "--") are not words.
        if token.is_space or token.is_punct:
            continue
        word = token.text.lower()
        if word in stopwords or word in excluded_chars:
            continue
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    # Empty or stop-word-only input: nothing to rank, and max() below would
    # raise ValueError on an empty sequence.
    if not word_frequencies:
        return ""

    # Normalise the counts into the range (0, 1].
    max_frequency = max(word_frequencies.values())
    weights = {
        word: count / max_frequency
        for word, count in word_frequencies.items()
    }

    # A sentence's score is the sum of the weights of its counted words.
    sentence_scores = {}
    for sentence in document.sents:
        for token in sentence:
            weight = weights.get(token.text.lower())
            if weight is not None:
                sentence_scores[sentence] = sentence_scores.get(sentence, 0) + weight

    summary = nlargest(target, sentence_scores, key=sentence_scores.get)

    return " ".join(sentence.text for sentence in summary)

In [22]:
# !pip install customtkinter
# !pip install tk

## The GUI

In [24]:
import customtkinter

def theGui():
    """Build the Summarizer window and run its event loop.

    The window stacks, top to bottom: a title label, an input textbox, a
    "Summarize" button, an entry for the number of sentences and an output
    textbox. Pressing the button summarizes the input text with
    ``Summarize`` and writes the result, or an error message, to the output
    textbox. The call returns when ``app.mainloop()`` returns.
    """
    customtkinter.set_appearance_mode("System")  # Modes: system (default), light, dark
    customtkinter.set_default_color_theme("blue")  # Themes: blue (default), dark-blue, green

    app = customtkinter.CTk()  # create CTk window like you do with the Tk window
    app.title("Summarizer")
    app.geometry("720x680")
    app.grid_columnconfigure(0, weight=1)

    def button_function():
        """Summarize the input text and display the result or an error."""
        print("Done")
        # Always start from an empty output box.
        output.delete("0.0", "end")
        text = intextbox.get("0.0", "end")

        # Only the number parsing may produce the "not valid number" message;
        # failures inside Summarize are reported separately below.
        try:
            nOfSent = int(entry.get())
        except ValueError:
            nOfSent = 0
        if nOfSent < 1:
            output.insert("0.0", "Not valid number of sentences")
            return

        if not text.strip():
            output.insert("0.0", "No text to summarize")
            return

        # Boundary of the GUI callback: show the real cause instead of
        # letting the exception vanish into Tk's callback handler.
        try:
            thesuma = Summarize(text, nOfSent)
        except Exception as exc:
            output.insert("0.0", f"Could not summarize the text: {exc}")
            return
        output.insert("0.0", thesuma)

    label = customtkinter.CTkLabel(
        app,
        text="Text summarization using text extraction",
        fg_color="transparent",
        font=('poppins medium', 20),
    )
    label.grid(column=0, row=0, padx=10, pady=10)

    intextbox = customtkinter.CTkTextbox(app, width=500, height=250)
    intextbox.grid(column=0, row=1, padx=10, pady=10)

    # Use CTkButton instead of tkinter Button
    button = customtkinter.CTkButton(master=app, text="Summarize", command=button_function)
    button.grid(column=0, row=2, padx=10)

    entry = customtkinter.CTkEntry(app, placeholder_text="n# of sentences")
    entry.grid(column=0, row=3, padx=10, pady=10)

    output = customtkinter.CTkTextbox(app, width=500, height=250)
    output.grid(column=0, row=4, padx=10, pady=10)

    app.mainloop()
theGui()

Done
