In [1]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation

In [2]:
text = """ Load shedding is one of the common incidents in our life. We almost every day experience load shedding. When the power supply can not meet the demand, it leads to load shedding. It causes a crisis in the electricity supply. The unplanned distribution of electricity is the main cause of load shedding. The reason can be the illegal connection and shortage of the production of electricity. People suffer a lot because of load shedding. They can’t work properly. Students can’t study properly. """

In [3]:
len(text)

494

In [4]:
nlp = spacy.load("en_core_web_sm")

In [5]:
doc = nlp(text)

In [6]:
tokens = [token.text for token in doc] 
#Here use list comprehension
#In spaCy, the token.text is  attribute, it's splite every word.

print(tokens)

[' ', 'Load', 'shedding', 'is', 'one', 'of', 'the', 'common', 'incidents', 'in', 'our', 'life', '.', 'We', 'almost', 'every', 'day', 'experience', 'load', 'shedding', '.', 'When', 'the', 'power', 'supply', 'can', 'not', 'meet', 'the', 'demand', ',', 'it', 'leads', 'to', 'load', 'shedding', '.', 'It', 'causes', 'a', 'crisis', 'in', 'the', 'electricity', 'supply', '.', 'The', 'unplanned', 'distribution', 'of', 'electricity', 'is', 'the', 'main', 'cause', 'of', 'load', 'shedding', '.', 'The', 'reason', 'can', 'be', 'the', 'illegal', 'connection', 'and', 'shortage', 'of', 'the', 'production', 'of', 'electricity', '.', 'People', 'suffer', 'a', 'lot', 'because', 'of', 'load', 'shedding', '.', 'They', 'ca', 'n’t', 'work', 'properly', '.', 'Students', 'ca', 'n’t', 'study', 'properly', '.']


In [7]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [8]:
word_freq = {} #Here use dictionary for count word

stop_words = list(STOP_WORDS) #STOP_WORDS is a keyword of spacy, list of Stop word.

for word in doc: 
  if word.text.lower() not in  stop_words: #here word.text means it represent doc word,
        #if we want to access exect every token in word then we should use word.text
        #word.text.lower() convert Upper case to lower case word 
        #if this loop specific word is absent from stop_words list then this loop will continue
    if word.text.lower() not in punctuation:
            #if this loop specific word is absent from punctuation list then this loop will continue
        if word.text not in word_freq.keys():
                #if this loop specific word is absent from word_freq.keys() dictionary then this loop will continue and set value 1.
            word_freq[word.text] = 1
        else:
            word_freq[word.text] += 1  
                #here 1 will be added if the word already available in word_freq.keys()


In [9]:
print(word_freq) #every word frequency list

{' ': 1, 'Load': 1, 'shedding': 5, 'common': 1, 'incidents': 1, 'life': 1, 'day': 1, 'experience': 1, 'load': 4, 'power': 1, 'supply': 2, 'meet': 1, 'demand': 1, 'leads': 1, 'causes': 1, 'crisis': 1, 'electricity': 3, 'unplanned': 1, 'distribution': 1, 'main': 1, 'cause': 1, 'reason': 1, 'illegal': 1, 'connection': 1, 'shortage': 1, 'production': 1, 'People': 1, 'suffer': 1, 'lot': 1, 'work': 1, 'properly': 2, 'Students': 1, 'study': 1}


In [10]:
max_freq = max(word_freq.values())
print(max_freq) #here max frequency is 5

5


In [11]:
print(word_freq.keys())

dict_keys([' ', 'Load', 'shedding', 'common', 'incidents', 'life', 'day', 'experience', 'load', 'power', 'supply', 'meet', 'demand', 'leads', 'causes', 'crisis', 'electricity', 'unplanned', 'distribution', 'main', 'cause', 'reason', 'illegal', 'connection', 'shortage', 'production', 'People', 'suffer', 'lot', 'work', 'properly', 'Students', 'study'])


In [12]:
for word in word_freq.keys(): 
  word_freq[word] = word_freq[word] / max_freq #Here we diveded every word frequency with 5

In [13]:
print(word_freq)

{' ': 0.2, 'Load': 0.2, 'shedding': 1.0, 'common': 0.2, 'incidents': 0.2, 'life': 0.2, 'day': 0.2, 'experience': 0.2, 'load': 0.8, 'power': 0.2, 'supply': 0.4, 'meet': 0.2, 'demand': 0.2, 'leads': 0.2, 'causes': 0.2, 'crisis': 0.2, 'electricity': 0.6, 'unplanned': 0.2, 'distribution': 0.2, 'main': 0.2, 'cause': 0.2, 'reason': 0.2, 'illegal': 0.2, 'connection': 0.2, 'shortage': 0.2, 'production': 0.2, 'People': 0.2, 'suffer': 0.2, 'lot': 0.2, 'work': 0.2, 'properly': 0.4, 'Students': 0.2, 'study': 0.2}


In [14]:
sent_tokens = [sent for sent in doc.sents] #Here sentences are tokenized with doc.sents attribute
print(sent_tokens)

[ Load shedding is one of the common incidents in our life., We almost every day experience load shedding., When the power supply can not meet the demand, it leads to load shedding., It causes a crisis in the electricity supply., The unplanned distribution of electricity is the main cause of load shedding., The reason can be the illegal connection and shortage of the production of electricity., People suffer a lot because of load shedding., They can’t work properly., Students can’t study properly.]


In [15]:
sent_score = {}

In [16]:
for sent in sent_tokens:
  for word in sent:
    if word.text.lower()in word_freq.keys():
      if sent not in sent_score.keys():
        sent_score[sent] = word_freq[word.text.lower()]
      else:
        sent_score[sent] += word_freq[word.text.lower()]  
# Here the frequency of the sentence is extracted

In [17]:
sent_score #Here extracted value of every sentence frequency

{ Load shedding is one of the common incidents in our life.: 2.6000000000000005,
 We almost every day experience load shedding.: 2.2,
 When the power supply can not meet the demand, it leads to load shedding.: 3.0,
 It causes a crisis in the electricity supply.: 1.4,
 The unplanned distribution of electricity is the main cause of load shedding.: 3.2,
 The reason can be the illegal connection and shortage of the production of electricity.: 1.6,
 People suffer a lot because of load shedding.: 2.2,
 They can’t work properly.: 0.6000000000000001,
 Students can’t study properly.: 0.6000000000000001}

In [18]:
from heapq import nlargest

In [19]:
len(sent_score) * 0.3 #The expression len(sent_score) * 0.3 calculates 30% of the length of the sent_score dictionary. 

2.6999999999999997

In [20]:
summary = nlargest(n = 4, iterable= sent_score, key = sent_score.get) #Specifies that we want to retrieve the top 4 elements.
#retrieving the top elements from the sent_score dictionary, and the sorting is done based on the scores associated with each 
#sentence.

In [21]:
print(summary) 

[The unplanned distribution of electricity is the main cause of load shedding., When the power supply can not meet the demand, it leads to load shedding.,  Load shedding is one of the common incidents in our life., We almost every day experience load shedding.]


In [22]:
final_summary = [word.text for word in summary] #final_summary variable becomes a list of the text of each word in the sentences 
#contained in the summarylist.

In [23]:
print(final_summary) #list of final_summary

['The unplanned distribution of electricity is the main cause of load shedding.', 'When the power supply can not meet the demand, it leads to load shedding.', ' Load shedding is one of the common incidents in our life.', 'We almost every day experience load shedding.']


In [24]:
summary = " ".join(final_summary) #joined  all sentence.

In [25]:
print(summary)

The unplanned distribution of electricity is the main cause of load shedding. When the power supply can not meet the demand, it leads to load shedding.  Load shedding is one of the common incidents in our life. We almost every day experience load shedding.


Orginal text data: 

Load shedding is one of the common incidents in our life. We almost every day experience load shedding. When the power supply can not meet the demand, it leads to load shedding. It causes a crisis in the electricity supply. The unplanned distribution of electricity is the main cause of load shedding. The reason can be the illegal connection and shortage of the production of electricity. People suffer a lot because of load shedding. They can’t work properly. Students can’t study properly.

In [26]:
len(summary) #Now we get len 256 and earlier our original text len was 494

256

In [27]:
len(summary)/ len(text) #calculates the ratio of the length of the summary list to the length of the text object. 

0.5182186234817814