<a href="https://colab.research.google.com/github/mayankDhiman/wavelet-tree-text-summarisation/blob/main/text_summarisation_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Text Summarisation Neural Network Model

In [3]:
import nltk
from nltk.corpus import stopwords

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx

In [6]:
text='The coronavirus outbreak came to light on December 31, 2019 when China informed the World Health Organisation of a cluster of cases of pneumonia of an unknown cause in Wuhan City in Hubei Province. Subsequently the disease spread to more Provinces in China, and to the rest of the world. The WHO has now declared it a pandemic. The virus has been named SARS-CoV-2 and the disease is now called COVID-19'

In [7]:
text=text.split(".")

In [8]:
print(text)

['The coronavirus outbreak came to light on December 31, 2019 when China informed the World Health Organisation of a cluster of cases of pneumonia of an unknown cause in Wuhan City in Hubei Province', ' Subsequently the disease spread to more Provinces in China, and to the rest of the world', ' The WHO has now declared it a pandemic', ' The virus has been named SARS-CoV-2 and the disease is now called COVID-19']


In [9]:
corpus=[]
for sent in text:
  corpus.append(sent.replace("[^a-zA-z]"," ").split(" "))
print(corpus)

[['The', 'coronavirus', 'outbreak', 'came', 'to', 'light', 'on', 'December', '31,', '2019', 'when', 'China', 'informed', 'the', 'World', 'Health', 'Organisation', 'of', 'a', 'cluster', 'of', 'cases', 'of', 'pneumonia', 'of', 'an', 'unknown', 'cause', 'in', 'Wuhan', 'City', 'in', 'Hubei', 'Province'], ['', 'Subsequently', 'the', 'disease', 'spread', 'to', 'more', 'Provinces', 'in', 'China,', 'and', 'to', 'the', 'rest', 'of', 'the', 'world'], ['', 'The', 'WHO', 'has', 'now', 'declared', 'it', 'a', 'pandemic'], ['', 'The', 'virus', 'has', 'been', 'named', 'SARS-CoV-2', 'and', 'the', 'disease', 'is', 'now', 'called', 'COVID-19']]


In [10]:
corpus.pop()
print(corpus)

[['The', 'coronavirus', 'outbreak', 'came', 'to', 'light', 'on', 'December', '31,', '2019', 'when', 'China', 'informed', 'the', 'World', 'Health', 'Organisation', 'of', 'a', 'cluster', 'of', 'cases', 'of', 'pneumonia', 'of', 'an', 'unknown', 'cause', 'in', 'Wuhan', 'City', 'in', 'Hubei', 'Province'], ['', 'Subsequently', 'the', 'disease', 'spread', 'to', 'more', 'Provinces', 'in', 'China,', 'and', 'to', 'the', 'rest', 'of', 'the', 'world'], ['', 'The', 'WHO', 'has', 'now', 'declared', 'it', 'a', 'pandemic']]


In [11]:
def sent_sim(s1,s2,stopwords=None):
  if stopwords is None:
    stopwords=[]
  
  s1=[w.lower() for w in s1]
  s2=[w.lower() for w in s2]

  all=list(set(s1+s2))
  v1=[0]*len(all)
  v2=[0]*len(all)
  stopwords = set(stopwords.words("english"))

  for w in s1:
    if w in stopwords:
      continue
    v1[all.index(w)] +=1
  
  for w in s2:
    if w in stopwords:
      continue
    v2[all.index(w)] +=1
  
  return 1 - cosine_distance(v1,v2)

In [12]:
def build_simi_mat(sent,stopword):
  simi_mat=np.zeros((len(sent), len(sent)))
  
  for idx1 in range(len(sent)):
    for idx2 in range(len(sent)):
      if idx1==idx2:
        continue
      simi_mat[idx1][idx2]=sent_sim(sent[idx1], sent[idx2] , stopwords)
  
  return simi_mat

In [13]:
def gen_summary(top_n=5):
  stop_words=stopwords.words('english')
  summerize_text=[]
  txt=corpus
  sent_simi_mat = build_simi_mat(txt,stop_words)

  sent_simi_graph = nx.from_numpy_array(sent_simi_mat)
  score=nx.pagerank(sent_simi_graph)

  ranked_sent = sorted(((score[i],s) for i,s in enumerate(txt)), reverse=True)
  print("top ranked senetnce" , ranked_sent)

  for i in range(top_n):
    summerize_text.append(" ".join(ranked_sent[i][1]))

  print("Summarize Text: \n",".".join(summerize_text))  

In [14]:
gen_summary(2)

top ranked senetnce [(0.48648582432442083, ['', 'Subsequently', 'the', 'disease', 'spread', 'to', 'more', 'Provinces', 'in', 'China,', 'and', 'to', 'the', 'rest', 'of', 'the', 'world']), (0.3500905928819326, ['', 'The', 'WHO', 'has', 'now', 'declared', 'it', 'a', 'pandemic']), (0.16342358279364622, ['The', 'coronavirus', 'outbreak', 'came', 'to', 'light', 'on', 'December', '31,', '2019', 'when', 'China', 'informed', 'the', 'World', 'Health', 'Organisation', 'of', 'a', 'cluster', 'of', 'cases', 'of', 'pneumonia', 'of', 'an', 'unknown', 'cause', 'in', 'Wuhan', 'City', 'in', 'Hubei', 'Province'])]
Summarize Text: 
  Subsequently the disease spread to more Provinces in China, and to the rest of the world. The WHO has now declared it a pandemic
