In [1]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance

In [2]:
import numpy as np
import networkx as nx

In [3]:
def read_article(filename):
  """Read the first line of a text file and split it into tokenised sentences.

  Only the first line of the file is processed; any further lines are ignored.
  The line is split into sentences on the literal separator '. ' and each
  sentence is split into words on single spaces.

  Args:
    filename: Path of the text file to read.

  Returns:
    A list of sentences, each a list of word strings. An empty file yields
    an empty list.

  Raises:
    OSError: If the file cannot be opened.
  """
  # The context manager guarantees the handle is closed even if reading fails.
  with open(filename , 'r') as file:
    first_line = file.readline()

  # readline() returns '' only for an empty file; there is nothing to split.
  if not first_line:
    return []

  sentences = []
  for sentence in first_line.split('. '):
    # NOTE(review): str.replace does a literal substring replacement, so the
    # regex-looking argument below is not treated as a pattern and punctuation
    # is left untouched. It is kept as-is so existing summaries do not change.
    sentences.append(sentence.replace('[^a-zA-Z' , ' ').split(' '))
  return sentences

In [4]:
def sentence_similarity(sent1 , sent2 , stop_words = None):
  """Return the cosine similarity between two tokenised sentences.

  Both sentences are lower-cased and turned into word-count vectors over
  their combined vocabulary; words found in ``stop_words`` are not counted.

  Args:
    sent1: First sentence, as a list of word strings.
    sent2: Second sentence, as a list of word strings.
    stop_words: Optional iterable of words to ignore. Words are compared
      after lower-casing the sentence tokens.

  Returns:
    The cosine similarity as a float. Returns 0.0 when either sentence has
    no countable words, where the cosine is undefined.
  """
  stop_set = set(stop_words) if stop_words is not None else set()

  words1 = [w.lower() for w in sent1]
  words2 = [w.lower() for w in sent2]

  # Map every distinct word to a fixed vector position (constant-time lookup).
  position = {word: idx for idx, word in enumerate(set(words1 + words2))}

  vector1 = np.zeros(len(position))
  vector2 = np.zeros(len(position))
  for word in words1:
    if word not in stop_set:
      vector1[position[word]] += 1
  for word in words2:
    if word not in stop_set:
      vector2[position[word]] += 1

  # A sentence made only of stop words (or an empty one) gives a zero vector;
  # dividing by its zero norm would produce NaN, so treat it as "no similarity".
  norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
  if norm_product == 0:
    return 0.0

  return float(np.dot(vector1 , vector2) / norm_product)

In [5]:
def build_similarity_matrix(sentences , stopwords):
  """Build the pairwise sentence-similarity matrix.

  Args:
    sentences: List of tokenised sentences (each a list of words).
    stopwords: Stop words forwarded to ``sentence_similarity``.

  Returns:
    A square numpy array with one row and column per sentence. Entry
    ``[i][j]`` holds the similarity of sentence ``i`` to sentence ``j``;
    the diagonal is left at zero.
  """
  count = len(sentences)
  matrix = np.zeros([count , count])
  for row , first in enumerate(sentences):
    for col , second in enumerate(sentences):
      # A sentence is never compared with itself.
      if row != col:
        matrix[row][col] = sentence_similarity(first , second , stopwords)
  return matrix

In [6]:
def summarize_text(filename , top_n=5):
  """Print an extractive summary of the article stored in ``filename``.

  Sentences are read with ``read_article``, compared pairwise with
  ``build_similarity_matrix`` using NLTK's English stop words, and ranked
  with PageRank over the resulting similarity graph. The highest-ranked
  sentences are joined with '. ' and printed.

  Args:
    filename: Path of the article text file.
    top_n: Maximum number of sentences to print. If the article has fewer
      sentences, all of them are printed.

  Returns:
    None. The summary is written to standard output.
  """
  stop_words = stopwords.words('english')
  sentences = read_article(filename)

  similarity_matrix = build_similarity_matrix(sentences , stop_words)

  sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
  scores = nx.pagerank(sentence_similarity_graph)

  # Highest PageRank score first.
  ranked_sentence = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

  # Clamp so that asking for more sentences than exist does not raise IndexError.
  keep = min(top_n , len(ranked_sentence))
  summary = [' '.join(ranked_sentence[i][1]) for i in range(keep)]
  print("Summarize Text: \n", ". ".join(summary))



In [7]:
# Print a summary of 'cricket.txt' limited to the two top-ranked sentences.
summarize_text('cricket.txt' , 2)

Summarize Text: 
 Cricket is a bat-and-ball game played between two teams of eleven players on a field at the centre of which is a 22-yard (20-metre) pitch with a wicket at each end, each comprising two bails balanced on three stumps. Means of dismissal include being bowled, when the ball hits the stumps and dislodges the bails, and by the fielding side catching the ball after it is hit by the bat, but before it hits the ground
