<a href="https://colab.research.google.com/github/nisha1365/Hands-on-NLP/blob/main/Text_Summarization_Cosine_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Summarization - Cosine Similarity

## Preparing the enviroment

In [3]:
import re
import nltk 
import string
import numpy as np
import networkx as nx #graph
from nltk.cluster.util import cosine_distance

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [5]:
def preprocess(text):
  formatted_text = text.lower()
  tokens = []
  for token in nltk.word_tokenize(formatted_text):
    tokens.append(token)
  tokens = [word for word in tokens if word not in stopwords and word not in string.punctuation]
  formatted_text = ' '.join(element for element in tokens)

  return formatted_text

In [6]:
original_text = """Artifical intelligence is human like intelligence.
                  It is the study of intelligent artifical agents.
                  Science and engineering to produce intelligent machines.
                  Solve problems and have intelligence.
                  Related to intelligent behavior.
                  Developing of resoning machines.
                  Learn from mistakes and successes.
                  Artifical intelligence is related to reasoning in everyday situations."""
original_text = re.sub(r'\s+', ' ', original_text)
original_text                  

'Artifical intelligence is human like intelligence. It is the study of intelligent artifical agents. Science and engineering to produce intelligent machines. Solve problems and have intelligence. Related to intelligent behavior. Developing of resoning machines. Learn from mistakes and successes. Artifical intelligence is related to reasoning in everyday situations.'

## Function to calculate similairty between sentences

In [7]:
original_sentences = [sentence for sentence in nltk.sent_tokenize(original_text)]
original_sentences

['Artifical intelligence is human like intelligence.',
 'It is the study of intelligent artifical agents.',
 'Science and engineering to produce intelligent machines.',
 'Solve problems and have intelligence.',
 'Related to intelligent behavior.',
 'Developing of resoning machines.',
 'Learn from mistakes and successes.',
 'Artifical intelligence is related to reasoning in everyday situations.']

In [8]:
formatted_sentences = [preprocess(original_sentence) for original_sentence in original_sentences]
formatted_sentences

['artifical intelligence human like intelligence',
 'study intelligent artifical agents',
 'science engineering produce intelligent machines',
 'solve problems intelligence',
 'related intelligent behavior',
 'developing resoning machines',
 'learn mistakes successes',
 'artifical intelligence related reasoning everyday situations']

In [35]:
def calculate_sentence_similarity(sentence1, sentence2):
  words1 = [word for word in nltk.word_tokenize(sentence1)]
  words2 = [word for word in nltk.word_tokenize(sentence2)]
  #print(words1)
  #print(words2)

  all_words = list(set(words1 + words2))
  #print(all_words)

  vector1 = [0] * len(all_words)
  vector2 = [0] * len(all_words)
  #print(vector1)
  #print(vector2)

  for word in words1: #Bag of words
    #print(word)
    vector1[all_words.index(word)] += 1
  for word in words2:
    vector2[all_words.index(word)]  += 1

  #print(vector1)  
  #print(vector2)

  return 1 - cosine_distance(vector1, vector2)

In [36]:
calculate_sentence_similarity(formatted_sentences[0], formatted_sentences[4])

0.0

In [24]:
test = ['human', 'intelligence', 'intelligent', 'study', 'artifical', 'like', 'agents']
test.index('study')

3

## Similarity Matrix

*   List item

*   List item
*   List item


*   List item



In [45]:
def calculate_similarity_matrix(sentences):
  similarity_matrix = np.zeros((len(sentences), len(sentences)))
  #print(similarity_matrix)
  for i in range(len(sentences)):
    for j in range(len(sentences)):
      if i == j:
        continue
      similarity_matrix[i][j] = calculate_sentence_similarity(sentences[i], sentences[j])
    return similarity_matrix     

      

In [46]:
calculate_similarity_matrix(formatted_sentences)

array([[0.        , 0.18898224, 0.        , 0.43643578, 0.        ,
        0.        , 0.        , 0.46291005],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])