In [0]:
! wget http://nlp.stanford.edu/data/glove.6B.zip
! unzip glove*.zip

In [0]:
import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')# one time execution
from nltk.corpus import stopwords
nltk.download('punkt') # one time execution

import re
import json
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

import hashlib 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
with open('/content/data_meeting_text_amazon.txt') as data_file:
  _data = json.load(data_file)
  sentences = [d['sentence'].lower() for d in _data]
  speakers = [d['speaker'] for d in _data]

In [0]:
stop_words = stopwords.words('english')

In [0]:
def remove_stopwords(sen):
  sen_new = " ".join([i for i in sen if i not in stop_words])
  return sen_new

In [0]:
clean_sentences = [remove_stopwords(r.split()) for r in sentences]

In [0]:
clean_sentences

In [0]:
def get_word_embedings(model_name):
  word_embeddings = dict()
  f = open(model_name, encoding='utf-8')
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
  f.close()
  return word_embeddings

In [0]:
def get_sentence_vectors(word_embeddings, sentences, dim):
  sentence_vectors = list()
  for i in clean_sentences:
    if len(i) != 0:
      v = sum([word_embeddings.get(w, np.zeros((dim,))) for w in i.split()])/(len(i.split())+0.001)
    else:
      v = np.zeros((dim,))
    sentence_vectors.append(v)
  return sentence_vectors

In [0]:
def get_similarity_matrix(size, sentence_vectors, dim):
  sim_mat = np.zeros([size, size])
  for i in range(size):
    for j in range(size):
      if i != j:
        sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,dim), sentence_vectors[j].reshape(1,dim))[0,0]
  return sim_mat

In [0]:
def get_network_graph_scores(sim_mat):
  nx_graph = nx.from_numpy_array(sim_mat)
  scores = nx.pagerank(nx_graph)
  return scores

In [0]:
def get_sentene_ranking(sentences, ng_scores):
  ranked_sentences = sorted(((ng_scores[i],s) for i,s in enumerate(sentences)), reverse=True)
  return ranked_sentences

In [0]:
def page_rank_algorithm(model_file, dim):
  word_embeddings = get_word_embedings(model_file)
  sentence_vectors = get_sentence_vectors(word_embeddings, clean_sentences, dim)
  similarity_vector = get_similarity_matrix(len(sentences), sentence_vectors, dim)
  network_graph_score = get_network_graph_scores(similarity_vector)
  sentence_ranking = get_sentene_ranking(sentences, network_graph_score)
  return sentence_ranking

In [0]:
import csv

sentence_ranking = page_rank_algorithm('/content/glove.6B.100d.txt', 100)

with open('ranked_sentences_100d.csv', 'w') as output_file:
  writer = csv.writer(output_file)
  for sen in sentence_ranking:
    writer.writerow(list(sen))

In [0]:
def _create_dictionary_table(sentences) -> dict: 
  frequency_table = dict()
  stop_words = set(stopwords.words("english"))
  for text_string in sentences:
    words = word_tokenize(text_string)
    stem = PorterStemmer()
    for wd in words:
      wd = stem.stem(wd)
      if wd in stop_words:
        continue
      if wd in frequency_table:
        frequency_table[wd] += 1
      else:
        frequency_table[wd] = 1
  return frequency_table

In [0]:
def _calculate_sentence_scores(sentences, frequency_table) -> dict:   

  sentence_weight = dict()
  for sentence in sentences:
    sentence_wordcount = (len(word_tokenize(sentence)))
    sentence_wordcount_without_stop_words = 0
    for word_weight in frequency_table:
      if word_weight in sentence.lower():
        sentence_wordcount_without_stop_words += 1
        # sentence_hash = hashlib.md5(sentence.encode())
        sentence_hash = sentence
        if sentence_hash in sentence_weight:
            sentence_weight[sentence_hash] += frequency_table[word_weight]
        else:
            sentence_weight[sentence_hash] = frequency_table[word_weight]
    sentence_weight[sentence_hash] = (sentence_weight[sentence_hash] + 0.5) / (sentence_wordcount_without_stop_words + 1)
    
  return sentence_weight

In [0]:
def _calculate_average_score(sentence_weight) -> int:   
  sum_values = 0
  for entry in sentence_weight:
    sum_values += sentence_weight[entry]
  average_score = (sum_values / len(sentence_weight))
  return average_score

In [0]:
def _get_article_summary(sentences, sentence_weight, threshold):
  sentence_counter = 0
  article_summary = ''

  for sentence in sentences:
    # sentence_hash = hashlib.md5(sentence.encode())
    sentence_hash = sentence
    if sentence_hash in sentence_weight and sentence_weight[sentence_hash] >= (threshold):
      article_summary += " " + sentence
      sentence_counter += 1
  return article_summary

In [0]:
def sorted_weighted_frequency_sentences(sentences):
    
  #creating a dictionary for the word frequency table
  frequency_table = _create_dictionary_table(sentences)

  #algorithm for scoring a sentence by its words
  sentence_scores = _calculate_sentence_scores(sentences, frequency_table)
  sorted_sentence_scores = sorted([(key, value) for (key, value) in sentence_scores.items()], key=lambda x: x[1], reverse=False)

  #getting the threshold
  threshold = _calculate_average_score(sentence_scores)

  # #producing the summary
  article_summary = _get_article_summary(sentences, sentence_scores, 1.5 * threshold)

  return sorted_sentence_scores

In [0]:
_create_dictionary_table(sentences)

In [0]:
import csv

sentence_ranking = sorted_weighted_frequency_sentences(sentences)

with open('ranked_sentences_own.csv', 'w') as output_file:
  writer = csv.writer(output_file)
  for sen in sentence_ranking:
    writer.writerow(list(sen))

[('very.', 56.0),
 ('sorry about that.', 56.0),
 ('meetings.', 44.666666666666664),
 ('huh?.', 40.333333333333336),
 ('yes and no.', 39.0),
 ('yes.', 39.0),
 ('thank you.', 37.666666666666664),
 ('we have.', 37.666666666666664),
 ('they are my ideas.', 37.666666666666664),
 ('excuse me.', 37.666666666666664),
 ("it's your clock.", 36.25),
 ("look, i'm sorry.", 35.8),
 ("i'm in a meeting.", 34.75),
 ('currently,.', 34.2),
 ("yes, it's on bbc one.", 30.285714285714285),
 ('bye.', 29.5),
 ("oh, that's a shame.", 29.142857142857142),
 ('subject to conditions.', 29.0),
 ('we already do that.', 28.5),
 ("everybody's got to sign, however.", 26.25),
 ('first, the good news.', 26.0),
 ('now, getting back to the five points.', 26.0),
 ('get away from interruptions,.', 25.571428571428573),
 ('some directors run the company through frequent meetings.',
  24.666666666666668),
 ("that's to approve the accounts on.", 24.666666666666668),
 ("i've got to go.", 24.6),
 ("it's a legal requirement.", 24.5