# Preparing the environment

In [None]:
import re
import nltk
import string
import heapq

from IPython.core.display import HTML

!pip install feedparser

import feedparser
from bs4 import BeautifulSoup
import os
import json

# Fetch the NLTK data packages this notebook relies on. The calls are
# no-ops when the packages are already present (see the cell output).
# NOTE(review): recent NLTK releases may additionally need the 'punkt_tab'
# resource for sent_tokenize/word_tokenize - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

# Module-level stop-word list consulted by preprocess(); the English list is
# extended with the extra word 'explanation'.
stopwords = nltk.corpus.stopwords.words('english')
stopwords.append('explanation')

def clean_html(text):
  """Strip HTML markup from `text` and return the plain text.

  Args:
    text: HTML markup as a string. Any falsy value (empty string or None)
      is treated as "no content".

  Returns:
    The text content of the markup, or '' when there is nothing to parse.
  """
  # Guard every falsy input, not just '': handing None to BeautifulSoup
  # raises instead of yielding an empty document.
  if not text:
    return ''
  # NOTE(review): the 'html5lib' parser must be installed separately - confirm.
  return BeautifulSoup(text, 'html5lib').get_text()


def visualize(title, sentence_list, best_sentences):
  """Render the article in the notebook with the summary sentences highlighted.

  Args:
    title: Heading shown above the text (rendered as an <h3>).
    sentence_list: All sentences of the article, in reading order.
    best_sentences: Sentences selected for the summary; every sentence of
      `sentence_list` found here is wrapped in <mark>...</mark>.

  Returns:
    None. Two HTML fragments are sent to the notebook via display().
  """
  # Local import: the file does not import the stdlib `html` module at the top.
  from html import escape

  # The text originates from an external RSS feed, so escape it before
  # embedding it in markup; a stray '<' or '&' would otherwise corrupt the
  # rendering (or inject markup).
  display(HTML(f'<h3>{escape(str(title), quote=False)}</h3>'))

  text = ''
  for sentence in sentence_list:
    safe_sentence = escape(str(sentence), quote=False)
    if sentence in best_sentences:
      text += f' <mark>{safe_sentence}</mark>'
    else:
      text += ' ' + safe_sentence
  display(HTML(f""" {text} """))


# Download the RSS feed, keep the title and plain-text body of every entry,
# cache the result in feed.json and load the articles back from that cache.
url = 'https://www.aitrends.com/feed/'
feed = feedparser.parse(url)

articles = []
for e in feed.entries:
  # NOTE(review): assumes every entry carries a `content` list; entries that
  # only provide a summary would raise here - confirm against the feed.
  articles.append({'title': e.title, 'content': clean_html(e.content[0].value)})

save_file = 'feed.json'

# Context managers guarantee both handles are closed (the original left the
# read handle open) and `feed` keeps referring to the parsed feed instead of
# being rebound to a file object.
with open(save_file, 'w') as feed_file:
  json.dump(articles, feed_file, indent=1)

with open(save_file) as feed_file:
  blog_articles = json.load(feed_file)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocess(text):
  """Normalise `text` for scoring.

  The text is lower-cased and tokenised with nltk.word_tokenize; tokens that
  appear in the module-level `stopwords` list, or that are found inside
  `string.punctuation`, are dropped.

  Returns:
    The surviving tokens joined by single spaces.
  """
  kept_tokens = []
  for token in nltk.word_tokenize(text.lower()):
    if token in stopwords or token in string.punctuation:
      continue
    kept_tokens.append(token)
  return ' '.join(kept_tokens)


def calculate_sentences_score(sentences, important_words, distance):
  """Score sentences by how densely they cluster the important words.

  For each sentence, the position of the first occurrence of every important
  word is collected. Neighbouring positions closer than `distance` tokens are
  merged into a cluster, each cluster is scored as
  (important words in cluster)^2 / (cluster span in tokens), and the sentence
  receives the score of its best cluster.

  Args:
    sentences: Preprocessed sentences (strings), in document order.
    important_words: Words considered significant for the document.
    distance: Two consecutive important-word positions belong to the same
      cluster when their gap is strictly smaller than this value.

  Returns:
    A list of (score, sentence_index) tuples, where sentence_index is the
    position of the sentence in `sentences`. Sentences that contain none of
    the important words are omitted.
  """
  scores = []

  # enumerate() keeps the index tied to the sentence position. The previous
  # manual counter was not advanced for skipped sentences, so every score
  # after the first skipped sentence pointed at the wrong sentence.
  for sentence_index, sentence in enumerate(sentences):
    tokens = nltk.word_tokenize(sentence)

    # First-occurrence position of each important word present in the sentence.
    word_index = sorted(tokens.index(word) for word in important_words if word in tokens)
    if not word_index:
      continue

    # Split the sorted positions into clusters of nearby important words.
    groups_list = [[word_index[0]]]
    for position in word_index[1:]:
      if position - groups_list[-1][-1] < distance:
        groups_list[-1].append(position)
      else:
        groups_list.append([position])

    max_group_score = max(
        len(group) ** 2 / (group[-1] - group[0] + 1) for group in groups_list)
    scores.append((max_group_score, sentence_index))

  return scores


def luhn_summarize(text, top_n_words, distance, number_of_sentences, percentage = 0):
  """Extractive summary that ranks sentences with calculate_sentences_score.

  Args:
    text: Raw article text.
    top_n_words: How many of the most frequent (preprocessed) words count as
      important.
    distance: Clustering distance forwarded to calculate_sentences_score.
    number_of_sentences: Summary size, used only when `percentage` is not
      positive.
    percentage: When > 0, the summary size is this fraction of the number of
      sentences (truncated to an int) and `number_of_sentences` is ignored.

  Returns:
    (original_sentences, best_sentences, sentences_score); best_sentences
    holds the selected original sentences, highest score first.
  """
  original_sentences = list(nltk.sent_tokenize(text))
  formatted_sentences = [preprocess(raw) for raw in original_sentences]

  # Word frequencies over the whole preprocessed document.
  frequency = nltk.FreqDist(
      token
      for formatted in formatted_sentences
      for token in nltk.word_tokenize(formatted))
  important_words = [entry[0] for entry in frequency.most_common(top_n_words)]

  sentences_score = calculate_sentences_score(formatted_sentences, important_words, distance)

  if percentage > 0:
    summary_size = int(len(formatted_sentences) * percentage)
  else:
    summary_size = number_of_sentences
  ranked = heapq.nlargest(summary_size, sentences_score)

  best_sentences = [original_sentences[index] for (_score, index) in ranked]
  return original_sentences, best_sentences, sentences_score


# Summarizing the articles

In [None]:
#@title 
# Run the Luhn summarizer on the one article whose content is exactly 8715
# characters long. percentage=0.3 is positive, so it takes precedence over
# number_of_sentences=5 inside luhn_summarize.
# NOTE(review): matching an article by its exact length is brittle - nothing
# is summarized once the feed content changes.
for article in blog_articles:
    if len(article['content']) != 8715:
        continue
    original_sentences, best_sentences, _ = luhn_summarize(
        article['content'], 150, 10, number_of_sentences=5, percentage=0.3)
    print("best_sentences", len(best_sentences))
    visualize("Summarization based on Luhn Algorithm", original_sentences, best_sentences)


best_sentences 20


In [None]:
def freq_summarize(text, number_of_sentences, percentage):
  """Extractive summary based on normalised word frequencies.

  Every word of the preprocessed text gets a weight equal to its count
  divided by the highest count. A sentence's score is the sum of the weights
  of its tokens, and the highest-scoring sentences form the summary.

  Args:
    text: Raw article text.
    number_of_sentences: Summary size, used only when `percentage` is not
      positive.
    percentage: When > 0, the summary size is this fraction of the number of
      sentences (truncated to an int).

  Returns:
    (sentence_list, best_sentences, word_frequency, score_sentences) where
    word_frequency maps word -> normalised weight and score_sentences maps
    sentence -> score. Identical sentences share one entry in
    score_sentences.
  """
  sentence_list = nltk.sent_tokenize(text)
  word_frequency = nltk.FreqDist(nltk.word_tokenize(preprocess(text)))

  # Empty or stop-word-only text: there is nothing to rank, and max() below
  # would raise ValueError on an empty sequence.
  if not word_frequency:
    return sentence_list, [], word_frequency, {}

  highest_frequency = max(word_frequency.values())
  for word in list(word_frequency):
    word_frequency[word] = word_frequency[word] / highest_frequency

  score_sentences = {}
  for sentence in sentence_list:
    # The frequency table is built from lower-cased text, so the lookup has
    # to be lower-cased too; otherwise capitalised words never score.
    for word in nltk.word_tokenize(sentence.lower()):
      if word in word_frequency:
        score_sentences[sentence] = score_sentences.get(sentence, 0) + word_frequency[word]

  if percentage > 0:
    summary_size = int(len(sentence_list) * percentage)
  else:
    summary_size = number_of_sentences
  best_sentences = heapq.nlargest(summary_size, score_sentences, key=score_sentences.get)

  return sentence_list, best_sentences, word_frequency, score_sentences


# Run the frequency-based summarizer on the one article whose content is
# exactly 8715 characters long; percentage=0.3 is positive, so the positional
# number_of_sentences argument (100) is not used by freq_summarize.
# NOTE(review): matching an article by its exact length is brittle.
for article in blog_articles:
    if len(article['content']) != 8715:
        continue
    original_sentences, best_sentences, _, _ = freq_summarize(
        article['content'], 100, percentage=0.3)
    print("best_sentences", len(best_sentences))
    visualize("Summarization based on Frequency", original_sentences, best_sentences)

best_sentences 20


In [None]:
import numpy as np 
import networkx as nx
from nltk.cluster.util import cosine_distance

def calculate_sentence_similarity(sentence1, sentence2):
  """Cosine similarity between the bag-of-words vectors of two sentences.

  Args:
    sentence1: First sentence (string).
    sentence2: Second sentence (string).

  Returns:
    1 - cosine_distance of the two word-count vectors, or 0.0 when either
    sentence has no tokens.
  """
  words1 = nltk.word_tokenize(sentence1)
  words2 = nltk.word_tokenize(sentence2)

  # A sentence without tokens (e.g. only stop words before preprocessing)
  # yields an all-zero vector; the cosine would divide by zero and put NaN
  # into the similarity matrix. Treat it as "not similar to anything".
  if not words1 or not words2:
    return 0.0

  # Map each distinct word to its vector slot once instead of calling
  # list.index() for every token.
  slot = {word: position for position, word in enumerate(set(words1 + words2))}

  vector1 = [0] * len(slot)
  vector2 = [0] * len(slot)

  for word in words1:  # bag of words
    vector1[slot[word]] += 1

  for word in words2:
    vector2[slot[word]] += 1

  return 1 - cosine_distance(vector1, vector2)


def calculate_similarity_matrix(sentences):
  """Build the pairwise sentence-similarity matrix.

  Args:
    sentences: Sequence of sentences (strings).

  Returns:
    A square numpy array where entry [i][j] is
    calculate_sentence_similarity(sentences[i], sentences[j]); the diagonal
    is left at zero.
  """
  size = len(sentences)
  similarity_matrix = np.zeros((size, size))

  for row, left in enumerate(sentences):
    for col, right in enumerate(sentences):
      if row != col:
        similarity_matrix[row][col] = calculate_sentence_similarity(left, right)

  return similarity_matrix
  

def cosine_summarize(text, number_of_sentences, percentage):
  """Extractive summary that ranks sentences with PageRank over a similarity graph.

  Sentences are preprocessed, compared pairwise with
  calculate_similarity_matrix, and the resulting matrix is turned into a
  graph whose PageRank scores rank the sentences.

  Args:
    text: Raw article text.
    number_of_sentences: Summary size, used only when `percentage` is not
      positive.
    percentage: When > 0, the summary size is this fraction of the number of
      sentences (truncated to an int).

  Returns:
    (original_sentences, best_sentences, ordered_scores) where ordered_scores
    is a list of (score, sentence) tuples sorted from highest to lowest and
    best_sentences holds the top entries. At most len(original_sentences)
    sentences are returned.
  """
  original_sentences = list(nltk.sent_tokenize(text))
  formatted_sentences = [preprocess(original_sentence) for original_sentence in original_sentences]
  similarity_matrix = calculate_similarity_matrix(formatted_sentences)

  similarity_graph = nx.from_numpy_array(similarity_matrix)

  scores = nx.pagerank(similarity_graph)
  ordered_scores = sorted(
      ((scores[index], sentence) for index, sentence in enumerate(original_sentences)),
      reverse=True)

  if percentage > 0:
    number_of_sentences = int(len(formatted_sentences) * percentage)

  # Slicing clamps the request to the sentences that exist; indexing with
  # range(number_of_sentences) raised IndexError when more sentences were
  # requested than the text contains. max(..., 0) keeps a negative request
  # meaning "no sentences", as range() did.
  top_entries = ordered_scores[:max(number_of_sentences, 0)]
  best_sentences = [sentence for (_score, sentence) in top_entries]

  return original_sentences, best_sentences, ordered_scores

# Run the PageRank/cosine summarizer on the one article whose content is
# exactly 8715 characters long; percentage=0.3 is positive, so it overrides
# the positional number_of_sentences argument (100) inside cosine_summarize.
# NOTE(review): matching an article by its exact length is brittle.
for article in blog_articles:
    if len(article['content']) != 8715:
        continue
    original_sentences, best_sentences, ordered_scores = cosine_summarize(
        article['content'], 100, percentage=0.3)
    print("original_sentences", len(original_sentences))
    print("best_sentences", len(best_sentences))
    visualize("Summarization based on Cosine Similarity", original_sentences, best_sentences)

original_sentences 67
best_sentences 20


In [None]:
# Bare expression: the notebook displays the summary sentences left in
# `best_sentences` by the most recently executed summarizer cell.
best_sentences

['“We are adopting an auditor’s perspective on the AI accountability framework,” Ariga said.',
 '“Data is critical to the AI system and is the place where a lot of problems can exist.” Goodman said.',
 '“We are preparing to continually monitor for model drift and the fragility of algorithms, and we are scaling the AI appropriately.” The evaluations will determine whether the AI system continues to meet the need “or whether a sunset is more appropriate,” Ariga said.',
 'The resulting framework was first published in June as what Ariga described as “version 1.0.”\xa0\xa0\nSeeking to Bring a “High-Altitude Posture” Down to Earth\xa0\xa0\n“We found the AI accountability framework had a very high-altitude posture,” Ariga said.',
 'It’s the only way we can ensure\xa0that the AI\xa0is developed responsibly.”\xa0\xa0\nLastly, “AI is not magic.',
 '“Those are well-conceived, but it’s not obvious to an engineer how to translate them into a specific project requirement,” Good said in a presentati

In [None]:
def convert_to_para(sentence_list, best_sentences):
    """Print every sentence of `sentence_list` on its own line.

    Args:
        sentence_list: Sentences to print, in order.
        best_sentences: Accepted but not used by the current implementation.

    Returns:
        None.
    """
    for line in sentence_list:
        print(line)


# Uses the `original_sentences` / `best_sentences` globals left behind by the
# last summarizer cell that was run, so this cell depends on execution order.
convert_to_para(original_sentences, best_sentences)

By John P. Desmond, AI Trends Editor   
Two experiences of how AI developers within the federal government are pursuing AI accountability practices were outlined at the AI World Government event held virtually and in-person this week in Alexandria, Va. 
Taka Ariga, chief data scientist and director, US Government Accountability Office
Taka Ariga, chief data scientist and director at the US Government Accountability Office, described an AI accountability framework he uses within his agency and plans to make available to others.
And Bryce Goodman, chief strategist for AI and machine learning at the Defense Innovation Unit (DIU), a unit of the Department of Defense founded to help the US military make faster use of emerging commercial technologies, described work in his unit to apply principles of AI development to terminology that an engineer can apply.
Ariga, the first chief data scientist appointed to the US Government Accountability Office and director of the GAO’s Innovation Lab, dis

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


[{'rouge-1': {'f': 0.7567567517604091,
   'p': 0.7777777777777778,
   'r': 0.7368421052631579},
  'rouge-2': {'f': 0.514285709289796, 'p': 0.5294117647058824, 'r': 0.5},
  'rouge-l': {'f': 0.7567567517604091,
   'p': 0.7777777777777778,
   'r': 0.7368421052631579}}]

In [None]:
from rouge import Rouge

# Toy example: score a candidate sentence (`model_out`) against a reference
# sentence with the `rouge` package.
model_out = "he began by starting a five person war cabinet and included chamberlain as lord president of the council"
reference = "he began his premiership by forming a five-man war cabinet which included chamberlain as lord president of the council"

rouge = Rouge()
# Last expression of the cell, so the notebook displays the returned scores.
rouge.get_scores(model_out, reference)

In [None]:
!pip install sumeval
!python -m spacy download en

Collecting sumeval
  Downloading sumeval-0.2.2.tar.gz (80 kB)
[K     |████████████████████████████████| 80 kB 5.2 MB/s 
Collecting sacrebleu>=1.3.2
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 10.0 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting portalocker
  Downloading portalocker-2.4.0-py2.py3-none-any.whl (16 kB)
Building wheels for collected packages: sumeval
  Building wheel for sumeval (setup.py) ... [?25l[?25hdone
  Created wheel for sumeval: filename=sumeval-0.2.2-py3-none-any.whl size=54549 sha256=fb517336ec8000b936809925e927304bde8d701571eff22a411fe5c0b06baf3c
  Stored in directory: /root/.cache/pip/wheels/f4/3f/31/c521bdfba2be7518bd94ba3e8b982812822167cc0497fad192
Successfully built sumeval
Installing collected packages: portalocker, colorama, sacrebleu, sumeval
Successfully installed colorama-0.4.4 portalocker-2.4.0 sacrebleu-2.0.0 sumeval-0.2.2
Collecting en_core_w

In [None]:
#https://github.com/chakki-works/sumeval
#https://github.com/Tian312/awesome-text-summarization

from sumeval.metrics.rouge import RougeCalculator

# Toy example: score a model summary against a single reference summary with
# sumeval's RougeCalculator (ROUGE-1, ROUGE-2, ROUGE-L and ROUGE-BE).
refrence_summary = "So easy, so good and all natural"
model_summary = "great hot snack"

rouge = RougeCalculator(stopwords=True, lang="en")

# Every metric receives the references as a list, so all four calls use the
# same calling convention (ROUGE-1 previously passed a bare string).
rouge_1 = rouge.rouge_n(
            summary=model_summary,
            references=[refrence_summary],
            n=1)

rouge_2 = rouge.rouge_n(
            summary=model_summary,
            references=[refrence_summary],
            n=2)

rouge_l = rouge.rouge_l(
            summary=model_summary,
            references=[refrence_summary])

# You need spaCy to calculate ROUGE-BE

rouge_be = rouge.rouge_be(
            summary=model_summary,
            references=[refrence_summary])

# One metric per line, written out explicitly instead of formatting a
# comma-separated string and replacing ", " with newlines afterwards.
print(f"ROUGE-1: {rouge_1}\nROUGE-2: {rouge_2}\nROUGE-L: {rouge_l}\nROUGE-BE: {rouge_be}")

b.great=(amod)=>snack
b.hot=(amod)=>snack
ROUGE-1: 0
ROUGE-2: 0
ROUGE-L: 0
ROUGE-BE: 0
