In [83]:
import os
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
from enum import Enum
import re
import math

from typing import AnyStr, List, Set, Dict, Tuple, Callable

In [84]:
with open('data/stop_words_FULL.txt') as f:
    stop_words = {line for line in f.read().splitlines()}

with open('data/bonus_words.txt') as f:
    bonus_words = {line for line in f.read().splitlines()}

with open('data/stigma_words.txt') as f:
    stigma_words = {line for line in f.read().splitlines()}

nasari = dict()
with open('data/dd-small-nasari-15.txt', encoding='utf-8') as f:
    for line in f.readlines():
        splits = line.replace('\n', '').split(";")
        items = list()
        for item in splits[2:]:
            if '_' in item:
                word, score = item.split("_")
                items.append((word, score))
        nasari[splits[1].lower()] = items

### Pre-Processing

In [85]:
def bag_of_words(sentence: AnyStr) -> Set[AnyStr]:
    return set(remove_stopwords(tokenize_sentence(remove_punctuation(sentence))))


def remove_stopwords(words: List[AnyStr]) -> List[AnyStr]:
    return [value for value in words if value not in stop_words]


# Get tokens from sentence
def tokenize_sentence(sentence: AnyStr) -> List[AnyStr]:
    words = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        words.append(lmtzr.lemmatize(tag[0]).lower())
    return words


# Remove punctuation and multiple spaces
def remove_punctuation(sentence: AnyStr) -> AnyStr:
    return re.sub('\s\s+', ' ', re.sub(r'[^\w\s]', '', sentence))

In [86]:
def select_doc(docs: List[AnyStr]) -> AnyStr:
    print('Available documents to summarize:')
    for i, doc in enumerate(docs):
        print(f'[{i}] {doc}')
    doc_index = int(input(f'Choose the document (0-{len(docs)-1}): '))
    
    return docs[doc_index]


def parse_doc(doc_name: AnyStr) -> List[AnyStr]:
    with open(f'data/docs/{doc_name}') as f:
        lines = f.readlines()
    
    lines = list(filter(lambda x: x and not x.startswith('#'), (line.rstrip() for line in lines)))
    lines = [lines[0]] + [line for line in lines[1:] if len(line) > 30] # remove sub-titles
    return lines


### Weighted Overlap between Nasari vectors

In [87]:
def max_similarity(context: List[Tuple], topic: List[Tuple]) -> int:
    overlaps = [math.sqrt(weighted_overlap(context, topic))]
    return max(overlaps)


def rank(word: AnyStr, vector: List[Tuple]) -> int:
    for index, (elem, _) in enumerate(vector):
        if word == elem: return index + 1


def weighted_overlap(vector1: List[Tuple], vector2: List[Tuple]) -> float:
    overlap = 0
    if common_words := {v[0] for v in vector1} & {v[0] for v in vector2}:
        numerator = 0
        for w in common_words:
            numerator += (1 / (rank(w, vector1) + rank(w, vector2)))

        denominator = 0
        for i in range(len(common_words)):
            denominator += 1 / (2 * (i+1))
        
        overlap = numerator / denominator
    return overlap

In [88]:
def nasari_vectors_for_bow(bow: List[AnyStr]) -> Dict[AnyStr, List[Tuple]]:
    nasari_vectors = dict()
    for word in bow:
        if nasari_entry := nasari.get(word, None):
            nasari_vectors[word] = nasari_entry
    return nasari_vectors


def get_cue_score(paragraph: AnyStr) -> int:
    word_list = tokenize_sentence(remove_punctuation(paragraph))
    score = 0
    # score = len(set(word_list) & bonus_words) - len(set(word_list) & stigma_words)
    for word in word_list:
        if word in bonus_words: 
            score += 1
        elif word in stigma_words: 
            score -= 1
    return score
    

# We extract the nasari vectors from the title of the document
def get_title_topics(doc: List[AnyStr]) -> Dict[AnyStr, List[Tuple]]:
    title = doc[0]
    return nasari_vectors_for_bow(bag_of_words(title))


# We extract the nasari vectors from the most important paragraph of the document
# according to the cue score (+1 for each bonus_words present, -1 for each stigma_words present)
def get_topics(doc: List[AnyStr]) -> Dict[AnyStr, List[Tuple]]:
    scores = [(paragraph, get_cue_score(paragraph)) for paragraph in doc]
    most_important_paragraph = max(scores, key=lambda x: x[1])[0] # paragraph with maximum score associated
    return nasari_vectors_for_bow(bag_of_words(most_important_paragraph))


def summarization(doc: List[AnyStr], reduction: int, relevance_criteria: Callable) -> List[AnyStr]:
    topics = relevance_criteria(doc)

    paragraphs_overlap = list()
    for paragraph in doc[1:]: # excluding title
        paragraph_context = nasari_vectors_for_bow(bag_of_words(paragraph))
        total_overlap = 0
        average_paragraph_overlap = 0
        overlaps_count = 0

        # compute average paragraph overlap with the topics found before
        for context in paragraph_context.values():
            for topic in topics.values():
                total_overlap += max_similarity(context, topic)
                overlaps_count += 1
        
        if overlaps_count > 0:
            average_paragraph_overlap = total_overlap / overlaps_count
            paragraphs_overlap.append((paragraph, average_paragraph_overlap))
    
    # number of paragraphs to mantain after reduction
    paragraphs_num = round(len(paragraphs_overlap) - len(paragraphs_overlap) * reduction)

    # we mantain only the first 'paragraphs_num' paragraphs in order of importance (i.e. average overlap)
    reduced_paragraphs = sorted(paragraphs_overlap, key=lambda x: x[1], reverse=True)[:paragraphs_num]
    reduced_paragraphs = [p[0] for p in reduced_paragraphs] # remove scores associated

    summary = list()
    summary.append(doc[0]) # append title
    for paragraph in reduced_paragraphs:
        summary.append(paragraph) # append reduced paragraphs

    return summary


In [90]:
class RelevanceCriteria(Enum):
    title = get_title_topics
    topic = get_topics

docs = os.listdir('data/docs')
doc = parse_doc(select_doc(docs))
reduction = int(input('Enter the reduction % (10, 20, 30): ')) / 100
relevance_criteria = RelevanceCriteria.topic

summary = summarization(doc, reduction, relevance_criteria)
summary

Available documents to summarize:
[0] Andy-Warhol.txt
[1] Ebola-virus-disease.txt
[2] Life-indoors.txt
[3] Napoleon-wiki.txt
[4] Trump-wall.txt
before reduction: 20
after reduction: 15


['Ebola virus disease',
 'Symptoms usually begin with a sudden influenza-like stage characterised by feeling tired, fever, weakness, decreased appetite, muscular pain, joint pain, headache, and sore throat. The fever is usually higher than 38.3 Â°C (101 Â°F). This is often followed by nausea, vomiting, diarrhoea, abdominal pain, and sometimes hiccups. The combination of severe vomiting and diarrhoea often leads to severe dehydration. Next, shortness of breath and chest pain may occur, along with swelling, headaches, and confusion. In about half of the cases, the skin may develop a maculopapular rash, a flat red area covered with small bumps, five to seven days after symptoms begin.',
 'The potential for widespread infections in countries with medical systems capable of observing correct medical isolation procedures is considered low. Usually when someone has symptoms of the disease, they are unable to travel without assistance.',
 'In some cases, internal and external bleeding may occu