## Extraktionsbaserad textsammanfattare med olika rankningsmått 

#### Imports

In [None]:
# Imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk 
import ssl
import re 

# Fraction of the original sentences to keep in the summary
percentage = 0.15

# Fixes some errors, found online at
# https://github.com/gunthercox/ChatterBot/issues/930#issuecomment-322111087
# When this Python build offers ssl._create_unverified_context, it is installed as the
# default HTTPS context factory.
# NOTE(review): this turns off certificate verification for HTTPS in the whole process.
if hasattr(ssl, '_create_unverified_context'):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

#### Input

In [None]:
# Web scraping --> module for downloading articles
from newspaper import Article

# URL of the article to summarise.
# Alternative source: 'https://www.svt.se/nyheter/utrikes/stall-dina-fragor-om-kriget-till-svt-s-utrikesreportrar'
url = "https://www.aftonbladet.se/nyheter/a/kE6ExL/sd-far-tunga-poster-i-utskotten"

# Download and parse the page with the language set to 'sv'. The URL has its own name
# so that `text` only ever holds the extracted article body.
article = Article(url, language='sv')
article.download()
article.parse()
text = article.text

# Beroende på vilken hemsida nyheten kommer ifrån kan titeln och texten innehålla delar av sidan man egentligen inte bryr sig om.
# T.ex. från Aftonbladet är titeln med i texten och texten innehåller en mening som "publicerad: 30 sep". Man kan ta bort detta, men
# det blir om vi får tid över.

#print('Title:' , article.title, '\n\nText: \n', text)

# Preprocessing 

## Overview
### Calculate number of sentences to keep

### List 1 - sentences
* Varje mening separat

### Dataframe - scores
#### columns are the score of each ranking measure
* Baseline
* Headings
* TF/IDF-score
* NER
* ~~Class~~ //Om vi har tid för ML

### List 2 -> Cleaned for Stop Words 
* Varje mening separat 

###

##### Original Sentences List

In [None]:
# removes endlines:
from token import NEWLINE

# Turn paragraph breaks into sentence boundaries, undo the two artefacts that step can
# create ('.. ' and ':. ') and split on '. ' to get one string per sentence.
org_sentences = (
    text.replace('\n\n', '. ')
    .replace('.. ', '. ')
    .replace(':. ', ': ')
    .split('. ')
)

# Preview of the first five sentences
org_sentences[:5]

##### Dataframe

In [None]:
# One row per sentence and one column per ranking measure. The frame is created
# filled with 0 directly, which gives integer columns from the start instead of empty
# object columns that have to be filled (and downcast) afterwards.
index = range(len(org_sentences))
columns = ['Baseline', 'Headings', 'TF', 'NER']
scores = pd.DataFrame(0, index=index, columns=columns)
scores.info()

#### Create spacy doc object

In [None]:
import spacy
# Credit to Explosion for sv_core_news_sm --> https://github.com/explosion 
# "lemmatization accuracy 0.95"
# Create the spaCy nlp object once; lemmatizer(), proper_nouns() and
# named_entity_recognition() all read this module-level `nlp`.
nlp = spacy.load("sv_core_news_sm") # nlp used by lemmatizer()

#### Lemmatizer

In [None]:
# den --> det
# Note: this is a poor lemmatizer --> alternatives: språkbanken stanza / lemmy / kth...

def lemmatizer(list_of_strings):
    """Return the lemmatized form of every string in list_of_strings.

    Each string is run through the module-level spaCy pipeline `nlp`. The result
    for a string is the lemma of each token followed by a single space, so every
    non-empty result ends with a trailing space.

    Returns a list of strings with the same length and order as the input.
    """
    return [
        ''.join(token.lemma_ + ' ' for token in nlp(sentence))
        for sentence in list_of_strings
    ]

# Lemmatized version of the original sentences (same order, one string per sentence)
lemmatized_org_sentences = lemmatizer(org_sentences)
print(lemmatized_org_sentences)

#### Proper Nouns

In [None]:
# For better coverage on NER
def proper_nouns(list_of_strings):
    """Collect the distinct proper nouns found in list_of_strings.

    Every string is processed with the module-level spaCy pipeline `nlp`. A token
    is kept when its part-of-speech tag is "PROPN" and its text is longer than one
    character; the stored value is the token text with surrounding whitespace
    stripped.

    Returns a set of strings.
    """
    found = set()
    for sentence in list_of_strings:
        found.update(
            token.text.strip()
            for token in nlp(sentence)
            if token.pos_ == "PROPN" and len(token.text) > 1
        )
    return found

# Show the proper nouns found in the original sentences
print(proper_nouns(org_sentences))


#### Named Entities

In [None]:
def named_entity_recognition(list_of_strings):
    """Return the distinct named entities found in list_of_strings.

    The strings are joined with single spaces and run through the module-level
    spaCy pipeline `nlp` as one document. The text of each entity span in
    doc.ents is stripped of surrounding whitespace and collected.

    The entity texts are read from the spans directly. Going through the string
    form of the doc.ents tuple and splitting it on ',' broke entities that
    contain a comma and produced an empty-string entity when there were zero or
    one entities; an empty string matches everywhere in a regex alternation.

    Returns a set of non-empty strings.
    """
    doc = nlp(' '.join(list_of_strings))
    entity_texts = (entity.text.strip() for entity in doc.ents)
    return {entity_text for entity_text in entity_texts if entity_text}

### Stop Word Filtering

In [None]:
# Inspired by https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

def get_swe_stop_words():
    """Return NLTK's Swedish stop words plus a few punctuation tokens.

    The punctuation tokens are not stop words as such; they are added so that
    the same filter removes them too.

    Returns a newly built set of strings on every call.
    """
    punctuation_tokens = {',', '"', ':', '-', '–', '”'}
    return set(stopwords.words('swedish')) | punctuation_tokens

def stop_word_filtering(list_of_strings):
    """Tokenize list_of_strings and drop stop words.

    The strings are joined with single spaces and split into tokens with NLTK's
    word_tokenize. A token is dropped when its lower-cased form is in the set
    returned by get_swe_stop_words(); kept tokens retain their original casing.

    Returns a flat list of the remaining tokens in their original order.
    """
    # Build the stop-word set once; calling get_swe_stop_words() inside the
    # comprehension rebuilt it for every single token.
    stop_words = get_swe_stop_words()
    word_tokens = word_tokenize(' '.join(list_of_strings))
    return [w for w in word_tokens if w.lower() not in stop_words]

#### Frequency Distribution 

In [None]:
def frequency_distribution(list):
    """Count how often each lower-cased token occurs.

    The strings in `list` are joined with single spaces, tokenized with NLTK's
    word_tokenize and lower-cased before they are counted.

    Returns an nltk FreqDist built from the lower-cased tokens.
    """
    # NOTE: the parameter name shadows the builtin `list`; it is kept so that
    # callers passing it by keyword keep working.
    tokens = word_tokenize(' '.join(list))
    return FreqDist(map(str.lower, tokens))

In [None]:
# Tokens of the lemmatized sentences with stop words removed; input to the TF counts
filtered_words = stop_word_filtering(lemmatized_org_sentences) 

# Ranking Measures

### Baseline
(1/N, där N = meningens ordningsnummer i texten, dvs. första meningen får N=1 -> 1/1, andra meningen får N=2 -> 1/2, osv.)

In [None]:
# Ranking metric 1 --> Baseline
# The sentence at position i (0-based) scores 1 / (i + 1), so earlier sentences score
# higher. The whole column is assigned at once: writing through
# scores['Baseline'][i] is chained assignment, which pandas warns about and which
# does not update the DataFrame when copy-on-write is enabled.
scores['Baseline'] = [1 / (i + 1) for i in range(len(scores))]
scores.describe()

### Headings

In [None]:
# Ranking metric 2 --> Headings
# A sentence gets one point for every word of the article title that occurs in it
# (case-sensitive substring match; a word repeated in the title counts each time).
# The column is rebuilt from scratch, so this cell can be run several times.
# split() without an argument skips the empty strings that a title with consecutive
# spaces would otherwise produce; an empty string is "in" every sentence.
title_words = article.title.split()
scores['Headings'] = [
    sum(word in sentence for word in title_words)
    for sentence in org_sentences
]
scores.describe()

## TERM FREQUENCY 


In [None]:
# Term frequency: a sentence's score is the sum of the document-wide counts of its
# (lemmatized) words.
fdist = frequency_distribution(filtered_words)

# frequency_distribution() lower-cases its keys, so the lookup is lower-cased as
# well; without that, capitalised words never matched. Words that are not in fdist
# (for example filtered-out stop words) contribute 0.
scores['TF'] = [
    sum(fdist.get(word.lower(), 0) for word in sentence.split(' '))
    for sentence in lemmatized_org_sentences
]

#scores.describe()
#scores['TF'].idxmax()
#scores.head
#print(org_sentences[32])

### TF*IDF-score
* Diskutera hur vi kan använda måtten 
* If similarity is close --> Similar content --> Remove redundancy?  



https://forketyfork.medium.com/latex-math-formulas-a-cheat-sheet-21e5eca70aae

In [None]:
# Ranking metric 3 --> TF*IDF

# Term Weights --> Calculate importance of single words in text/doc
# Binary term weights --> document specific
# TF*IDF term weights --> document-collection specific 

# Assign weights to each dimension (attr/word) of each sentence (record/example) 

# Term Frequency (TF-score) --> TFij == frequency of the jth term in the ith doc 

# Inverse Document Frequency 
# idf-score of the jth term measures the uniqueness of the jth term in the collection of documents
# IDFj = log(M / Nj)
#
# M = total num of docs in collection 
# Nj is the number of documents that contain the jth term

# HIGH TF*IDF-score 
# Word frequent in document && Occur in few documents of the collection 
# LOW TF*IDF-score
# Not present in document || present in all documents of the collection 

### NER 
* (spaCy) --> (Meningar med Named Entities är troligtvis viktigare)

In [None]:
# Build the set of entity strings to look for in the sentences
scores['NER'] = 0
named_entities = named_entity_recognition(org_sentences)
# Stored under its own name: assigning the result to `proper_nouns` replaced the
# function with a set, so running this cell a second time raised a TypeError.
proper_noun_set = proper_nouns(org_sentences)
ner_unique = named_entities.union(proper_noun_set)
print(ner_unique)


In [None]:
# Borrowed from https://stackoverflow.com/questions/33406313/how-to-match-any-string-from-a-list-of-strings-in-regular-expressions-in-python
#p = re.compile(r"\L<words>", words=['fun', 'dum', 'sun', 'gum'])

# Build the pattern once instead of once per sentence. Every entity is escaped so
# that characters such as '.' or '(' are matched literally, and the alternatives are
# wrapped in a group so that both \b anchors apply to every entity rather than only
# to the first and the last one. Empty strings are left out because they match at
# every position. Longer entities are tried first.
entities = sorted((entity for entity in ner_unique if entity), key=len, reverse=True)
# The lookahead lets findall() report matches that overlap each other.
entity_pattern = (
    re.compile(r"(?=(\b(?:" + '|'.join(map(re.escape, entities)) + r")\b))")
    if entities
    else None
)

print(named_entities)
for i, sentence in enumerate(org_sentences):
    print(sentence)

    # Without any entity there is nothing to match
    matches = entity_pattern.findall(sentence) if entity_pattern else []
    print(matches)
    scores.at[i, "NER"] = len(matches)

In [None]:
#!/usr/bin/env python
import regex as rex # $ pip install regex

# One pattern that matches any of the strings in ner_unique (named list of the
# `regex` module); the NER score of a sentence is the number of matches found in it.
p = rex.compile(r"\L<words>", words=ner_unique)
scores["NER"] = [len(p.findall(sentence)) for sentence in org_sentences]

scores.describe()
scores.head()


# Combination Function

#### Standardize

In [None]:
# Standardize every ranking measure so that the measures are on a comparable scale
# before they are combined.
# fit_transform is called on a StandardScaler instance in the usual way instead of
# through the class with an explicit `self`. By default it returns a plain array, so
# the row and column labels of `scores` are put back.
scores_standardized = StandardScaler().fit_transform(scores)
scores_standardized = pd.DataFrame(
    scores_standardized, index=scores.index, columns=scores.columns
)
scores_standardized

#### Calculate Summarization Length (number of sentences)

In [None]:
# Number of sentences in the summary: `percentage` of the original sentence count,
# but at least one sentence and never more than there are, so that a short article
# does not produce an empty summary.
num_of_org_sentences = len(org_sentences)
summarization_num_sentences = min(
    num_of_org_sentences, max(1, round(num_of_org_sentences * percentage))
)

print("summarization: ", summarization_num_sentences, "\noriginal: ", num_of_org_sentences)

#### Combine
* Combine the scores into one overall score
* add weight and/or ML if time allows

In [None]:
# Combination function: the overall score of a sentence is the unweighted sum of its
# standardized scores. (This is where ML would go if we do it.)
final_score = scores_standardized.sum(axis=1)

# Row labels of the highest-scoring sentences, best first. keep='all' also returns
# the sentences that tie with the last one, so there can be more than requested.
top_scores = final_score.nlargest(summarization_num_sentences, keep='all')
best_sentences = top_scores.index.values
print(best_sentences)

print(org_sentences[0])


# Assemble output 
* Reassemble according to overall score ranking
* Output summarization 

In [None]:
# Assemble Output
def _print_sentences(positions):
    """Print the original sentence at each of the given positions, one per line."""
    for position in positions:
        print(org_sentences[position])


# Summary whose length follows from `percentage`
print("Percentage:\n")
_print_sentences(best_sentences)

print("\n\n")

# Summary limited to the three best-ranked sentences
print("N sentences:\n")
_print_sentences(best_sentences[:3])

# TODO: maybe write the summary to a file or similar

In [None]:
# Newspaper Summarization
# Runs the newspaper library's own nlp step on the article and prints the summary it
# produces (presumably as a reference to compare the summaries above against).
article.nlp()
print("\nNewspaper3k: \n", article.summary)