## Extraktionsbaserad textsammanfattare med olika rankningsmått 

In [3]:
# Imports
import pandas as pd
from sklearn.preprocessing import StandardScaler
import numpy as np
import nltk 
import ssl

# Fixes some errors, found online at https://github.com/gunthercox/ChatterBot/issues/930#issuecomment-322111087
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [4]:
# Web scrapping -->  module för att ladda ner artiklar 
from newspaper import Article
text = "https://www.aftonbladet.se/nyheter/a/kE6ExL/sd-far-tunga-poster-i-utskotten"#'https://www.svt.se/nyheter/utrikes/stall-dina-fragor-om-kriget-till-svt-s-utrikesreportrar'
article = Article(text, language='sv')
article.download()
article.parse()
#article.nlp()
#article.summary

text = article.text

# Beroende på vilken hemsida nyheten kommer ifrån kan titeln och texten inehålla delar av sidan man egentligen inte bryr sig om
# T.ex. från aftonbladet är titeln med i texten och texten innehåller en mening som: "publicerad: 30 sep", man kan ta bort detta men 
# det blir om vi får tid över.
print('Title:' , article.title, '\n\nText: \n', text)

Title: SD får tunga poster i utskotten 

Text: 
 SD får tunga poster i utskotten

Publicerad: 30 september Uppdaterad: 30 september

Sverigedemokraterna får ordförandeposten i riksdagens justitie- och utrikesutskott.

Nu går ledarna för vänsterblocket till hård attack.

– Det är skrämmande, ganska chockartat, säger Socialdemokraternas gruppledare Lena Hallengren till Aftonbladet.

Sverigedemokraterna , Moderaterna, Kristdemokraterna och Liberalerna har delat upp posterna i utskotten och EU-nämnden.

Där tar Sverigedemokraterna flera viktiga poster.

Bland annat tilldelas partiet ordförandeposten i arbetsmarknadsutskottet, näringsutskottet, justitieutskottet samt utrikesutskottet, enligt ett pressmeddelande.

De erhåller även posten som vice ordförande i civilutskottet, trafikutskottet, försvarsutskottet samt skatteutskottet.

– Det som överraskade mig mest, men som jag kan se varför de vill ha, är ordförandeposten i utrikesutskottet. Det är ett tecken på att de lyckats i förhandlingen 

# Preprocessing 

## Overview
### List 1 - sentences
* Varje mening separat

### Dataframe - scores
#### columns are the score of each ranking measure
* Baseline
* Headings
* TF/IDF-score
* NER
* ~~Class~~ //Om vi har tid för ML

### List 2 -> Cleaned for Stop Words 
* Varje mening separat 

###

##### List 1

In [47]:
# removes endlines:
from token import NEWLINE


org_sentences = text.replace('\n\n', '. ')
# creates some exceptions from above rule
org_sentences = org_sentences.replace('.. ', '. ')
org_sentences = org_sentences.replace(':. ', ': ')
org_sentences = org_sentences.split('. ')

org_sentences[0:5]

['SD får tunga poster i utskotten',
 'Publicerad: 30 september Uppdaterad: 30 september',
 'Sverigedemokraterna får ordförandeposten i riksdagens justitie- och utrikesutskott',
 'Nu går ledarna för vänsterblocket till hård attack',
 '– Det är skrämmande, ganska chockartat, säger Socialdemokraternas gruppledare Lena Hallengren till Aftonbladet']

##### Dataframe

In [6]:
index = range(org_sentences.__len__())
columns = ['Baseline', 'Headings', 'TF/IDF', 'NER']
scores = pd.DataFrame(index=index, columns=columns)
scores.fillna(0, inplace=True)
scores.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Baseline  34 non-null     int64
 1   Headings  34 non-null     int64
 2   TF/IDF    34 non-null     int64
 3   NER       34 non-null     int64
dtypes: int64(4)
memory usage: 1.2 KB


### Stop Word Filtering
* Hur stor korpus ska vi ha? 

In [7]:
from nltk.corpus import stopwords
#nltk.download('stopwords')

swe_stop_words = stopwords.words('swedish')
print(len(swe_stop_words))

# Snowball Swedish Stop Words
snowball_sw = pd.read_csv("resources/stop_words_swedish_snowball .txt")
# print(snowball_sw)

# Borrowed from https://github.com/peterdalle/svensktext/tree/master/stoppord
def get_stopwords(wordlist = "standard"):
    if wordlist == "standard":
        url = "https://raw.githubusercontent.com/peterdalle/svensktext/master/stoppord/stoppord.csv"
    elif wordlist == "many":
        url = "https://raw.githubusercontent.com/peterdalle/svensktext/master/stoppord/stoppord-mycket.csv"
    elif wordlist == "politics":
        url = "https://raw.githubusercontent.com/peterdalle/svensktext/master/stoppord/stoppord-politik.csv"
    else:
        raise ValueError("Argument 'wordlist' must be 'standard', 'many' or 'politics', not '{}'.".format(wordlist))
    return pd.read_csv(url, header=1, encoding="utf-8")

stopwords = get_stopwords()
# print(stopwords)

114


In [8]:
# Perform Stop Word Filtering On Example (Original Text) 
def stop_word_filtering(original_text): 
    swf_text = None
    return swf_text

### Lemmatizer & NER 

* LEmmatizer verkar ok 
* NER --> helt klart bristande -- men det kan duga? 

# REMOVE DUPLICATE ENTITIES 

In [64]:
# Spacy NLP Pipeline
# spaCy + Lemmy (https://github.com/sorenlind/lemmy) ??? --> Testa om det blir bättre täckning (recall)? 
from enum import unique
from html import entities
import spacy
#from spacy.lang.sv.examples import sentences 

# Credit to Explosion for sv_core_news_sm --> https://github.com/explosion 

nlp = spacy.load("sv_core_news_sm")

doc = nlp(' '.join(org_sentences))


#print(doc.text)
#print(doc.ents) # --> KANSKE ATT DET STORA PAKETET ÄR BÄTTRE
named_entities = doc.ents.__str__()
print(type(named_entities))

for token in doc:
    #print(token)
    #print(token.text, token.pos_, token.dep_)
    #print(token.lemma_)
    pass

<class 'str'>


In [10]:
# Lemmatizer 
# Stanza https://stanfordnlp.github.io/stanza/installation_usage.html + Språkbanken https://spraakbanken.gu.se/en/resources/stanzalem
# https://nlp.johnsnowlabs.com/2020/05/05/lemma_sv.html

# Ranking Measures

### Baseline
(1/N --> n=ordning mening kommer i dvs. första meningen får N=1 -> 1/1, andra meningen får N=2 -> 1/2, osv.)

In [11]:
# Ranking metric 1 --> Baseline
for i, score in enumerate(scores['Baseline']) :
    scores['Baseline'][i] = 1/((i+1))
scores.describe()

Unnamed: 0,Baseline,Headings,TF/IDF,NER
count,34.0,34.0,34.0,34.0
mean,0.121124,0.0,0.0,0.0
std,0.183991,0.0,0.0,0.0
min,0.029412,0.0,0.0,0.0
25%,0.038846,0.0,0.0,0.0
50%,0.05719,0.0,0.0,0.0
75%,0.108333,0.0,0.0,0.0
max,1.0,0.0,0.0,0.0


### Headings

In [30]:
# Ranking metric 2 --> Headings
# Sets all 'Headings' scores to 0, mostly for testing so i can run this multiple times, 
# but also to make sure nothing weird has happened earlier in the code.
scores['Headings'] = 0
for i, sentence in enumerate(org_sentences):
    for word in article.title.split(' '):
        if word in sentence:
            scores.at[i, 'Headings'] += 1
scores.describe()

Unnamed: 0,Baseline,Headings,TF/IDF,NER
count,34.0,34.0,34.0,34.0
mean,0.121124,1.352941,0.0,0.0
std,0.183991,1.01152,0.0,0.0
min,0.029412,0.0,0.0,0.0
25%,0.038846,1.0,0.0,0.0
50%,0.05719,1.0,0.0,0.0
75%,0.108333,1.0,0.0,0.0
max,1.0,6.0,0.0,0.0


### TF*IDF-score
* Diskutera hur vi kan använda måtten 
* If similarity is close --> Similar content --> Remove redundance?  



https://forketyfork.medium.com/latex-math-formulas-a-cheat-sheet-21e5eca70aae

In [13]:
# Ranking metric 3 --> TF*IDF

# Term Weights --> Calculate importance of single words in text/doc
# Binary term weights --> document specific
# TF*IDF term weights --> document-collection specific 

# Assign weights to each dimension (attr/word) of each sentence (record/example) 

# Term Frequency (TF-score) --> TFij == frequency of the jth term in in the ith doc 

# Inverse Document Frequency 
# idf-score of the jth term measures the uniqueness of the jth term in the collection of documents
# IDFj = log(M / Nj)
#
# M = total num of docs in collection 
# Nj is the number of documents that contain the jth term

# HIGH TF*IDF-score 
# Word frequent in document && Occur in few documents of the collection 
# LOW TF*IDF-score
# Not present in document || present in all documents of the collection 

### NER 
* (nltk lib) --> (Meningar med Named Entities är troligtvis viktigare)

In [71]:
scores['NER'] = 0

# Hittar inte allt p.g.a sentece.split(' ') där den splittar namn som t.ex "ulf kristensson" till "ulf" och "kristensson"
for i, sentence in enumerate(org_sentences) :
    for word in sentence.split(' '):
        if word in named_entities.split(' '):
            print(i, word)
            scores.at[i, "NER"] += 1

scores.describe()


4 Lena
5 Moderaterna,
10 KD,
10 Aftonbladets
10 My
16 Ulf
16 Lena
16 Hallengren,
25 Martin
25 Markus
26 Marcus
26 Martin
26 Kinnunen,
31 Jakob


Unnamed: 0,Baseline,Headings,TF/IDF,NER
count,34.0,34.0,34.0,34.0
mean,0.121124,1.352941,0.0,0.411765
std,0.183991,1.01152,0.0,0.924995
min,0.029412,0.0,0.0,0.0
25%,0.038846,1.0,0.0,0.0
50%,0.05719,1.0,0.0,0.0
75%,0.108333,1.0,0.0,0.0
max,1.0,6.0,0.0,3.0


# Combination Function

## Overview
### Standardize
* standardize all scores
### Combine
* Combine the scores into one overall score
* add weight and/or ML if time allows

In [70]:
# Standardize
scores_standardized = StandardScaler.fit_transform(self=StandardScaler(), X=scores)
scores_standardized = pd.DataFrame(scores_standardized, columns=columns)
scores_standardized

Unnamed: 0,Baseline,Headings,TF/IDF,NER
0,4.848572,4.663223,0.0,-0.451848
1,2.090179,-0.354169,0.0,-0.451848
2,1.170715,0.649309,0.0,-0.451848
3,0.710982,-0.354169,0.0,-0.451848
4,0.435143,-0.354169,0.0,0.645497
5,0.25125,1.652788,0.0,0.645497
6,0.119898,0.649309,0.0,-0.451848
7,0.021384,-0.354169,0.0,-0.451848
8,-0.055238,-0.354169,0.0,-0.451848
9,-0.116536,-0.354169,0.0,-0.451848


In [16]:
# Combination Function
# Här ligger ML om vi gör det  



# Assemble output 
* Reassemble according to overall score ranking
* Output summarization 

In [17]:
# Assemble Output 

In [18]:
# Run 