# <span style="color:#0077b6"> <center> Text Mining and Search - AA 2020/2021 </center>

## <span style="color:#0077b6"> <center> TextRank </center>

> <span style="color:#00b4d8">**Studente**:</span> Campironi Matteo
>
> <span style="color:#00b4d8">**Matricola**:</span> 801850

> <span style="color:#00b4d8">**Studente**:</span> Di Maggio Serena
>
> <span style="color:#00b4d8">**Matricola**:</span> 821063

## Importo librerie necessarie

In [1]:
import pandas as pd
import numpy as np
import os
import re
import unicodedata
import matplotlib.pyplot as plt

In [2]:
from rouge import Rouge 
import networkx as nx
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import contractions

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
rouge = Rouge()
WHITE_SPACE_PATTERN = re.compile(r' +')

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

## Definisco funzioni 

In [3]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def stringPreprocessing(text):
    text = text.lower() #lowercase
    text = contractions.fix(text) #contractions
    text = re.sub(r'[^\w\s]', '', text) #punctuation
    text = re.sub(r'\d', '', text) #numbers
    text = re.sub(WHITE_SPACE_PATTERN, ' ', text.strip()) #whitespaces
    text = lemmatize_sentence(text) #lemmatization
    tokenizedText = word_tokenize(text) #tokenize
    finalText= [item for item in tokenizedText if item not in stop_words] #stopwords
    finalText = ' '.join(map(str, finalText))
    
    return finalText

In [4]:
def TextRank(article):
    #matrice coseno
    sentences = sent_tokenize(article)
    sentencesPP = [stringPreprocessing(sentence) for sentence in sentences]
    tfidfMat = tfidfVectorizerFit.transform(sentencesPP)
    cosMatrix = cosine_similarity(tfidfMat)
    np.fill_diagonal(cosMatrix, 0)
    
    #textrank
    nx_graph = nx.from_numpy_array(cosMatrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
    summary = []
    
    for i in range(len(ranked_sentences)):
        summary.append(ranked_sentences[:][i][1])   
    
    return summary

def MMR(sentence, top_summary, article, l=0.5):

    sentence = stringPreprocessing(sentence)
    
    sentence = tfidfVectorizerFit.transform([sentence])
    article = tfidfVectorizerFit.transform([article])
    
    sim1 = cosine_similarity(sentence, article)
    
    sim2 = []
    for sent in top_summary:
        sent = stringPreprocessing(sent)
        sent = tfidfVectorizerFit.transform([sent])
        sim2.append(cosine_similarity(sentence, sent))
        
    mmr = l*sim1 + (1-l)*max(sim2)
    
    return mmr[0][0]

## Carico il dataset

In [5]:
df = pd.read_csv("df.csv")
df

Unnamed: 0,article,summary,articlePP
0,As ash from Chile's Calbuco Volcano spread eas...,Volcano already has erupted twice this week. I...,ash chile calbuco volcano spread east argentin...
1,Baltimore Ravens star running back Ray Rice wa...,Baltimore Raven running back Ray Rice is indic...,baltimore raven star run back ray rice indict ...
2,"Nine years after ""Bruce Almighty,"" Universal i...",EW has confirmed that Universal has plans to r...,nine year bruce almighty universal plot second...
3,Former Italian Prime Minister Silvio Berluscon...,"A judge in Italy indicts Sergio Berlusconi, ac...",former italian prime minister silvio berluscon...
4,"Despite most humans' land-centric view, Earth ...",Oceans make life on Earth possible providing o...,despite human landcentric view earth ocean pla...
...,...,...,...
19995,A teenage girl has died after she jumped out o...,Laikyn Field hit the pavement on Saturday when...,teenage girl die jump parent move car argument...
19996,Cash on tap: Scarlet Johansson is being paid 2...,Scarlett Johansson admitted to Mail on Sunday ...,cash tap scarlet johansson pay sodastream appe...
19997,With young children inevitably set to ask a ba...,Fun infographic attempts to explain the scienc...,young child inevitably set ask barrage questio...
19998,Prince Harry proved he is an excellent uncle-t...,Prince was pictured holding the bear as he lan...,prince harry prove excellent uncletobe handdel...


## TextRank

In [6]:
tfidfVectorizer = TfidfVectorizer()
tfidfVectorizerFit = tfidfVectorizer.fit(df['articlePP'])

In [7]:
%%time
summaries = []

for i in range(20000):
    tr_sentence = TextRank(df["article"][i])
    mmr = []
    for sent in tr_sentence[:min(10, len(tr_sentence))]:
        mmr.append(MMR(sent, tr_sentence[:10], df["articlePP"][i], 0.8))
    mmr = np.array(mmr)
    mmr_ind = mmr.argsort()[-4:][::-1]
    mmr_sentence = [tr_sentence[i] for i in mmr_ind]
    summaries.append(' '.join(mmr_sentence))
    
df["summary_tr"] = summaries

CPU times: user 2h 55min 15s, sys: 47.8 s, total: 2h 56min 3s
Wall time: 2h 56min 44s


In [26]:
rouge1 = []
rouge2 = []
rougel = []

for i in range(19999):
    scores = rouge.get_scores(df["summary"][i], df["summary_tr"][i])
    rouge1.append(scores[0]["rouge-1"]["f"])
    rouge2.append(scores[0]["rouge-2"]["f"])
    rougel.append(scores[0]["rouge-l"]["f"])
    
df["rouge1"] = rouge1
df["rouge2"] = rouge2
df["rougel"] = rougel
df

Unnamed: 0,article,summary,articlePP,summary_tr,rouge1,rouge2,rougel
0,As ash from Chile's Calbuco Volcano spread eas...,Volcano already has erupted twice this week. I...,ash chile calbuco volcano spread east argentin...,As ash from Chile's Calbuco Volcano spread eas...,0.350877,0.196429,0.383838
1,Baltimore Ravens star running back Ray Rice wa...,Baltimore Raven running back Ray Rice is indic...,baltimore raven star run back ray rice indict ...,Baltimore Ravens star running back Ray Rice wa...,0.345324,0.160584,0.393162
2,"Nine years after ""Bruce Almighty,"" Universal i...",EW has confirmed that Universal has plans to r...,nine year bruce almighty universal plot second...,"This wouldn't be the first ""Bruce"" sequel; ""Ev...",0.478528,0.360248,0.496350
3,Former Italian Prime Minister Silvio Berluscon...,"A judge in Italy indicts Sergio Berlusconi, ac...",former italian prime minister silvio berluscon...,Former Italian Prime Minister Silvio Berluscon...,0.222222,0.057143,0.184615
4,"Despite most humans' land-centric view, Earth ...",Oceans make life on Earth possible providing o...,despite human landcentric view earth ocean pla...,"But to truly explore ocean trenches, scientist...",0.072917,0.000000,0.076923
...,...,...,...,...,...,...,...
19994,A teenage girl has died after she jumped out o...,Laikyn Field hit the pavement on Saturday when...,teenage girl die jump parent move car argument...,"Tragedy: Laikyn Field, 16, hit the pavement af...",0.478632,0.330435,0.517647
19995,Cash on tap: Scarlet Johansson is being paid 2...,Scarlett Johansson admitted to Mail on Sunday ...,cash tap scarlet johansson pay sodastream appe...,Johansson insists SodaStream provides employme...,0.274510,0.060000,0.243902
19996,With young children inevitably set to ask a ba...,Fun infographic attempts to explain the scienc...,young child inevitably set ask barrage questio...,The total number of presents would set Santa b...,0.604317,0.437956,0.628571
19997,Prince Harry proved he is an excellent uncle-t...,Prince was pictured holding the bear as he lan...,prince harry prove excellent uncletobe handdel...,It comes as Prince Harry's romance with girlf...,0.234234,0.036697,0.214286


In [27]:
df.to_csv("df_TR.csv", index=False)

## Calcolo Rouge medio

In [5]:
rouge.get_scores(df["summary"], df["summary_tr"], avg=True)

{'rouge-1': {'f': 0.2587321047985369,
  'p': 0.4127265486466989,
  'r': 0.19952998429040725},
 'rouge-2': {'f': 0.09129079812971089,
  'p': 0.14763297629325675,
  'r': 0.06986719936517753},
 'rouge-l': {'f': 0.2633981419785787,
  'p': 0.3677089842842674,
  'r': 0.2174226790720997}}

## Esempio di riassunto

In [30]:
df.rouge1[19997]

0.234234229731353

In [31]:
df.summary[19997]

'Prince was pictured holding the bear as he landed at Kensington Palace. Promised girl he would pass on bear to the Duke and Duchess of Cambridge. Was given the present during visit to brain injury charity in Nottingham'

In [32]:
df.summary_tr[19997]

"It comes as Prince Harry's romance  with girlfriend Cressida Bonas is reportedly 'on the rocks' because she  is not ready to marry. 'On the rocks': Prince Harry's romance with girlfriend Cressida Bonas is reportedly in trouble. A friend told the paper that Harry wants to get married and settle down with Cressida. True to his word: Prince Harry pictured holding the blue bear he promised to give to his unborn niece or nephew."