# <span style="color:#0077b6"> <center> Text Mining and Search - AA 2020/2021 </center>

## <span style="color:#0077b6"> <center> LSA </center>

> <span style="color:#00b4d8">**Studente**:</span> Campironi Matteo
>
> <span style="color:#00b4d8">**Matricola**:</span> 801850

> <span style="color:#00b4d8">**Studente**:</span> Di Maggio Serena
>
> <span style="color:#00b4d8">**Matricola**:</span> 821063

## Importo le librerie necessarie

In [1]:
import pandas as pd
import numpy as np
import os
import re
import unicodedata
import matplotlib.pyplot as plt
import nltk
from nltk import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance
from operator import itemgetter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

import string
import pickle
import contractions
from collections import Counter
from sklearn.decomposition import TruncatedSVD 
from rouge import Rouge 

In [2]:
rouge = Rouge()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
WHITE_SPACE_PATTERN = re.compile(r' +')

## Definisco funzioni utili

In [3]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

def stringPreprocessing(text):
    text = text.lower() #lowercase
    text = contractions.fix(text) #contractions
    text = re.sub(r'[^\w\s]', '', text) #punctuation
    text = re.sub(r'\d', '', text) #numbers
    text = re.sub(WHITE_SPACE_PATTERN, ' ', text.strip()) #whitespaces
    text = lemmatize_sentence(text) #lemmatization
    tokenizedText = word_tokenize(text) #tokenize
    finalText= [item for item in tokenizedText if item not in stop_words] #stopwords
    finalText = ' '.join(map(str, finalText))
    
    return finalText

##  LSA

In [4]:
def LSA(article):
    sentences = sent_tokenize(article)
    sentencesPP = [stringPreprocessing(sentence) for sentence in sentences]
    
    tfidf_matrix = tfidf_vectorizer_fit.transform(sentencesPP) #document-term matrix
    
    svd = TruncatedSVD(10, algorithm='randomized', n_iter=5, random_state=123)
    lsa = svd.fit_transform(tfidf_matrix)
    sigma = svd.singular_values_
    
    column_names = ["Topic {}".format(str(i)) for i in range(lsa.shape[1])]
    sent_topic_matrix = pd.DataFrame(lsa,columns=column_names)
    sent_topic_matrix["Document"] = sentences#PP
    sent_topic_matrix["Position"] = [i for i in range(len(sentences))]
    dic = tfidf_vectorizer.get_feature_names()
    term_topic_matrix = pd.DataFrame(svd.components_, index = column_names, columns = (dic)).T
    
    return sigma, sent_topic_matrix, term_topic_matrix, lsa

In [5]:
def summarize(sigma, sent_topic_matrix):
    summary = []
    column_names = ["Topic {}".format(str(i)) for i in range(len(sigma))]
    for i in range(len(column_names)):
        topic = column_names[i]
        sent_topic_matrix.sort_values(by = topic,inplace = True,ascending = False)
        sent_topic_matrix.reset_index(drop = True, inplace = True)
        item = (sent_topic_matrix["Document"][0],sent_topic_matrix["Position"][0])
        if item not in summary:
            summary.append(item)
                
    summary.sort(key = lambda x: x[1]) #ordino per valore di Position
    sent = [i[0] for i in summary] #considero solo le frasi 
    #return " ".join(sent) 
    return sent

def MMR(sentence, top_summary, article, l=0.5):

    sentence = stringPreprocessing(sentence)
    
    sentence = tfidf_vectorizer_fit.transform([sentence])
    article = tfidf_vectorizer_fit.transform([article])
    
    sim1 = cosine_similarity(sentence, article)
    
    sim2 = []
    for sent in top_summary:
        sent = stringPreprocessing(sent)
        sent = tfidf_vectorizer_fit.transform([sent])
        sim2.append(cosine_similarity(sentence, sent))
        
    mmr = l*sim1 + (1-l)*max(sim2)
    
    return mmr[0][0]

## Carico il dataset

In [6]:
df = pd.read_csv("df.csv")
df

Unnamed: 0,article,summary,articlePP
0,As ash from Chile's Calbuco Volcano spread eas...,Volcano already has erupted twice this week. I...,ash chile calbuco volcano spread east argentin...
1,Baltimore Ravens star running back Ray Rice wa...,Baltimore Raven running back Ray Rice is indic...,baltimore raven star run back ray rice indict ...
2,"Nine years after ""Bruce Almighty,"" Universal i...",EW has confirmed that Universal has plans to r...,nine year bruce almighty universal plot second...
3,Former Italian Prime Minister Silvio Berluscon...,"A judge in Italy indicts Sergio Berlusconi, ac...",former italian prime minister silvio berluscon...
4,"Despite most humans' land-centric view, Earth ...",Oceans make life on Earth possible providing o...,despite human landcentric view earth ocean pla...
...,...,...,...
19995,A teenage girl has died after she jumped out o...,Laikyn Field hit the pavement on Saturday when...,teenage girl die jump parent move car argument...
19996,Cash on tap: Scarlet Johansson is being paid 2...,Scarlett Johansson admitted to Mail on Sunday ...,cash tap scarlet johansson pay sodastream appe...
19997,With young children inevitably set to ask a ba...,Fun infographic attempts to explain the scienc...,young child inevitably set ask barrage questio...
19998,Prince Harry proved he is an excellent uncle-t...,Prince was pictured holding the bear as he lan...,prince harry prove excellent uncletobe handdel...


In [7]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer_fit = tfidf_vectorizer.fit(df['articlePP'])

In [8]:
%%time
summaries = []

for i in range(20000):
    sigma, sent_topic_matrix, term_topic_matrix, lsa = LSA(df['article'][i])
    summary = summarize(sigma,sent_topic_matrix)
    mmr = []
    for sent in summary[:min(10, len(summary))]:
        mmr.append(MMR(sent, summary[:10], df["articlePP"][i], 0.5))
    mmr = np.array(mmr)
    mmr_ind = mmr.argsort()[-4:][::-1]
    mmr_sentence = [summary[i] for i in mmr_ind]
    summaries.append(' '.join(mmr_sentence))
    
df["summary_tr"] = summaries

  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var
  self.explained_variance_ratio_ = exp_var / full_var


CPU times: user 8h 34min 39s, sys: 4h 59min 47s, total: 13h 34min 27s
Wall time: 5h 53min 12s


In [15]:
rouge1 = []
rouge2 = []
rougel = []

for i in range(19999):
    scores = rouge.get_scores(df["summary"][i], df["summary_tr"][i])
    rouge1.append(scores[0]["rouge-1"]["f"])
    rouge2.append(scores[0]["rouge-2"]["f"])
    rougel.append(scores[0]["rouge-l"]["f"])
    
df["rouge1"] = rouge1
df["rouge2"] = rouge2
df["rougel"] = rougel
df

Unnamed: 0,article,summary,articlePP,summary_tr,rouge1,rouge2,rougel
0,As ash from Chile's Calbuco Volcano spread eas...,Volcano already has erupted twice this week. I...,ash chile calbuco volcano spread east argentin...,As ash from Chile's Calbuco Volcano spread eas...,0.178218,0.000000,0.175824
1,Baltimore Ravens star running back Ray Rice wa...,Baltimore Raven running back Ray Rice is indic...,baltimore raven star run back ray rice indict ...,"Rice's attorney, Michael Diamondstein, could n...",0.211538,0.019608,0.208333
2,"Nine years after ""Bruce Almighty,"" Universal i...",EW has confirmed that Universal has plans to r...,nine year bruce almighty universal plot second...,"This wouldn't be the first ""Bruce"" sequel; ""Ev...",0.466258,0.360248,0.478261
3,Former Italian Prime Minister Silvio Berluscon...,"A judge in Italy indicts Sergio Berlusconi, ac...",former italian prime minister silvio berluscon...,"Silvio Berlusconi, so long!. Because of his ag...",0.109890,0.000000,0.075000
4,"Despite most humans' land-centric view, Earth ...",Oceans make life on Earth possible providing o...,despite human landcentric view earth ocean pla...,And it's not just the trenches that need to be...,0.125654,0.042328,0.111111
...,...,...,...,...,...,...,...
19994,A teenage girl has died after she jumped out o...,Laikyn Field hit the pavement on Saturday when...,teenage girl die jump parent move car argument...,"Tragedy: Laikyn Field, 16, hit the pavement af...",0.369863,0.097222,0.336134
19995,Cash on tap: Scarlet Johansson is being paid 2...,Scarlett Johansson admitted to Mail on Sunday ...,cash tap scarlet johansson pay sodastream appe...,The charity insists it is incompatible for Joh...,0.174603,0.000000,0.166667
19996,With young children inevitably set to ask a ba...,Fun infographic attempts to explain the scienc...,young child inevitably set ask barrage questio...,The total number of presents would set Santa b...,0.358025,0.212500,0.355556
19997,Prince Harry proved he is an excellent uncle-t...,Prince was pictured holding the bear as he lan...,prince harry prove excellent uncletobe handdel...,A friend told the paper that Harry wants to ge...,0.313043,0.053097,0.298851


In [16]:
df.to_csv("df_LSA.csv", index=False)

## Calcolo Rouge medio

In [6]:
rouge.get_scores(df["summary"], df["summary_tr"], avg=True)

{'rouge-1': {'f': 0.2523573592410816,
  'p': 0.3836043218419657,
  'r': 0.2001420237561898},
 'rouge-2': {'f': 0.08181539976081688,
  'p': 0.12685501675886104,
  'r': 0.06426629355976811},
 'rouge-l': {'f': 0.24591101391376807,
  'p': 0.34629038904391,
  'r': 0.20085598389669282}}