# Extractive summarization

## Using NLTK

In [1]:

import nltk
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize




In [2]:
# Load Packages
from sumy.parsers.plaintext import PlaintextParser
from sumy.summarizers.text_rank import TextRankSummarizer

from sumy.nlp.tokenizers import Tokenizer





In [3]:
## Sumy library
def text_rank (text):
    parser = PlaintextParser.from_string(text,Tokenizer("english"))

    # Summarize using sumy TextRank
    summarizer = TextRankSummarizer()
    summary =summarizer(parser.document,1)
    text_summary=""

    for sentence in summary:
        text_summary +=str(sentence)

    return (text_summary)

In [4]:
## using NLTK
def nltk_extractive_sumy(text):

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text)

    freqTable = dict()  ## Frequencia das palavras
    for word in words:
        word = word.lower()
        if word in stopWords:
            continue
        if word in freqTable:   
            freqTable[word] += 1
        else:
            freqTable[word] = 1



    sentences = sent_tokenize(text)  
    sentenceValue = dict() ## Importancia das frases

    for sentence in sentences:
        for word, freq in freqTable.items():
            if word in sentence.lower():
              if sentence in sentenceValue:
                sentenceValue[sentence] += freq
              else:
                sentenceValue[sentence] = freq

    summary = ''

    sentenceValue = dict(sorted(sentenceValue.items(), key=lambda item: item[1], reverse=True))
   
    for sentence in sentenceValue:

        if len(summary)<230:
        #and (sentenceValue[sentence] > (1.40 * average)):
            summary += " " + sentence

    return (summary)


In [5]:
# Import the summarizer
from sumy.summarizers.lsa import LsaSummarizer

# Parsing the text string using PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser


def LSA_extractive_sumy (original_text):
    parser=PlaintextParser.from_string(original_text,Tokenizer('english'))
    lista = []
    # creating the summarizer
    lsa_summarizer=LsaSummarizer()
    lsa_summary= lsa_summarizer(parser.document,1)
    for sentence in lsa_summary:
        lista.append(sentence)

    return lista

In [8]:
# Import the summarizer
from sumy.summarizers.luhn import LuhnSummarizer


from sumy.nlp.tokenizers import Tokenizer
from sumy.parsers.plaintext import PlaintextParser

def LuhnSum (original_text):
    parser=PlaintextParser.from_string(original_text,Tokenizer('english'))

    #  Creating the summarizer
    luhn_summarizer=LuhnSummarizer()
    luhn_summary=luhn_summarizer(parser.document,sentences_count=1)

    # Printing the summary
    for sentence in luhn_summary:
        print(sentence)

    return 0

In [9]:

with open("../../../extractors/Snopes/example_1.json") as f:
            d = json.load(f)
            f.close()
text = ' '.join(d['postText'])

textRank_sumy = text_rank(text) 
textRank_nltk = nltk_extractive_sumy(text)
print(textRank_sumy)
print (textRank_nltk)


lsa = LSA_extractive_sumy (text)
luhn = LuhnSum(text)

print(lsa)

That was because while something depicted in the grainy, black-and-white version of the photograph taken by a camera aboard the rover could be interpreted to resemble the shape of a door, the agency’s Jet Propulsion Laboratory (JPL) told Snopes that it actually was a “very, very, very zoomed in shot of a tiny crevice in a rock.” “The team’s scientists underlined just how small [the crevice] is: roughly 30 centimeters wide and 45 centimeters across (11 by 17 inches),” a JPL spokesperson said via email.
[<Sentence: “They said there are linear fractures throughout this outcrop, and this is a location where several linear fractures happen to intersect.” The photograph was taken by the Mast Camera (Mastcam) outfitted aboard Curiosity, a system that uses fixed-focal length, multispectral imagers to capture “true color” images of the red planet and beyond.>]


In [None]:
textRank_sumy = text_rank(text) 
textRank_nltk = nltk_extractive_sumy(text)
print(textRank_sumy)
print (textRank_nltk)

#### Guardar no dataset 

In [10]:
import json, os
import pandas as pd
import numpy as np



In [None]:

 

data = pd.DataFrame()

for file in os.listdir("../../../extractors/Snopes/extractions")[:100]:
    with open("../../../extractors/Snopes/extractions/" + file) as f:
            d = json.load(f)
            f.close()
    text = ' '.join(d['postText'])

    dic = {'id': d['id'], 'textRank_sumy': text_rank(text) , 'textRank_nltk': nltk_extractive_sumy(text) }

    data = data.append(dic, ignore_index = True)

    data.to_csv('ext_summarization.csv')



In [21]:

df_abs = pd.read_csv('abs_summarization.csv', index_col=0)
df_abs.head()

df_ext = pd.read_csv('ext_summarization.csv', index_col=0)
df_ext.head()


for ((i_a, r_a),(i_e, r_e))  in zip(df_abs.iterrows(),df_ext.iterrows()):
        if (r_a[0] == r_e[0]): ## tem o mesmo id
                df_abs.loc[i_a,'textRank_sumy'] = r_e[1]
                df_abs.loc[i_a,'textRank_nltk'] = r_e[2]


df_abs.to_csv('summarization.csv')


In [22]:
df = pd.read_csv('summarization.csv', index_col=0)
df.head()

Unnamed: 0,id,allegation,evaluation,T5,BERT,BART,XLNet,GTP2,textRank_sumy,textRank_nltk
0,1650725000.0,Valentine's Day was invented by greeting card ...,false,the holiday falls yearly on february 14. some ...,And he called her his “very gentle Valentine.”...,"The idea that Valentine’s Day, which falls yea...",And he called her his “very gentle Valentine.”...,And he called her his “very gentle Valentine.”...,Although one can’t factually argue the holiday...,Although one can’t factually argue the holida...
1,1650726000.0,A video shows truckers from South Carolina on ...,miscaptioned,video shows truckers from south carolina in th...,,"In January 2022, as hundreds of truckers drove...",,,"In January 2022, as hundreds of truckers drove...","In January 2022, as hundreds of truckers drov..."
2,1650725000.0,An online advertisement revealed an unusual or...,false,"for at least several years, online advertiseme...","Azrieli Center, 26 Harokmim St., Holon, Israel.”","For at least several years, online advertiseme...","Azrieli Center, 26 Harokmim St., Holon, Israel.”","Azrieli Center, 26 Harokmim St., Holon, Israel.”",A spokesperson for Farmers Insurance responded...,A spokesperson for Farmers Insurance responde...
3,1650725000.0,Russia bombed a Biden-owned villa and several ...,false,a fictitious article falsely claimed a villa o...,Real Raw News is not a genuine news source. No...,"In March 2022, as Russia continued its attack ...",Real Raw News is not a genuine news source. No...,Real Raw News is not a genuine news source. No...,A disclaimer on Real Raw News states that the ...,"In March 2022, as Russia continued its attack..."
4,1650751000.0,"In September 2021, an initial news release ann...",true,a typo appeared to suggest that first-term con...,,"In September 2021, some internet users gleeful...",,,The article of impeachment against Harris accu...,"On Sept. 24, for example, political reporter ..."
