# Abstractive summarization

## T5 transformer

In [40]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

In [41]:

def T5_summarization_model (text):
    # initialize the model architecture and weights
    t5model = T5ForConditionalGeneration.from_pretrained("t5-base")
    # initialize the model tokenizer
    t5tokenizer = T5Tokenizer.from_pretrained("t5-base")
    device = torch.device('cpu')
    
    t5tokenized_text = t5tokenizer.encode("summarize:"+ text,
                                    truncation=True,
                                    return_attention_mask=True,
                                    add_special_tokens=True, 
                                    padding='max_length',     
                                    return_tensors="pt").to(device)


    t5summary_ids =  t5model.generate(input_ids=t5tokenized_text,
                    num_beams=3,  ##modelo olha para 3 possiveis palavras
                    min_length=20, ## number min of tokens
                    max_length=70,  ##number maximo of tokens
                    repetition_penalty=1.0,
                    early_stopping=True)

    output = t5tokenizer.decode(t5summary_ids[0],  
                             skip_special_tokens=True, 
                             clean_up_tokenization_spaces=True)
    return  output



## Summarization with BERT Model

In [4]:
import torch
import json
from summarizer import Summarizer
from summarizer import TransformerSummarizer

In [45]:


def bert_summarization_model (text):
    #Create default summarizer model
    # By default bert-extractive-summarizer uses the ‘bert-large-uncased‘ pretrained model.
    bert_model = Summarizer()

    summary = ''.join(bert_model(text, max_length=50))
    return summary 

In [None]:
for file in os.listdir("../../../extractors/Snopes/extractions")[:5]:
    with open("../../../extractors/Snopes/extractions/" + file) as f:
            data = json.load(f)
            f.close()

    summary = bert_summarization_model (' '.join(data['postText']))
    print(summary)

## Summarization with GPT2 Model

In [46]:
def gtp2_summariation_model(text):
    GPT2_model = TransformerSummarizer(transformer_type="GPT2",
                         transformer_model_key="gpt2-medium")
    summary = ''.join(GPT2_model(text,  max_length=50))
    return summary

In [17]:
for file in os.listdir("../../../extractors/Snopes/extractions")[:1]:
    with open("../../../extractors/Snopes/extractions/" + file) as f:
            data = json.load(f)
            f.close()

    summary = gtp2_summariation_model (' '.join(data['postText']))
    print(summary)

Valentine’s Day is a holiday invented by greeting card companies to make people feel like crap,” Carrey’s character, Joel, stated. English audiences embraced the idea of February mating. According to the Encyclopedia Britannica, it wasn’t until the 1700s that card sellers began selling commercially printed valentines. “ Howland “started making her first valentines in 1848 with an assembly-line operation.


## Summarization with XLNet

In [47]:
def XLNet_summarization_model (text):
    xlnet_model = TransformerSummarizer(transformer_type="XLNet",
                     transformer_model_key="xlnet-base-cased")
    summary = ''.join(xlnet_model(text, max_length=50))

    return summary

In [36]:
for file in os.listdir("../../../extractors/Snopes/extractions")[2:3]:
    with open("../../../extractors/Snopes/extractions/" + file) as f:
            data = json.load(f)
            f.close()

    summary = XLNet_summarization_model (' '.join(data['postText']))
    print(summary)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetModel: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


One ad read: “Unique Trick If Your Car Has Automatic Headlights.” Another said: “An Unusual Tip If Your Car Has Automatic Headlights.”


## Summaziton with BART MODEL 

#### From META

In [65]:
from transformers import pipeline

def bart_summarization_model(text):
    summarizer = pipeline('summarization', model='facebook/bart-base', tokenizer='facebook/bart-large-cnn')
   
    summary = summarizer(text, max_length = 50, truncation=True)[0]['summary_text']
    return summary

  

## Choose 500 posts for test

In [63]:
import os
import json
import pandas as pd

In [70]:
df = pd.DataFrame()

for file in os.listdir("../../../extractors/Snopes/extractions")[:100]:
    with open("../../../extractors/Snopes/extractions/" + file) as f:
            data = json.load(f)
            f.close()



    summary_T5 = T5_summarization_model (' '.join(data['postText']))
    summary_BERT = bert_summarization_model(' '.join(data['postText']))
    summary_BART = bart_summarization_model(' '.join(data['postText']))
    summary_XLNET =  XLNet_summarization_model(' '.join(data['postText']))
    summary_GTP2 = gtp2_summariation_model (' '.join(data['postText']))
    dict = {'id': data['id'], 'allegation':data['allegation'], 'evaluation':data['evaluation'], 'T5':summary_T5, 'BERT': summary_BERT, 'BART': summary_BART, 'XLNet':summary_XLNET , 'GTP2': summary_GTP2 }

    df = df.append(dict, ignore_index = True)


df.to_csv('abs_summarization.csv')



    

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a Bert