## Installation and imports

In [None]:
!pip install --upgrade pip
!pip install transformers
!pip install sentencepiece

In [None]:
from google.colab import drive

from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DistilBertTokenizerFast
from transformers import AutoModelWithLMHead

import pickle as pkl
import math

## DistilBART model comparison

In [None]:
# Load previously stored summaries from Google Drive; start with an empty
# dict when the file cannot be read.
try:
  drive.mount('/content/drive/')
  try:
    # NOTE: pickle can execute arbitrary code on load; only open files you trust.
    with open("/content/drive/MyDrive/[Personal-route]/summarized_papers.pkl", "rb") as summarized_file:
      summarized_papers = pkl.load(summarized_file)
  finally:
    # Unmount even when the file is missing or unreadable so the drive is
    # never left mounted by a failed load.
    drive.flush_and_unmount()
except IOError:
  summarized_papers = {}

In [None]:
def save_changes():
  """Persist the global ``summarized_papers`` dict to Google Drive as a pickle.

  Mounts the drive, overwrites the output file and unmounts again. The file is
  closed and the drive unmounted even if pickling fails.
  """
  # NOTE(review): this output path differs from the one the loading cell
  # reads ('[Personal-route]/summarized_papers.pkl') - confirm that is intended.
  drive.mount('/content/drive/')
  try:
    with open("/content/drive/MyDrive/TFG Uxio y Luis/Notebook/Pruebas/data/Output/summarized_papers_distilbert.pkl", "wb") as summarized_file:
      pkl.dump(summarized_papers, summarized_file)
  finally:
    drive.flush_and_unmount()

In [None]:
def clear_model():
  """Remove the notebook-level ``summarizer``, ``model`` and ``tokenizer`` names.

  Names that are not currently defined are ignored, so the function is safe to
  call repeatedly. The bindings are removed through ``globals()``: a plain
  ``del name`` inside a function targets a *local* variable and raises
  ``UnboundLocalError`` instead of deleting the global.

  ``summarizer`` is dropped as well because the pipeline object is built from
  the model and tokenizer; the objects must be re-created before the next
  ``applyModel`` call.
  """
  namespace = globals()
  for name in ('summarizer', 'model', 'tokenizer'):
    namespace.pop(name, None)

In [None]:
def summarize(summarizer, tokenizer, text, model_name, model_max=1024, min_length=75, max_length=300):
  """Summarize ``text`` and append the result to ``summarized_papers[model_name]``.

  The text is tokenized with ``tokenizer``. If it has at most ``model_max``
  tokens it is summarized in one call; otherwise it is split into consecutive
  token windows, each window is decoded and summarized, and the partial
  summaries are joined with ``join_summaries``. When the joined text has more
  than ``max_length`` whitespace-separated words it is summarized again.

  Whenever the summarizer raises, the token limit is lowered by 50 and the
  call is retried. Windows advance by the size that actually succeeded, so no
  tokens are skipped after a retry.

  Args:
    summarizer: callable invoked as ``summarizer(text, min_length=..., max_length=...)``
      and returning a list whose first item has a ``'summary_text'`` key.
    tokenizer: callable returning a mapping with ``'input_ids'``; must also
      provide ``decode(ids)``.
    text: the text to summarize.
    model_name: key of ``summarized_papers`` that receives the summary.
    model_max: maximum number of tokens sent to the summarizer at once.
    min_length: forwarded to the summarizer.
    max_length: forwarded to the summarizer; also the word budget that
      triggers a second summarization pass.

  Returns:
    The summary that was appended to ``summarized_papers[model_name]``.

  Raises:
    ValueError: if ``model_max`` is not positive.
    Exception: the summarizer's own error, once the limit cannot be lowered
      any further (50 tokens or fewer).
  """
  if model_max <= 0:
    raise ValueError(f'model_max must be positive, got {model_max}')

  encoded = tokenizer(text)['input_ids']
  print(len(encoded))

  if len(encoded) <= model_max:
    try:
      summary = summarizer(text, min_length=min_length, max_length=max_length)[0]['summary_text']
    except Exception:
      # Retry with a smaller limit (which may switch to the chunked path),
      # keeping the caller's length settings. Give up when the limit cannot
      # shrink any more instead of recursing forever.
      if model_max <= 50:
        raise
      return summarize(summarizer, tokenizer, text, model_name, model_max - 50, min_length, max_length)
    print(f'Your summary is {summary}')
    summarized_papers[model_name].append(summary)
    return summary

  # The number of tokens can't be handled by the model: summarize it in windows.
  aux_summary = []
  window = model_max
  start = 0
  counter = 1
  while start < len(encoded):
    # Re-estimate the total on every pass because the window may have shrunk.
    iterations = counter - 1 + math.ceil((len(encoded) - start) / window)
    print(f'iteration {counter}/{iterations}')
    while True:
      try:
        mysumm = summarizer(tokenizer.decode(encoded[start:start + window]), min_length=min_length, max_length=max_length)
        break
      except Exception:
        if window <= 50:
          raise
        window -= 50
        print("Shortening limit")
    print(mysumm)
    aux_summary.append(mysumm)
    # Advance by what was actually consumed so a shrunk window loses no tokens.
    start += window
    counter += 1

  # Join the summaries of each division
  resul = join_summaries(aux_summary)
  # If the joined summary is still longer than allowed, summarize it again.
  if len(resul.split()) > max_length:
    print('Too long, compressing')
    return summarize(summarizer, tokenizer, resul, model_name, window, min_length, max_length)
  print(f'Your summary is {resul}')
  summarized_papers[model_name].append(resul)
  return resul

#Join all summaries processed from the text
def join_summaries(aux_summary):
  """Concatenate the partial summaries into one string.

  Each entry of ``aux_summary`` is indexed as ``entry[0]['summary_text']``;
  the pieces are glued together in order with no separator. An empty input
  yields an empty string.
  """
  return ''.join(chunk[0]['summary_text'] for chunk in aux_summary)

def applyModel(model_name, model_max=1024):
  """Summarize every not-yet-summarized paper with the currently loaded model.

  Uses the global ``summarizer`` and ``tokenizer``. Papers are taken from
  ``processed_papers`` starting at the index equal to the number of summaries
  already stored under ``model_name``, so an interrupted run resumes where it
  stopped. Results are saved after each paper and the model is cleared at the
  end.

  Args:
    model_name: key under which summaries are stored in ``summarized_papers``.
    model_max: token limit forwarded to ``summarize``.
  """
  summarized_papers.setdefault(model_name, [])
  pending = processed_papers[len(summarized_papers[model_name]):]
  for paper in pending:
    position = len(summarized_papers[model_name]) + 1
    print(f'Summarizing paper {position} using {model_name} model')
    # Element 1 of each entry is the text handed to the summarizer.
    summarize(summarizer, tokenizer, paper[1], model_name, model_max)
    save_changes()
  clear_model()

In [None]:
# Build the default summarization pipeline and reuse the tokenizer that ships
# with it. Token counts and the decoded chunks produced by summarize() must
# come from the same vocabulary the model uses; the DistilBERT tokenizer has a
# different (lower-casing) vocabulary, so its counts do not match the model's
# input limit.
summarizer = pipeline("summarization")
tokenizer = summarizer.tokenizer

In [None]:
# Ask clear_model() to drop the global model bindings (presumably run before
# loading a different model - the objects must be re-created afterwards).
clear_model()

In [None]:
# Summarize the pending papers under the 'distilbert' key with a 1000-token limit.
# NOTE(review): applyModel reads the global `summarizer` and `tokenizer`, so
# both must exist when this cell runs.
applyModel('distilbert', 1000)

## Complementary model comparison

In [None]:
# Load the "google/bigbird-pegasus-large-arxiv" checkpoint (tokenizer and
# seq2seq model) and wrap both in a summarization pipeline.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
model = AutoModelForSeq2SeqLM.from_pretrained("google/bigbird-pegasus-large-arxiv")
summarizer = pipeline("summarization", tokenizer=tokenizer, model=model)

In [None]:
# Summarize the pending papers under the 'pegasus_arxiv' key with a 4050-token limit.
applyModel('pegasus_arxiv', 4050)

In [None]:
# Start over: replace summarized_papers with a dict holding only an empty
# 'pegasus_arxiv' list, then summarize every entry of processed_papers.
# Unlike applyModel, this loop neither resumes from earlier results nor calls
# save_changes(), and any summaries stored for other models are discarded.
summarized_papers = {'pegasus_arxiv':[]}
for i, paper in enumerate(processed_papers):
    print(f'Summarizing paper {i} using pegasus_arxiv model')
    # paper[1] is the text passed to the summarizer.
    summarize(summarizer, tokenizer, paper[1], 'pegasus_arxiv', 4050)

In [None]:
# Load the "google/pegasus-cnn_dailymail" checkpoint (tokenizer and seq2seq
# model) and wrap both in a summarization pipeline.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")
summarizer = pipeline("summarization", tokenizer=tokenizer, model=model)

In [None]:
# Summarize the pending papers under the 'pegasus_cnn' key with a 1000-token limit.
applyModel('pegasus_cnn', 1000)

In [None]:
# Load the "microsoft/prophetnet-large-uncased-cnndm" checkpoint (tokenizer
# and seq2seq model) and wrap both in a summarization pipeline.
tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/prophetnet-large-uncased-cnndm")
summarizer = pipeline("summarization", tokenizer=tokenizer, model=model)

In [None]:
# Summarize the pending papers under the 'prophetnet' key with a 4050-token limit.
# NOTE(review): confirm 4050 tokens fits this checkpoint's maximum input length.
applyModel('prophetnet', 4050)

In [None]:
# Start over: replace summarized_papers with a dict holding only an empty
# 'prophetnet' list, then summarize every entry of processed_papers.
# Unlike applyModel, this loop neither resumes from earlier results nor calls
# save_changes(), and any summaries stored for other models are discarded.
summarized_papers = {'prophetnet':[]}
for i, paper in enumerate(processed_papers):
    print(f'Summarizing paper {i} using prophetnet model')
    # paper[1] is the text passed to the summarizer.
    summarize(summarizer, tokenizer, paper[1], 'prophetnet', 4050)

In [None]:
# Load the "t5-base" checkpoint and wrap it in a summarization pipeline.
# T5 is an encoder-decoder model, so it is loaded with AutoModelForSeq2SeqLM
# (the class used for the other checkpoints in this notebook) instead of the
# deprecated AutoModelWithLMHead.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
summarizer = pipeline("summarization", tokenizer=tokenizer, model=model)

In [None]:
# Summarize the pending papers under the 't5' key with a 500-token limit.
applyModel('t5', 500)

## Abstract summarizer

In [None]:
# Load the pre-processed papers from Google Drive.
drive.mount('/content/drive/')
try:
  # NOTE: pickle can execute arbitrary code on load; only open files you trust.
  with open("/content/drive/MyDrive/[Personal-route]/processed_papers.pkl", "rb") as processed_file:
    processed_papers = pkl.load(processed_file)
finally:
  # Always unmount, even when the file is missing or cannot be unpickled.
  drive.flush_and_unmount()

In [None]:
# Collect element 1 of every processed_papers entry (presumably the paper
# text - verify against the code that builds processed_papers).
content_list = [x[1] for x in processed_papers]
# Bare expression: shows the list as the cell output.
content_list

In [None]:
# Maximum number of tokens sent to the summarizer in a single call.
DISTILBERT_MAX = 1000
# Build the default summarization pipeline and reuse the tokenizer that ships
# with it, so token counts and decoded chunks match the model's own
# vocabulary (the DistilBERT tokenizer uses a different, lower-casing one).
summarizer = pipeline("summarization")
tokenizer = summarizer.tokenizer

def summarize(text, min_length=75, max_length=300):
  """Summarize a single text or a list of texts.

  Python keeps only the last definition of a name, so string and list inputs
  are handled by this one function instead of two same-named definitions.

  A string is tokenized with the global ``tokenizer``. If it has at most
  ``DISTILBERT_MAX`` tokens it is summarized in one call to the global
  ``summarizer``; otherwise it is split into windows of ``DISTILBERT_MAX``
  tokens, each window is decoded and summarized, and the partial summaries
  are joined with ``join_summaries``. When the joined text has more than
  ``max_length`` whitespace-separated words it is summarized again.

  Args:
    text: a string, or a list/tuple of strings.
    min_length: forwarded to the summarizer.
    max_length: forwarded to the summarizer; also the word budget that
      triggers a second summarization pass.

  Returns:
    The summary string for string input, or a list with one summary per item
    for list/tuple input.
  """
  if isinstance(text, (list, tuple)):
    return [summarize(item, min_length, max_length) for item in text]

  encoded = tokenizer(text)['input_ids']
  print(len(encoded))

  if len(encoded) <= DISTILBERT_MAX:
    summary = summarizer(text, min_length=min_length, max_length=max_length)[0]['summary_text']
    print(f'Your summary is {summary}')
    return summary

  # The number of tokens can't be handled by the model: summarize it in windows.
  aux_summary = []
  iterations = math.ceil(len(encoded) / DISTILBERT_MAX)
  window_starts = range(0, len(encoded), DISTILBERT_MAX)
  for counter, start in enumerate(window_starts, start=1):
    mysumm = summarizer(tokenizer.decode(encoded[start:start + DISTILBERT_MAX]), min_length=min_length, max_length=max_length)
    aux_summary.append(mysumm)
    print(f'iteration {counter}/{iterations} {mysumm}')

  # Join the summaries of each division
  resul = join_summaries(aux_summary)
  print(resul)
  # If the joined summary is still longer than allowed, summarize it again
  # and hand that result back to the caller.
  if len(resul.split()) > max_length:
    print('Too long, compressing')
    return summarize(resul, min_length, max_length)
  print(f'Your summary is {resul}')
  return resul

#Join all summaries processed from the text
def join_summaries(aux_summary):
  """Return all partial summaries glued into a single string.

  Reads ``entry[0]['summary_text']`` from each entry of ``aux_summary`` and
  concatenates the texts in order, without inserting any separator.
  """
  pieces = [entry[0]['summary_text'] for entry in aux_summary]
  return ''.join(pieces)

In [None]:
# Summarize a single sample text.
# NOTE(review): `example` is not defined in the visible cells - confirm it is
# created elsewhere before this cell runs.
summarize(example)

In [None]:
# Summarize every text in content_list. The return values are not stored;
# only what summarize() prints is visible.
for i in content_list:
  summarize(i)

In [None]:
#Example of use
text_to_summarize = "x"
summary = summarize(text_to_summarize)

#Example of list use
texts_to_summarize = ["x", "y", "z"]
summaries = summarize(texts_to_summarize)