**Install Modules**

In [None]:
%pip install Transformers==4.16

In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [5]:
%pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m1.0/1.3 MB[0m [31m30.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [4]:
# Initialize the pretrained T5 model and its tokenizer.
# The device is chosen first and the model is moved onto it explicitly, so the
# model and the input tensors (which are also sent to `device`) always live on
# the same device, even if `device` is later changed from CPU.
device = torch.device('cpu')
model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.27k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [None]:
# Sample passage to summarize. The leading and trailing newlines are kept so
# the value is the same as a triple-quoted block with the text on its own line.
text = (
    "\n"
    "Data science combines math and statistics, specialized programming, advanced analytics, artificial intelligence (AI), and machine learning with specific subject matter expertise to uncover actionable insights hidden in an organization’s data. These insights can be used to guide decision making and strategic planning."
    "\n"
)

In [None]:
# Preprocess the input text.
# Collapse every run of whitespace (including newlines) into a single space.
# Deleting newlines outright would glue the last word of one line to the first
# word of the next whenever the passage spans several lines.
preprocessed_text = ' '.join(text.split())
# Prepend the 'summarize: ' task prefix to the cleaned passage.
t5_input_text = 'summarize: ' + preprocessed_text

In [None]:
t5_input_text

'summarize: Data science combines math and statistics, specialized programming, advanced analytics, artificial intelligence (AI), and machine learning with specific subject matter expertise to uncover actionable insights hidden in an organization’s data. These insights can be used to guide decision making and strategic planning.'

In [None]:
len(t5_input_text.split())

43

In [None]:
# Encode the prefixed prompt as a PyTorch tensor, truncating at 512 tokens,
# then place the tensor on the configured device.
tokenized_text = tokenizer.encode(
    t5_input_text,
    truncation=True,
    max_length=512,
    return_tensors='pt',
)
tokenized_text = tokenized_text.to(device)

## Summary

In [None]:
# Generate between 20 and 120 tokens for the encoded prompt.
summary_ids = model.generate(
    tokenized_text,
    max_length=120,
    min_length=20,
)
# Only one prompt was encoded, so decode the first (and only) sequence.
summary = tokenizer.decode(
    summary_ids[0],
    skip_special_tokens=True,
)

In [None]:
summary

'data science combines math and statistics, specialized programming, advanced analytics, artificial intelligence (AI) and machine learning with specific subject matter expertise. these insights can be used to guide decision making and strategic planning.'

In [None]:
len(summary.split())

33

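#### Another example with SciBERT (not good at all)

SciBERT is loaded here with a masked-language-modelling head rather than as a sequence-to-sequence model, so the "summary" below is essentially the input text echoed back.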

In [7]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# SciBERT is a BERT-based model from AllenAI, pretrained on a corpus of
# scientific papers (about 1.14M papers from Semantic Scholar).
#
# NOTE(review): the checkpoint is loaded with a masked-LM head
# (AutoModelForMaskedLM), not a seq2seq head; this looks like the reason the
# generated "summary" further down mostly repeats its input. Confirm before
# relying on this model for summarization.

# Load the SciBERT tokenizer and model from the same checkpoint.
model_name = "allenai/scibert_scivocab_uncased"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForMaskedLM.from_pretrained(model_name),
)

Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def generate_summary(text, max_input_length=512, max_summary_length=300, num_beams=4):
    """Summarize ``text`` with the notebook-level ``model`` and ``tokenizer``.

    Args:
        text: The document to summarize, as a string.
        max_input_length: Token limit passed to the tokenizer; longer input
            is truncated.
        max_summary_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: Beam-search width forwarded to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped, or an empty string when ``text`` is empty or only whitespace.
    """
    # Nothing to summarize: skip the model call entirely.
    if not text or not text.strip():
        return ""

    # Tokenize the input text (truncated to the configured length).
    input_ids = tokenizer.encode(
        text,
        truncation=True,
        max_length=max_input_length,
        return_tensors="pt",
    )

    # Generate the summary with beam search.
    summary_ids = model.generate(
        input_ids,
        num_beams=num_beams,
        max_length=max_summary_length,
        early_stopping=True,
    )

    # Decode the first (only) generated sequence.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Example usage: run generate_summary on a sample passage about the history
# of neural language models. The passage is kept verbatim (including its
# citation markers) because it is the input fed to the model.
input_text = """
in 2003, Yoshua Bengio with co-authors tried to use a multi-layer perceptron with a single hidden layer and context length of several words trained on up to 14 million of words with a CPU cluster in language modelling and overperformed the best of n-gram models (a typical statistical alogrithm) then available.[7] In 2010, Tomáš Mikolov (then a PhD student at Brno University of Technology) with co-authors applied a simple recurrent neural network with a single hidden layer to language modelling,[8] and in the following years he went on to develop Word2vec.In the 2010s, representation learning and deep neural network-style (featuring many hidden layers) machine learning methods became widespread in natural language processing. That popularity was due partly to a flurry of results showing that such techniques[9][10] can achieve state-of-the-art results in many natural language tasks, e.g., in language modeling[11] and parsing.[12][13] This is increasingly important in medicine and healthcare, where NLP helps analyze notes and text in electronic health records that would otherwise be inaccessible for study when seeking to improve care[14] or protect patient privacy.[15]
"""
# Uses whichever `model` / `tokenizer` are currently bound at notebook level.
summary = generate_summary(input_text)

In [14]:
summary

'in 2003, yoshua bengio with co - authors tried to use a multi - layer perceptron with a single hidden layer and context length of several words trained on up to 14 million of words with a cpu cluster in language modelling and overperformed the best of n - gram models ( a typical statistical alogrithm ) then available. [ 7 ] in 2010, tomas mikolov ( then a phd student at brno university of technology ) with co - authors applied a simple recurrent neural network with a single hidden layer to language modelling, [ 8 ] and in the following years he went on to develop word2vec. in the 2010s, representation learning and deep neural network - style ( featuring many hidden layers ) machine learning methods became widespread in natural language processing. that popularity was due partly to a flurry of results showing that such techniques [ 9 ] [ 10 ] can achieve state - of - the - art results in many natural language tasks, e. g., in language modeling [ 11 ] and parsing. [ 12 ] [ 13 ] this is 

#### Google Pegasus

In [None]:
# Sample blog passage for the Pegasus example. The passage contains a line
# break, so it is written as a triple-quoted literal: an ordinary
# double-quoted string cannot span two source lines.
text_example = """Data-moshing was a term I became aware of later. Initially it was just an aesthetic tool at my disposable. I didn’t consider opening a jpeg in a text or hex editor to be in anyway philosophical, non the less political. At no point did I find myself reflecting on the dimensional implications of turning a png or bmp file into raw data and importing it into audacity. Glitch aesthetic was a technique oriented interaction. Digital material abstracted for the sake of form. Form over content or meaning. Changing bytes or misusing applications never meant a resistance, nor was it an attempt to fracture a simulation. Similar to the Romantic Era of adding chromaticism and extended mixed modal progressions into a composer’s harmonic vocabulary and complexity of expression, glitch was just a method of interacting with and extending a medium. Did Clare de Lune have a social statement about the nature of Power embedded in the use of chromatic motion in both the harmonic outline and melody? Was there a critique on hyperreality inferred somewhere in a transition or dynamic shift? To me, glitch was just another dialect of a visual language used both independently and commercially. I associated no deeper meaning than an aesthetic result from an agnostic process.  Glitch Art has been a hobby of mine for sometime. Before I was interested in data bending raw data and file formats, I was experimenting with circuit bending. Often experimenting to make something from a broken keyboard of an electronic toy with the idea of sampling the textures. When I came across the terms: “Post-Internet” , “Post-Data” , “Post-Digital” — I was intrigued to find a whole community of people who glitched files and rerouted circuit boards. It didn’t surprise me to recognize a strong Post-Modern , Post-Structuralist ideological origin at the base of the Post-Internet mission.  
In this blog I take a look at some main figures in the Post-Internet glitch movement and the overall messaging of the aesthetic. And ask: What is the point? How does it help? What is its impact?  Post-Internet simply means since the internet has been around. With art it more specifically refers to Internet Art that was made of or about the effects internet has on society and culture in the early to mid 2000's."""

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint, so the model and tokenizer can
# never be loaded from two different checkpoints by accident.
# To try the other checkpoint this cell was written for, set this to
# 'google/pegasus-xsum'.
PEGASUS_CHECKPOINT = 'google/pegasus-cnn_dailymail'

# NOTE: these rebind the notebook-level `model` and `tokenizer` names.
model = AutoModelForSeq2SeqLM.from_pretrained(PEGASUS_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(PEGASUS_CHECKPOINT)

In [None]:
# Encode the raw passage, truncated to 512 tokens. No "summarize: " prefix is
# added: that task prefix is a T5 convention, and a Pegasus checkpoint would
# just treat it as part of the document being summarized.
tokens_input = tokenizer.encode(text_example, return_tensors='pt', max_length=512, truncation=True)
# Generate a summary of 50 to 100 tokens.
ids = model.generate(tokens_input, min_length=50, max_length=100)
# Only one passage was encoded, so decode the first (and only) sequence.
summary = tokenizer.decode(ids[0], skip_special_tokens=True)

In [None]:
len(text_example)

In [None]:
len(summary)

In [None]:
summary

In [None]:
from datasets import load_dataset, load_metric

In [None]:
# Load the ROUGE metric and score the generated summary in a single call.
# NOTE(review): the reference passed here is the full source passage, not a
# human-written summary -- confirm that this is the intended comparison.
rouge_metric = load_metric('rouge')
score = rouge_metric.compute(
    predictions=[summary],
    references=[text_example],
)

In [None]:
# ROUGE variants to report.
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
# Show the mid (aggregate) F-measure for every listed variant instead of only
# the first one, so that all four names in the list are actually used.
{name: score[name].mid.fmeasure for name in rouge_names}