In [24]:
!pip install nltk transformers



In [25]:
import nltk
import torch
from nltk.corpus import stopwords
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, pipeline

In [26]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
def split_text_into_sentences(text):
    return nltk.sent_tokenize(text)

In [28]:
def generate_abstractive_summary(text, model, tokenizer, min_length=30, max_length=150):

    tokens = tokenizer(text, truncation=True, padding="longest", return_tensors="pt")

    encoded_summary = model.generate(**tokens)

    decoded_summary = tokenizer.decode(encoded_summary[0], skip_special_tokens=True)

    return decoded_summary

In [29]:
def generate_extractive_summary(text, num_sentences=3):
    sentences = split_text_into_sentences(text)

    num_sentences = min(num_sentences, len(sentences))

    summary = ' '.join(sentences[:num_sentences])

    return summary

In [30]:
def compare_summaries(abstractive_summary, extractive_summary, pipeline_summary):
    print("\nSummary Comparison:")
    print(f"Abstractive Summary: {abstractive_summary}")
    print(f"Extractive Summary: {extractive_summary}")
    print(f"Pipeline Summary: {pipeline_summary}")

In [31]:
def generate_pipeline_summary(text, summarizer, min_length=30, max_length=150):
    pipeline_summary = summarizer(text, min_length=min_length, max_length=max_length)
    return pipeline_summary[0]["summary_text"]

In [32]:
model_name = "google/pegasus-xsum"
pegasus_tokenizer = PegasusTokenizer.from_pretrained(model_name)
pegasus_model = PegasusForConditionalGeneration.from_pretrained(model_name)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
summarizer = pipeline(
  "summarization",
  model=model_name,
  tokenizer=pegasus_tokenizer,
  framework="pt"
)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
def preprocess_text(text):
    text = text.lower()
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    return words

In [35]:
example_text = """Deep learning (also known as deep structured learning) ispart of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning,recurrent neural networks and convolutional neural networks have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance. Artificial neural networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, neural networks tend to be static and symbolic, while the biological brain of most living organisms is dynamic (plastic) and analogue. The adjective "deep" in deep learning refers to the use of multiple layers in the network. Early work showed that a linear perceptron cannot be a universal classifier, but that a network with a nonpolynomial activation function with one hidden layer of unbounded width can. Deep learning is a modern variation which is concerned with an unbounded number of layers of bounded size, which permits practical application and optimized implementation, while retaining theoretical universality under mild conditions. In deep learning the layers are also permitted to be heterogeneous and to deviate widely from biologically informed connectionist models, for the sake of efficiency, trainability and understandability, whence the structured part."""

In [36]:
preprocessed_text = preprocess_text(example_text)
print(preprocessed_text)

['deep', 'learning', '(', 'also', 'known', 'deep', 'structured', 'learning', ')', 'ispart', 'broader', 'family', 'machine', 'learning', 'methods', 'based', 'artificial', 'neural', 'networks', 'representation', 'learning', '.', 'learning', 'supervised', ',', 'semi-supervised', 'unsupervised', '.', 'deep-learning', 'architectures', 'deep', 'neural', 'networks', ',', 'deep', 'belief', 'networks', ',', 'deep', 'reinforcement', 'learning', ',', 'recurrent', 'neural', 'networks', 'convolutional', 'neural', 'networks', 'applied', 'fields', 'including', 'computer', 'vision', ',', 'speech', 'recognition', ',', 'natural', 'language', 'processing', ',', 'machine', 'translation', ',', 'bioinformatics', ',', 'drug', 'design', ',', 'medical', 'image', 'analysis', ',', 'material', 'inspection', 'board', 'game', 'programs', ',', 'produced', 'results', 'comparable', 'cases', 'surpassing', 'human', 'expert', 'performance', '.', 'artificial', 'neural', 'networks', '(', 'anns', ')', 'inspired', 'informati

In [37]:
abstractive_summary = generate_abstractive_summary(' '.join(preprocessed_text), pegasus_model, pegasus_tokenizer)
pipeline_summary = generate_pipeline_summary(' '.join(preprocessed_text), summarizer)

In [38]:
print("Abstractive Summary:")
print(abstractive_summary)
print("\nExtractive Summary:")
print(generate_extractive_summary(' '.join(preprocessed_text)))
print("\nPipeline Summary:")
print(pipeline_summary)

Abstractive Summary:
Deep learning is a branch of computer science that deals with the study of machine learning.

Extractive Summary:
deep learning ( also known deep structured learning ) ispart broader family machine learning methods based artificial neural networks representation learning . learning supervised , semi-supervised unsupervised . deep-learning architectures deep neural networks , deep belief networks , deep reinforcement learning , recurrent neural networks convolutional neural networks applied fields including computer vision , speech recognition , natural language processing , machine translation , bioinformatics , drug design , medical image analysis , material inspection board game programs , produced results comparable cases surpassing human expert performance .

Pipeline Summary:
Deep learning is a branch of computer science that deals with the study of natural language processing , machine translation , drug design and medical image analysis.


In [39]:
compare_summaries(abstractive_summary, generate_extractive_summary(' '.join(preprocessed_text)), pipeline_summary)


Summary Comparison:
Abstractive Summary: Deep learning is a branch of computer science that deals with the study of machine learning.
Extractive Summary: deep learning ( also known deep structured learning ) ispart broader family machine learning methods based artificial neural networks representation learning . learning supervised , semi-supervised unsupervised . deep-learning architectures deep neural networks , deep belief networks , deep reinforcement learning , recurrent neural networks convolutional neural networks applied fields including computer vision , speech recognition , natural language processing , machine translation , bioinformatics , drug design , medical image analysis , material inspection board game programs , produced results comparable cases surpassing human expert performance .
Pipeline Summary: Deep learning is a branch of computer science that deals with the study of natural language processing , machine translation , drug design and medical image analysis.
