In [1]:
pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.1


In [3]:
pip install transformers



In [2]:
pip install sentence-transformers



In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
#define a Summarazation Function with Text_Rank method

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

def TextRank_summarization(input_text, num_sentences):
    doc = nlp(input_text)
    sentences = [sent.text.strip() for sent in doc.sents]  # Split sentences

    # Vectorize sentences
    vectorizer = CountVectorizer().fit_transform(sentences)
    vectors = vectorizer.toarray()

    # Compute cosine similarity
    cosine_matrix = cosine_similarity(vectors)

    # Build a graph based on cosine similarity
    graph = nx.from_numpy_array(cosine_matrix)
    scores = nx.pagerank(graph)

    # Rank sentences based on score and return top ones
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    top_sentences = [ranked_sentences[i][1] for i in range(min(num_sentences, len(ranked_sentences)))]

    return ' '.join(top_sentences)


In [6]:
#define a function for summarization based on words importance and frequency
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from collections import Counter
import pandas as pd

stopwords=list(STOP_WORDS)
allowed_pos=['ADJ','PRON','VERB','NOUN']

def word_importance_summarazation(input_text,num_sentences):

  doc = nlp(input_text)
  sent_token=[sent.text for sent in doc.sents] #split sentences


  #Tokenize
  tokens=[]
  for token in doc:
    if token.text not in stopwords and token.text not in punctuation and token.pos_ in allowed_pos:
      tokens.append(token.text)

  #Normolize distribution of words in sentences
  word_freq=Counter(tokens)
  for word in word_freq.keys():
    word_freq[word]=word_freq[word]/max(word_freq.values())

  #Calculate score of each sentences based on their words
  sent_score={}
  for sent in sent_token:
    for word in sent.split():
      if word.lower() in word_freq.keys():
        if sent not in sent_score.keys():
          sent_score[sent]=word_freq[word]
        else:
          sent_score[sent]+=word_freq[word]

  # buid a dataframe and sort based on the score
  df=pd.DataFrame(list(sent_score.items()),columns=['Sentences','Score'])
  df.sort_values(by='Score',ascending=False , inplace=True)

  #return Top sentences
  top_sentences=df.nlargest(num_sentences,'Score')['Sentences'].tolist()
  return ' '.join(top_sentences)


In [7]:
#define a function based on SBERTExt (SentenceTransformation)

from sentence_transformers import SentenceTransformer
import numpy as np
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def sbert_ext_summarization(text, num_sentences):
  doc = nlp(text)
  sentences=[sent.text for sent in doc.sents] #split sentences
  embeddings = model.encode(sentences)

  # Compute cosine similarity matrix
  similarity_matrix = cosine_similarity(embeddings)

  # Compute centrality scores
  sentence_scores = similarity_matrix.sum(axis=1)

  # Select top sentences by score
  ranked_indices = np.argsort(-sentence_scores)
  selected_indices = []
  for idx in ranked_indices:
      # Avoid selecting sentences that are too similar
      if len(selected_indices) >= num_sentences:
           break
      if all(cosine_similarity([embeddings[idx]], [embeddings[i]])[0][0] < 0.8 for i in selected_indices):
          selected_indices.append(idx)

  # Extract the selected sentences
  selected_indices.sort()
  summary_sentences = [sentences[i] for i in selected_indices]

  return ' '.join(summary_sentences)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the Pegasus model and tokenizer
model_name = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
transformer = PegasusForConditionalGeneration.from_pretrained(model_name)

def pegasus_summarazation(input_text, max_length=150, min_length=50):
    """
    Abstractive summarization using Pegasus with improved parameter tuning.
    """
    inputs = tokenizer(input_text, truncation=True, padding="longest", return_tensors="pt")
    summary_ids = transformer.generate(
        inputs["input_ids"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=2.0,  # Encourage concise summaries
        num_beams=4,        # Increase beam search width
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [9]:
# Extract PDF content
import fitz  # PyMuPDF for PDF processing
from google.colab import drive
drive.mount('/content/drive')
pdf_path = "/content/drive/My Drive/article.pdf"

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with fitz.open(pdf_path) as pdf:
            for page in pdf:
                text += page.get_text("text")
    except Exception as e:
        return f"Error reading PDF: {str(e)}"
    return text if text.strip() else "No text found in the PDF."

Mounted at /content/drive


In [10]:
input_text=extract_text_from_pdf(pdf_path)

In [14]:
#Compare result of Summarazation in different methods

print(f"""The result of Extractive summarization with TextRank method:
{TextRank_summarization(input_text, 3)}

The result of Extractive summarization with Word Importance method:
{word_importance_summarazation(input_text, 3)}

The result of Extractive summarization with SBERTExt method:
{sbert_ext_summarization(input_text, 3)}

The result of Abstractive summarization with Pegasus method:
{pegasus_summarazation(input_text)}""")


The result of Extractive summarization with TextRank method: 
Consequently, we hypothesize that the amount of germane load
(i.e., the type of load related to the processing, construction and auto-
mation of schemas) will mediate the effect of the type of information
search on the quality of justifications presented in the students’ final
conclusions. In an exploratory extension of H2, we tested, whether the difference
in the quality of the arguments and reasoning presented in the students’
final conclusions between students using LLMs and students using
traditional search engines was mediated by the difference in GCL. Discussion
The results of the current study offer several intriguing insights into
the differences in cognitive load and the quality of learning outcomes
between traditional web searches and those conducted using LLMs such
as ChatGPT. 

The result of Extractive summarization with Word Importance method: 
In line with Kammerer et al. (2021) and following previous research
