# Load the model

In [None]:
!pip install transformers datasets torch sentencepiece evaluate




In [None]:
from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, T5Tokenizer
import re

# Load the "test" split of the scillm/scientific_papers-archive dataset.
# Each record carries `id`, `input` (the article) and `output` (the summary).
ds = load_dataset("scillm/scientific_papers-archive", split="test")

# Keep only the first 1000 examples so tokenization and the experiments
# below stay fast.
small_ds = ds.select(range(1000))

# Preprocessing function to remove unwanted references
def preprocess_text(text):
    """Strip ``@``-placeholders from ``text`` and normalise its whitespace.

    Every ``@``-prefixed word (for example ``@xcite`` or ``@xmath0``) is
    deleted, each run of whitespace is then collapsed to a single space, and
    the result is trimmed at both ends.
    """
    # Drop the "@..." placeholder tokens first ...
    without_placeholders = re.sub(r'@\w+', '', text)
    # ... then squeeze the gaps they leave behind.
    squeezed = re.sub(r'\s+', ' ', without_placeholders)
    return squeezed.strip()

# Preprocessing function
def preprocess(examples):
    """Tokenize a batch of examples for seq2seq fine-tuning.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``. The
            ``"input"`` and ``"output"`` columns are read as lists of strings.

    Returns:
        The tokenizer output for the prefixed articles (padded/truncated to
        1024 tokens) with an added ``"labels"`` entry: the summary token ids
        padded/truncated to 128 tokens, with padding positions set to -100.
    """
    # Clean articles and reference summaries the same way.
    articles = [preprocess_text(article) for article in examples["input"]]
    outputs = [preprocess_text(output) for output in examples["output"]]

    # Add the task prefix to the articles.
    inputs = ["summarize: " + article for article in articles]

    # Tokenize articles and summaries to fixed lengths.
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=128, truncation=True, padding="max_length")

    # Replace padding ids in the labels with -100, the index the loss ignores.
    # Without this the model is also trained (and scored) on predicting the
    # padding that fills most of each 128-token label row.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]

    return model_inputs

# Load the pretrained mT5-small checkpoint and its tokenizer. `preprocess`
# reads the notebook-level `tokenizer`, so it must exist before `map` runs.
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Tokenize the 1000-example subset in batches; the result keeps the original
# columns and adds the tokenizer outputs plus `labels`.
tokenized_small_ds = small_ds.map(preprocess, batched=True)

# Sanity check: print the first tokenized example.
print(tokenized_small_ds[0])




{'id': 0, 'input': 'Suppose that you have an abstract for a scientific paper:    the short - term periodicities of the daily sunspot area fluctuations from august 1923 to october 1933 are discussed . for these data \n the correlative analysis indicates negative correlation for the periodicity of about @xmath0 days , but the power spectrum analysis indicates a statistically significant peak in this time interval . \n a new method of the diagnosis of an echo - effect in spectrum is proposed and it is stated that the 155-day periodicity is a harmonic of the periodicities from the interval of @xmath1 $ ] days .    the autocorrelation functions for the daily sunspot area fluctuations and for the fluctuations of the one rotation time interval in the northern hemisphere , separately for the whole solar cycle 16 and for the maximum activity period of this cycle do not show differences , especially in the interval of @xmath2 $ ] days . \n it proves against the thesis of the existence of strong 

In [None]:
# Split the data into train (80%) and test (20%) sets.
# NOTE(review): this splits the full `ds`, not the 1000-example subset, and
# rebinds `small_ds` from a single dataset to a train/test mapping. Re-running
# the earlier `small_ds.map(preprocess, ...)` cell after this one would
# therefore tokenize both full splits — confirm this is intended.
small_ds = ds.train_test_split(test_size=0.2)

In [None]:
# Display the first training record: its id, the input article and the
# reference summary.
small_ds["train"][0]

{'id': 6556,
 'input': 'Provide a shorter version of the following research that reflects its organization into sections "Introduction\nMaterials and Methods\nAnalysis\nResults\nDiscussion and Conclusion".\nResearch: reproductive tract infections ( rtis ) , including both sexually transmitted infections ( stis ) and non - sexually transmitted infections ( non - stis ) of the reproductive tract are responsible for major ill - health throughout the world.(1 ) world health organization estimates that each year there are over 340 million new cases of sexually transmitted infections in which 7585% occur in developing countries . in india\nalone , 40 million new cases emerge each year.(2 ) a majority of women continue to suffer from rtis leading to complications like pelvic inflammatory disease ( pid ) , infertility , cervical cancer , postabortal , and puerperal sepsis , chronic pelvic pain , and ectopic pregnancy .\nrtis in many cases are asymptomatic among women , making their detection a

In [None]:
# Print the schema (column names and dtypes) of the training split.
print(small_ds['train'].features)

{'id': Value(dtype='int64', id=None), 'input': Value(dtype='string', id=None), 'output': Value(dtype='string', id=None)}


In [None]:
# List the columns available in each split.
print(small_ds.column_names)

{'train': ['id', 'input', 'output'], 'test': ['id', 'input', 'output']}


In [None]:
from transformers import T5Tokenizer

# Re-create the mT5-small tokenizer. `preprocess` reads this notebook-level
# `tokenizer`. NOTE(review): this repeats the load done in an earlier cell,
# presumably so the cell can be run on its own — confirm it is still needed.
model_name = "google/mt5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
# Apply the preprocessing function, in batches, to every split of `small_ds`
# (at this point the train/test split of the full dataset).
# NOTE(review): `tokenized_ds` is not referenced again in the cells visible
# here; training below uses `tokenized_small_ds` — confirm which was intended.
tokenized_ds = small_ds.map(preprocess, batched=True)

Map:   0%|          | 0/104784 [00:00<?, ? examples/s]

Map:   0%|          | 0/26196 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# `model` must be the model object, not its checkpoint name: the collator uses
# `model.prepare_decoder_input_ids_from_labels` to build the decoder inputs and
# silently skips that step when it is handed a string.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
!pip install wandb

# Import Weights & Biases
import wandb
from transformers import MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer, TrainerCallback
import torch

# Start a Weights & Biases run; the training arguments below report metrics to it.
wandb.init(project="mt5-finetune", name="MT5-Summarization")

# Load a fresh copy of the pretrained mT5-small checkpoint to fine-tune.
model_name = "google/mt5-small"
model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Place the model on the CPU.
# NOTE(review): the device is hard-coded; presumably the Trainer moves the model
# to its own device when it is created — confirm before relying on this line.
device = torch.device("cpu")
model.to(device)

# Replace every non-contiguous parameter tensor with a contiguous copy and
# report which parameters were changed.
# NOTE(review): presumably a workaround for checkpoint saving rejecting
# non-contiguous tensors — TODO confirm.
for name, param in model.named_parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()
        print(f"Made {name} contiguous.")

# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Training schedule.
    num_train_epochs=10,
    per_device_train_batch_size=4,
    # Evaluation: once per epoch, producing text with generate().
    evaluation_strategy='epoch',
    per_device_eval_batch_size=4,
    predict_with_generate=True,
    # Metric logging: every 10 steps, sent to Weights & Biases.
    logging_steps=10,
    report_to="wandb",
)

# Draw the small experiment subsets from ONE seeded shuffle so the train and
# evaluation examples are disjoint and the split is reproducible. Two
# independent shuffles could hand the same example to both sets, and
# range(20, 100) selected 80 evaluation examples instead of the intended 20.
shuffled_small_ds = tokenized_small_ds.shuffle(seed=42)
train_dataset = shuffled_small_ds.select(range(80))  # 80 examples for training
eval_dataset = shuffled_small_ds.select(range(80, 100))  # 20 examples for evaluation

# Create the Trainer with W&B logging enabled (configured in `training_args`).
# The seq2seq data collator built earlier in the notebook is passed in
# explicitly; otherwise it is never used and the Trainer falls back to its
# generic default collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()






Made encoder.block.0.layer.0.SelfAttention.q.weight contiguous.
Made encoder.block.0.layer.0.SelfAttention.k.weight contiguous.
Made encoder.block.0.layer.0.SelfAttention.v.weight contiguous.
Made encoder.block.0.layer.0.SelfAttention.o.weight contiguous.
Made encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight contiguous.
Made encoder.block.0.layer.1.DenseReluDense.wi_0.weight contiguous.
Made encoder.block.0.layer.1.DenseReluDense.wi_1.weight contiguous.
Made encoder.block.0.layer.1.DenseReluDense.wo.weight contiguous.
Made encoder.block.1.layer.0.SelfAttention.q.weight contiguous.
Made encoder.block.1.layer.0.SelfAttention.k.weight contiguous.
Made encoder.block.1.layer.0.SelfAttention.v.weight contiguous.
Made encoder.block.1.layer.0.SelfAttention.o.weight contiguous.
Made encoder.block.1.layer.1.DenseReluDense.wi_0.weight contiguous.
Made encoder.block.1.layer.1.DenseReluDense.wi_1.weight contiguous.
Made encoder.block.1.layer.1.DenseReluDense.wo.weight contiguous



Epoch,Training Loss,Validation Loss
1,26.8881,26.913452
2,25.254,21.377808
3,22.8586,18.79105
4,20.637,17.441799
5,20.091,16.134851
6,17.6631,14.996918
7,19.1514,14.194448
8,17.9157,13.482185
9,15.651,13.264349
10,16.6128,13.193047


TrainOutput(global_step=200, training_loss=20.901301651000978, metrics={'train_runtime': 51.1932, 'train_samples_per_second': 15.627, 'train_steps_per_second': 3.907, 'total_flos': 845999505408000.0, 'train_loss': 20.901301651000978, 'epoch': 10.0})

In [None]:
!pip install rouge_score
import evaluate

# Load the ROUGE metric from the `evaluate` library; `compute_metrics` below
# uses it to score generated summaries against the references.
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE-1/2/L between generated summaries and the references.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays as handed
            over by the trainer. ``predictions`` may also be a tuple whose
            first element holds the generated ids.

    Returns:
        dict with the ``rouge1``, ``rouge2`` and ``rougeL`` scores.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be swapped for the pad id before decoding — in the predictions as
    # well as the labels. np.where builds new arrays, leaving the arrays owned
    # by the trainer unmodified.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Decode predictions and labels (remove special tokens)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute ROUGE scores using the `evaluate` library
    rouge_output = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "rouge1": rouge_output["rouge1"],
        "rouge2": rouge_output["rouge2"],
        "rougeL": rouge_output["rougeL"],
    }



In [None]:
# Attach the custom ROUGE metrics to the existing trainer so that the next
# evaluation reports them alongside the loss.
trainer.compute_metrics = compute_metrics

# Evaluate the model on the evaluation subset and print the metric dict.
eval_result = trainer.evaluate()
print(eval_result)



{'eval_loss': 13.193046569824219, 'eval_rouge1': 0.00961851830338215, 'eval_rouge2': 0.0007523018189984281, 'eval_rougeL': 0.00916387749415247, 'eval_runtime': 7.4179, 'eval_samples_per_second': 10.785, 'eval_steps_per_second': 2.696, 'epoch': 10.0}


In [None]:
# Save the fine-tuned model weights and the tokenizer files into the same
# "fine-tuned-mt5" directory so both can be reloaded from one path.
trainer.save_model("fine-tuned-mt5")
tokenizer.save_pretrained("fine-tuned-mt5")

('fine-tuned-mt5/tokenizer_config.json',
 'fine-tuned-mt5/special_tokens_map.json',
 'fine-tuned-mt5/spiece.model',
 'fine-tuned-mt5/added_tokens.json')

In [None]:
from transformers import T5Tokenizer, MT5ForConditionalGeneration

# Load the fine-tuned tokenizer and model back from the local
# "fine-tuned-mt5" directory. Note that `model_name` is rebound here from the
# Hub checkpoint name to this local path.
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)


In [None]:
from transformers import pipeline
import torch


# Hand-written test document: an instruction followed by five numbered facts.
text = (
    "Summarize the following information regarding psoriasis, its effects on skin health, and its potential health risks:\n\n"
    "1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.\n"
    "2. Immune system dysfunction causes inflammatory cells to accumulate in the dermis, the middle layer of the skin.\n"
    "3. The condition accelerates skin cell growth, with skin cells shedding more quickly than usual.\n"
    "4. This abnormal shedding results in uncomfortable symptoms like raised plaques, scales, and redness.\n"
    "5. Psoriasis not only affects the skin but also increases the risk of serious health issues, including heart disease, cancer, and inflammatory bowel disease.\n\n"
    "Please provide a summary."
)


# Pick the pipeline device index: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Load the summarization pipeline around the fine-tuned model and tokenizer.
summarizer = pipeline("summarization", model=new_model, tokenizer=new_tokenizer, device=device)

# Summarize the text with deterministic beam search (5 beams, no sampling),
# a repetition penalty and a ban on repeated bigrams; keep only the text of
# the first result.
summary = summarizer(text,
                     max_length=120,
                     min_length=30,
                     do_sample=False,
                     num_beams=5,
                     repetition_penalty=5.0,
                     no_repeat_ngram_size=2,
                     length_penalty=1.0)[0]["summary_text"]
# Clean the summary: replace any leftover <extra_id_N> / <id_N> marker with a
# space, then trim the ends.
import re
pattern = r"<(extra_id_\d+|id_\d+)>"
cleaned_summary = re.sub(pattern, " ", summary).strip()

print(cleaned_summary)


of psoriasis, and its potential health risks: 1. Psoriasis is an autoimmune condition that leads to inflammation in the skin.


In [None]:

!pip install gradio PyMuPDF

import gradio as gr
from transformers import T5Tokenizer, MT5ForConditionalGeneration
import fitz  # PyMuPDF

# Load the fine-tuned tokenizer and model from the local "fine-tuned-mt5"
# directory; `summarize_pdf` below reads these two notebook-level names.
model_name = "fine-tuned-mt5"
new_tokenizer = T5Tokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
new_model = MT5ForConditionalGeneration.from_pretrained(model_name)

# Function to extract text from PDF using PyMuPDF
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated in page order.

    ``pdf_file`` is handed straight to ``fitz.open`` (PyMuPDF); the document
    is closed again when the ``with`` block exits.
    """
    with fitz.open(pdf_file) as document:
        # Collect each page's text, then join once instead of growing a string.
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

# Summarization function
def summarize_pdf(pdf_file, max_summary_length):
    """Summarize an uploaded PDF with the fine-tuned mT5 model.

    Args:
        pdf_file: The uploaded PDF as delivered by the Gradio ``File`` input,
            or ``None`` when nothing has been uploaded.
        max_summary_length: Upper bound, in tokens, for the generated summary
            (the slider value; converted to ``int`` before use).

    Returns:
        The cleaned summary text; a short notice when there is no file, no
        extractable text or no usable summary; or the error message if
        extraction or generation raises.
    """
    if pdf_file is None:
        return "Please upload a PDF file."

    try:
        # Extract text from the PDF (inside the try so that unreadable files
        # are reported the same way as generation errors).
        input_text = extract_text_from_pdf(pdf_file)
        if not input_text.strip():
            return "No extractable text found in the PDF."

        # Mirror the training-time input format: the "summarize: " task prefix
        # and truncation to 1024 tokens. Without truncation a whole PDF can
        # far exceed the length the model was fine-tuned on.
        tokenized_input = new_tokenizer.encode(
            "summarize: " + input_text,
            return_tensors='pt',
            max_length=1024,
            truncation=True,
        )

        # Generate the summary
        summary_ids = new_model.generate(
            tokenized_input,
            max_length=int(max_summary_length),  # slider values may arrive as floats
            min_length=30,
            num_beams=15,
            repetition_penalty=5.0,
            no_repeat_ngram_size=2
        )

        # Decode the generated summary
        summary = new_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Clean up the summary to remove unwanted sentinel tokens
        cleaned_summary = ' '.join(
            token for token in summary.split() if not token.startswith('<extra_id_')
        ).strip()

        # Ensure the summary ends with a complete sentence: when text follows
        # the last period, cut that trailing fragment off.
        last_period_index = cleaned_summary.rfind('.')
        if last_period_index != -1 and last_period_index < len(cleaned_summary) - 1:
            cleaned_summary = cleaned_summary[:last_period_index + 1]

        return cleaned_summary if cleaned_summary else "No valid summary generated."

    except Exception as e:
        return str(e)  # Return the error message for debugging

# Define the Gradio interface: a PDF upload and a summary-length slider
# (50 to 300 in steps of 10) feed `summarize_pdf`; its return value is shown
# in a textbox.
interface = gr.Interface(
    fn=summarize_pdf,
    inputs=[
        gr.File(label="Upload PDF"),
        gr.Slider(50, 300, step=10, label="Max summary length")
    ],
    outputs="textbox",  # A textbox for the output summary
    title="PDF Text Summarizer",
    description="Upload a PDF file to summarize its content."
)


# Launch the interface with debug mode enabled; per the recorded output this
# keeps the cell running so errors and logs stay visible.
interface.launch(debug=True)


Collecting gradio
  Downloading gradio-5.3.0-py3-none-any.whl.metadata (15 kB)
Collecting PyMuPDF
  Downloading PyMuPDF-1.24.12-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.3-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.1-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://b680b19022ea163620.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
