<a href="https://colab.research.google.com/github/mapcrafter2048/Literature-Review-Generator-ML-17/blob/main/main_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
 data = load_dataset("scillm/scientific_papers-archive")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [1]:
import logging
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, AdamW
from datasets import load_dataset
from tqdm import tqdm

# Set logging level
logging.basicConfig(level=logging.ERROR)

# Constants
DATASET_SIZE = 2400000
SAMPLE_FRACTION = 0.004
TRAIN_TEST_SPLIT = 0.1
MAX_INPUT_LENGTH = 512  # Use 512 for T5
MIN_TARGET_LENGTH = 5
MAX_TARGET_LENGTH = 128
BATCH_SIZE = 8
MAX_EPOCHS = 2
MODEL_CHECKPOINT = "t5-small"

# Load dataset
data = load_dataset("scillm/scientific_papers-archive")

# Downsample the dataset to 0.4%
sample_size = int(DATASET_SIZE * SAMPLE_FRACTION)
data = data.shuffle(seed=42)
sampled_data = data['train'].train_test_split(train_size=sample_size, seed=42)['train']

# Split the dataset into train and test
train_test_data = sampled_data.train_test_split(test_size=TRAIN_TEST_SPLIT, seed=42)
train_data = train_test_data['train']
test_data = train_test_data['test']

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Preprocess function
def preprocess_function(examples):
    inputs = ["summarize: " + doc for doc in examples["input"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["output"], max_length=MAX_TARGET_LENGTH, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply preprocessing
tokenized_train = train_data.map(preprocess_function, batched=True)
tokenized_test = test_data.map(preprocess_function, batched=True)

# Convert to PyTorch Dataset
class CustomDataset(Dataset):
    def __init__(self, data):
        self.input_ids = torch.tensor(data["input_ids"])
        self.attention_mask = torch.tensor(data["attention_mask"])
        self.labels = torch.tensor(data["labels"])

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx]
        }

train_dataset = CustomDataset(tokenized_train)
test_dataset = CustomDataset(tokenized_test)

# Convert to DataLoader
def create_dataloader(dataset, batch_size):
    return DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)

train_dataloader = create_dataloader(train_dataset, BATCH_SIZE)
test_dataloader = create_dataloader(test_dataset, BATCH_SIZE)

# Setup for training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
def train():
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader, desc="Training"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(train_dataloader)

# Evaluation loop
def evaluate():
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_loss += loss.item()
    return total_loss / len(test_dataloader)

# Training and evaluation
for epoch in range(MAX_EPOCHS):
    print(f"Epoch {epoch + 1}/{MAX_EPOCHS}")
    train_loss = train()
    print(f"Training Loss: {train_loss:.4f}")

    eval_loss = evaluate()
    print(f"Evaluation Loss: {eval_loss:.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/960 [00:00<?, ? examples/s]



Epoch 1/2


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
Training: 100%|██████████| 1080/1080 [06:01<00:00,  2.99it/s]


Training Loss: 2.7910


Evaluating: 100%|██████████| 120/120 [00:13<00:00,  8.61it/s]


Evaluation Loss: 2.3611
Epoch 2/2


Training: 100%|██████████| 1080/1080 [06:01<00:00,  2.99it/s]


Training Loss: 2.4513


Evaluating: 100%|██████████| 120/120 [00:13<00:00,  8.64it/s]

Evaluation Loss: 2.2732





In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Constants
MODEL_CHECKPOINT = "t5-small"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 128

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Ensure model is in evaluation mode and on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Generating summaries
def generate_summaries(input_texts):
    # Tokenize the input texts
    inputs = tokenizer(input_texts, return_tensors="pt", max_length=MAX_INPUT_LENGTH, truncation=True, padding="max_length").to(device)

    # Generate summaries
    summaries = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=MAX_TARGET_LENGTH,
        num_beams=4,               # Beam search for better summaries
        early_stopping=True
    )

    # Decode the generated summaries
    return tokenizer.batch_decode(summaries, skip_special_tokens=True)

# Example usage
sample_inputs = [
    "Quantum computing is revolutionizing the field of computation by leveraging quantum mechanics to solve problems beyond the reach of classical computers. This paper offers a comprehensive review of recent advancements in quantum computing, focusing on innovations in quantum hardware such as superconducting and topological qubits, and notable quantum algorithms including Shor’s and Grover’s algorithms that provide significant speedups over classical methods. We also examine progress in error correction techniques like surface codes and cat codes, which are crucial for managing quantum decoherence and scaling quantum systems. Additionally, the paper explores the transformative applications of quantum computing in areas such as cryptography, where it poses both challenges and opportunities, material science with its potential for groundbreaking simulations, and artificial intelligence, where quantum algorithms could enhance machine learning and optimization tasks. Despite these advancements, the field faces ongoing challenges, including the need for scalable systems and practical applications, making future research essential for realizing quantum computing’s full potential.",

]
generated_summaries = generate_summaries(sample_inputs)

# Print results
for text, summary in zip(sample_inputs, generated_summaries):
    print(f"Input: {text}")
    print(f"Summary: {summary}\n")


Input: Quantum computing is revolutionizing the field of computation by leveraging quantum mechanics to solve problems beyond the reach of classical computers. This paper offers a comprehensive review of recent advancements in quantum computing, focusing on innovations in quantum hardware such as superconducting and topological qubits, and notable quantum algorithms including Shor’s and Grover’s algorithms that provide significant speedups over classical methods. We also examine progress in error correction techniques like surface codes and cat codes, which are crucial for managing quantum decoherence and scaling quantum systems. Additionally, the paper explores the transformative applications of quantum computing in areas such as cryptography, where it poses both challenges and opportunities, material science with its potential for groundbreaking simulations, and artificial intelligence, where quantum algorithms could enhance machine learning and optimization tasks. Despite these adva