<a href="https://colab.research.google.com/github/mapcrafter2048/Literature-Review-Generator-ML-17/blob/main/LRG_using_Hugging_Face_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install TensorFlow pinned to the 2.10 release (first install step in this notebook).
!pip install tensorflow==2.10

In [None]:
# Install Hugging Face Transformers from a fresh clone of its GitHub repository.
# No tag or commit is selected, so the installed code is whatever the default
# branch holds at run time.
!git clone https://github.com/huggingface/transformers.git
# NOTE(review): %cd changes the working directory for the rest of the session;
# no cell visible in this notebook switches back.
%cd transformers
!pip install .
# Additional packages, installed without version pins. `keras_nlp` and
# `datasets` are imported by later cells, and `nltk` by the import cell.
!pip install huggingface-hub
!pip install keras_nlp
!pip install datasets
!pip install nltk
!pip install rouge-score

In [None]:
import os
import logging
import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Raise TensorFlow's logger threshold to ERROR.
tf_logger = tf.get_logger()
tf_logger.setLevel(logging.ERROR)

In [None]:
# Fraction of the loaded corpus used for the train split and, separately, for the test split.
TRAIN_TEST_SPLIT = 0.1
# max_length (with truncation) applied when tokenizing the prefixed input documents.
MAX_INPUT_LENGTH = 1024
# min_length passed to the summarization pipeline calls.
MIN_TARGET_LENGTH = 5
# max_length for tokenized reference summaries; also the pipeline's max_length.
MAX_TARGET_LENGTH = 128
# batch_size used by every to_tf_dataset call.
BATCH_SIZE = 8
# Number of epochs passed to model.fit.
MAX_EPOCHS = 2
# Checkpoint name handed to from_pretrained for both the model and the tokenizer.
MODEL_CHECKPOINT = "t5-small"

In [None]:
from datasets import load_dataset

# Fetch only the "train" split of the "xsum" dataset; a later cell carves the
# notebook's own train/test subsets out of this single split.
data = load_dataset(path="xsum", split="train")

In [None]:
# Print the loaded dataset object for a quick look at what was fetched.
print(data)

In [None]:
# Print the first record of the loaded dataset.
print(data[0])

In [None]:
# Fixed seed so the random train/test partition is the same on every run;
# without it the two subsets change each time the notebook is executed.
SPLIT_SEED = 42

# Draw a train subset and a test subset from the loaded corpus, each sized by
# TRAIN_TEST_SPLIT. The result is indexed by "train" and "test" in later cells.
# NOTE(review): `data` is rebound to the split result here, so this cell is
# presumably not re-runnable without first re-running the load cell.
data = data.train_test_split(
    train_size=TRAIN_TEST_SPLIT, test_size=TRAIN_TEST_SPLIT, seed=SPLIT_SEED
)

In [None]:
# Upgrade the tf_keras package to the newest release pip can find.
# NOTE(review): this runs after tensorflow was pinned to 2.10 in the first
# cell; confirm the upgrade does not replace that pinned version.
!pip install --upgrade tf_keras

In [None]:
# Display pip's metadata for the installed tensorflow and tf_keras packages,
# so the versions actually in use can be checked after the installs above.
!pip show tensorflow
!pip show tf_keras

In [None]:
from transformers import (
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    TFAutoModelForSeq2SeqLM,
)

# Tokenizer and TensorFlow seq2seq model are both loaded from MODEL_CHECKPOINT.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = TFAutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

# Collator used as collate_fn when building the tf.data datasets; it is
# configured to return TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Each document is prefixed with ``"summarize: "`` and tokenized with
    truncation at ``MAX_INPUT_LENGTH``. The summaries are tokenized as targets
    with truncation at ``MAX_TARGET_LENGTH``, and their token ids are attached
    to the result under ``"labels"``.

    Args:
        examples: A batch mapping with ``"document"`` and ``"summary"``
            columns, each a sequence of strings.

    Returns:
        The tokenizer output for the prefixed documents, with an added
        ``"labels"`` entry holding the tokenized summaries' ``input_ids``.
    """
    inputs = ["summarize: " + doc for doc in examples["document"]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    # `text_target` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager for tokenizing targets.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function to the split data in batched mode, so it receives
# columns of examples rather than one record at a time.
tokenized_datasets = data.map(function=preprocess_function, batched=True)

In [None]:
def _to_tf_batches(split, shuffle):
    """Convert one tokenized split into a batched tf.data dataset.

    Uses the notebook-wide BATCH_SIZE and data_collator, and keeps only the
    input_ids, attention_mask and labels columns.
    """
    return split.to_tf_dataset(
        batch_size=BATCH_SIZE,
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle,
        collate_fn=data_collator,
    )


# Training batches are shuffled; evaluation batches keep their order.
train_dataset = _to_tf_batches(tokenized_datasets["train"], shuffle=True)
test_dataset = _to_tf_batches(tokenized_datasets["test"], shuffle=False)

# A 200-example subset of the shuffled test split, used for generation-based
# metric evaluation.
generation_subset = tokenized_datasets["test"].shuffle().select(list(range(200)))
generation_dataset = _to_tf_batches(generation_subset, shuffle=False)

In [None]:
# Compile with the optimizer selected by name; no loss argument is passed here.
optimizer = "adam"
model.compile(optimizer=optimizer)

In [None]:
import keras_nlp

# Shared ROUGE-L metric object. Keras metrics keep running state between
# calls, so metric_fn resets it before every evaluation.
rouge_l = keras_nlp.metrics.RougeL()


def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score for one evaluation pass.

    Args:
        eval_predictions: A ``(predictions, labels)`` pair of token-id arrays.
            Negative label ids are overwritten in place with the tokenizer's
            pad token id before decoding (presumably the ignore-index used
            for padded label positions -- confirm against the data collator).

    Returns:
        A dict ``{"RougeL": <f1_score>}`` for this evaluation pass only.
    """
    predictions, labels = eval_predictions

    # Without this reset the metric accumulates across calls, so each epoch's
    # reported score would be averaged with all earlier epochs.
    rouge_l.reset_state()

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Negative ids are not valid vocabulary entries; map them to the pad token
    # so they are dropped by skip_special_tokens during decoding.
    for label in labels:
        label[label < 0] = tokenizer.pad_token_id
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge_l(decoded_labels, decoded_predictions)

    result = {"RougeL": result["f1_score"]}

    return result


In [None]:
from transformers.keras_callbacks import KerasMetricCallback

# Callback that feeds generated predictions on generation_dataset to metric_fn.
metric_callback = KerasMetricCallback(
    metric_fn=metric_fn,
    eval_dataset=generation_dataset,
    predict_with_generate=True,
)
callbacks = [metric_callback]

# Fine-tune for MAX_EPOCHS epochs, validating on test_dataset.
model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=MAX_EPOCHS,
    callbacks=callbacks,
)

In [None]:
from transformers import pipeline

# Wrap the fine-tuned model and its tokenizer in a TensorFlow summarization pipeline.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    framework="tf",
)

# Summarize the first document of the test split, bounded by the target lengths.
first_test_document = data["test"][0]["document"]
summarizer(
    first_test_document,
    min_length=MIN_TARGET_LENGTH,
    max_length=MAX_TARGET_LENGTH,
)

In [None]:
from transformers import pipeline

# Build the TensorFlow summarization pipeline from the current model and tokenizer.
summarizer = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    framework="tf",
)

# Hand-written sample passage to summarize.
article = "The idea of India and China being close friends isn't new. Historically, they've acted like quarrelsome neighbours who fight and make up repeatedly. They share a long history of cultural and economic exchange dating back over two millennia. The Silk Road facilitated trade, and Buddhism, which started in India, found a significant following in China. These ancient ties laid a foundation of mutual respect and cultural affinity. During the colonial era, both countries faced subjugation by Western powers, fostering a sense of shared struggle "

# Length bounds shared with the preprocessing configuration.
summary_length = {"min_length": MIN_TARGET_LENGTH, "max_length": MAX_TARGET_LENGTH}
summarizer(article, **summary_length)

In [None]:
from google.colab import drive

# Mount Google Drive at the path the save step below writes under.
drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

In [None]:
# This class is not referenced in this cell; the import only makes the name available.
from transformers import TFT5ForConditionalGeneration

# Persist `model` with save_pretrained into a folder on the mounted Drive.
save_dir = "/content/drive/MyDrive/Colab Notebooks/summarisation_HF"
model.save_pretrained(save_dir)