# Title Generation for arXiv dataset

In [None]:
# Installing libraries in Google Colab:

In [54]:
pip install transformers datasets rouge_score

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


In [None]:
# STEPS

# Load dataset
# Preprocessing
# fine-tuning

# source: https://github.com/huggingface/notebooks/blob/master/examples/summarization-tf.ipynb

### loading the dataset

In [9]:
from datasets import load_metric, load_dataset

In [10]:
model_checkpoint = "t5-small"

In [30]:
dataset = load_dataset('csv', data_files='./input/sample_df_2021.csv', split='train[:10%]')

Using custom data configuration default-eef37d2e802815aa
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-eef37d2e802815aa/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a)


In [31]:
dataset[0]

{'abstract': "  Impact evaluation in public engagement necessarily requires measuring change.\nHowever, this is extremely challenging for drop-in activities due to their very\nnature. We present a novel method of impact evaluation which integrates\ngraffiti walls into the experience both before and after the main drop-in\nactivity. The activity in question was a soundscape exhibit, where young\nfamilies experienced the usually inaudible sounds of near-Earth space in an\nimmersive and accessible way. We apply two analysis techniques to the captured\nbefore and after data - quantitative linguistics and thematic analysis. These\nanalyses reveal significant changes in participants' responses after the\nactivity compared to before, namely an increased diversity in language used to\ndescribe space and altered conceptions of what space is like. The results\ndemonstrate that the soundscape was surprisingly effective at innately\ncommunicating key aspects of the underlying science simply throug

In [32]:
dataset = dataset.train_test_split()

In [33]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'category', 'abstract', 'general_category'],
        num_rows: 7550
    })
    test: Dataset({
        features: ['id', 'title', 'category', 'abstract', 'general_category'],
        num_rows: 2517
    })
})

### preprocessing

In [34]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [35]:
tokenizer("Hello, this one sentence!")

{'input_ids': [8774, 6, 48, 80, 7142, 55, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [36]:
if model_checkpoint in ["t5-small", "t5-base", "t5-larg", "t5-3b", "t5-11b"]:
    prefix = "summarize: "
else:
    prefix = ""

In [37]:
max_input_length = 1024
max_target_length = 128


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["abstract"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["title"], max_length=max_target_length, truncation=True
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [38]:
preprocess_function(dataset['train'][:2])

{'input_ids': [[21603, 10, 282, 7, 343, 53, 414, 1105, 12, 2862, 5327, 772, 45, 3, 9, 508, 17953, 19, 46, 359, 682, 21, 1249, 18, 12563, 4476, 1357, 492, 5, 304, 1115, 48, 682, 6, 420, 18, 157, 11, 5796, 747, 13154, 43, 118, 5456, 7546, 6, 68, 79, 321, 43, 18340, 3314, 1549, 7, 6, 3, 23, 5, 15, 5, 6, 8, 1139, 893, 65, 12, 370, 3, 9, 806, 6637, 1681, 42, 8519, 186, 772, 5, 37, 3, 157, 18, 60, 3584, 17, 8984, 1707, 11417, 19, 4382, 6, 84, 9162, 7, 8, 7565, 7, 13, 420, 18, 157, 11, 5796, 747, 13154, 5, 6984, 12, 8, 3, 9082, 18, 5651, 655, 13, 8, 682, 6, 8, 3, 157, 18, 60, 3584, 17, 8984, 1707, 11417, 19, 97, 3, 10862, 11, 8, 30337, 63, 4732, 19, 5456, 7546, 5, 611, 6, 4727, 13605, 1693, 13, 8, 30337, 63, 6315, 21, 8, 463, 13, 8, 3666, 772, 19, 341, 16914, 5, 86, 48, 1040, 6, 62, 166, 14, 48, 6813, 57, 13646, 3, 9, 529, 1788, 5907, 40, 13605, 1693, 13, 8, 1120, 12907, 603, 257, 5688, 13, 8, 3666, 772, 5, 304, 1634, 95, 11417, 3026, 6, 3, 9, 17222, 18, 390, 1573, 6, 8272, 75, 10572, 517, 60

In [39]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

## Fine-tuning the model

In [40]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [41]:
batch_size = 8
learning_rate = 2e-5
weight_decay = 0.01
num_train_epochs = 1

In [42]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [43]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['abstract', 'attention_mask', 'category', 'general_category', 'id', 'input_ids', 'labels', 'title'],
        num_rows: 7550
    })
    test: Dataset({
        features: ['abstract', 'attention_mask', 'category', 'general_category', 'id', 'input_ids', 'labels', 'title'],
        num_rows: 2517
    })
})

In [44]:
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    batch_size=batch_size,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=True,
    collate_fn=data_collator,
)
validation_dataset = tokenized_datasets["test"].to_tf_dataset(
    batch_size=8,
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    collate_fn=data_collator,
)

In [45]:
from transformers import AdamWeightDecay
import tensorflow as tf

optimizer = AdamWeightDecay(learning_rate=learning_rate, weight_decay_rate=weight_decay)
model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as the 'labels' key of the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [47]:
model.fit(train_dataset, validation_data=validation_dataset, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f12ef78ca50>

## Evaluate the performance of the model

In [48]:
import numpy as np

decoded_predictions = []
decoded_labels = []
for batch in validation_dataset:
    labels = batch["labels"]
    predictions = model.predict_on_batch(batch)["logits"]
    predicted_tokens = np.argmax(predictions, axis=-1)
    decoded_predictions.extend(
        tokenizer.batch_decode(predicted_tokens, skip_special_tokens=True)
    )
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels.extend(tokenizer.batch_decode(labels, skip_special_tokens=True))

In [50]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [55]:
import nltk
import numpy as np

metric = load_metric("rouge")

# Rouge expects a newline after each sentence
decoded_predictions = [
    "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_predictions
]
decoded_labels = [
    "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
]

result = metric.compute(
    predictions=decoded_predictions, references=decoded_labels, use_stemmer=True
)
# Extract a few results
result = {key: value.mid.fmeasure for key, value in result.items()}

# Add mean generated length
prediction_lens = [
    np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
]
result["gen_len"] = np.mean(prediction_lens)

print({k: round(v, 4) for k, v in result.items()})

{'rouge1': 0.3297, 'rouge2': 0.1578, 'rougeL': 0.3183, 'rougeLsum': 0.3189, 'gen_len': 931712.0}


## Inference

In [65]:
dataset['test']['abstract'][0]

'  In this paper, a dynamic-programming approach to the coupled translational\nand rotational control of thruster-driven spacecraft is studied. To reduce the\ncomplexity of the problem, dynamic-programming-based optimal policies are\ncalculated using decoupled position and attitude dynamics with generalized\nforces and torques as controls. A quadratic-programming-based control\nallocation is then used to map the controls to actuator commands. To control\nthe spacecraft in the event of thruster failure, both the dynamic programming\npolicies and control allocation are reconfigured to cope with the losses in\ncontrols. The control allocation parameters are adjusted dynamically to ensure\nthe satellite always approaches the target from the side with two operative\nthrusters to achieve a stable control. The effectiveness of the proposed\ndynamic programming control is compared with a Lyapunov-stable control method,\nwhich shows that the proposed method is more fuel-efficient in tracking th

In [66]:
dataset['test']['title'][0]

'Dynamic-Programming-Based Failure-Tolerant Control for Satellite with\n  Thrusters in 6-DOF Motion'

In [72]:
random_num = 0

actual_title = dataset['test']['title'][random_num]
actual_abstract = dataset['test']['abstract'][random_num]


# T5 uses a max_length of 512 so we cut the article to 512 tokens.
inputs = tokenizer("summarize: " + actual_abstract, return_tensors="tf", max_length=512)
outputs = model.generate(
    inputs["input_ids"], max_length=20, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True
)

print(f'Actual Title: {actual_title}\n')
print(f'Predicted Title: {tokenizer.decode(outputs[0])}\n')
print(f'Actual Abstract: {actual_abstract}')

# print(tokenizer.decode(outputs[0]))

Actual Title: Dynamic-Programming-Based Failure-Tolerant Control for Satellite with
  Thrusters in 6-DOF Motion

Predicted Title: <pad> Dynamic-programming-based optimal policies for thruster-driven spacecraft

Actual Abstract:   In this paper, a dynamic-programming approach to the coupled translational
and rotational control of thruster-driven spacecraft is studied. To reduce the
complexity of the problem, dynamic-programming-based optimal policies are
calculated using decoupled position and attitude dynamics with generalized
forces and torques as controls. A quadratic-programming-based control
allocation is then used to map the controls to actuator commands. To control
the spacecraft in the event of thruster failure, both the dynamic programming
policies and control allocation are reconfigured to cope with the losses in
controls. The control allocation parameters are adjusted dynamically to ensure
the satellite always approaches the target from the side with two operative
thrusters t