# Part 0: Basic Flan-T5 model setup

In [1]:
#!pip install transformers
#!pip install datasets
#!pip install rouge_score
#!pip install sentencepiece

In [2]:
import torch
import pandas as pd
import numpy as np
import nltk

from torch import nn
from torch.utils.data import DataLoader
from transformers import Trainer, AutoModelForSeq2SeqLM, AutoTokenizer, TrainingArguments, \
  DataCollatorForLanguageModeling, T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

from sklearn.metrics import mean_squared_error, precision_score
from datasets import load_dataset, Dataset, load_metric
from sklearn.model_selection import train_test_split
from tqdm import tqdm

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/leczhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

model.cuda()
model.config.max_length = 300
model.config.min_length = 100
model.config.no_repeat_ngram_size = 3

In [4]:
data = pd.read_csv("review_simplified.csv")
train_df, test_df = train_test_split(data, test_size=0.2, random_state=1)

train_data = Dataset.from_pandas(train_df)
test_data = Dataset.from_pandas(test_df)

In [5]:
metric = load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    print(decoded_preds)
    print(decoded_labels)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) 
                      for label in decoded_labels]
    
    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length to metrics
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

  metric = load_metric("rouge")


In [6]:
def evaluate_dataset(test_dataset):
    
    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=8)

    pred_label = []
    true_label = []

    for i,batch in enumerate(tqdm(dataloader)):
        batch['input_ids'] = batch['input_ids'].cuda()
        batch['attention_mask'] = batch['attention_mask'].cuda()
        predictions = model.generate(**{'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']})
        labels = batch['labels']
        pred_label.extend(predictions.cpu().tolist())
        true_label.extend(labels.cpu().tolist())

    return compute_metrics([pred_label, true_label])

# Part1: Without TextRank + Prompt4

In [7]:
prefix = "Write an introduction from reviews about "

def tokenize_function(examples):
    
    inputs = []
    for i in range(len(examples["Review"])):
        inputs.append(prefix + examples["Name"][i] + " : " + examples["Review"][i])
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Introduction"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

test_dataset = test_data.map(tokenize_function, batched=True)

evaluate_dataset(test_dataset)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

100%|██████████| 64/64 [04:51<00:00,  4.56s/it]




{'rouge1': 24.0364,
 'rouge2': 4.6408,
 'rougeL': 13.8634,
 'rougeLsum': 22.8319,
 'gen_len': 1.0}

# Part 2: Prompt Selection

## Prompt1: summarize:

In [7]:
prefix = "summarize: "

def tokenize_function(examples):
    
    inputs = [prefix + text for text in examples["review_simple"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Introduction"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

test_dataset = test_data.map(tokenize_function, batched=True)

evaluate_dataset(test_dataset)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

100%|██████████| 64/64 [03:31<00:00,  3.30s/it]


['Great place to play pinball games. Great place for a quick pinball game. Great selection of pinball machines. Great arcade games. Good prices. Great staff. Great games. Awesome place to visit. Great price. Great pinball machine selection. Great game. Good price. Good pinball experience. Great location. Great people. Great prices. Good games. Best pinball place I have ever been to. Great experience. Good value. Great time. Great memories. Great Pinball Machines.', '.. Last weekend I took my teen boys and their friend to explore Moran State Park for the afternoon. The trails were very well taken care of..... Last weekend i took my friends and i went to Orcas Island for camping, hiking, & photography. We had heard about Moran...... last weekend a friend & i... last week i explored Moran....... last weekend we hiked to one of the lakes in the park..... Last weekend we took the ferry to Orgas Island for Camping, hiking & Photography. We hiked here two out of three days on Orcas....', "Gre

{'rouge1': 17.518,
 'rouge2': 3.1934,
 'rougeL': 10.6393,
 'rougeLsum': 16.6353,
 'gen_len': 1.0}

## Prompt2: Paraphrase from an objective perspective: 

In [8]:
prefix = "Paraphrase from an objective perspective: "

def tokenize_function(examples):
    
    inputs = [prefix + text for text in examples["review_simple"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Introduction"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

test_dataset = test_data.map(tokenize_function, batched=True)

evaluate_dataset(test_dataset)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

100%|██████████| 64/64 [04:30<00:00,  4.23s/it]




{'rouge1': 21.8553,
 'rouge2': 4.097,
 'rougeL': 12.4642,
 'rougeLsum': 20.6361,
 'gen_len': 1.0}

## Prompt3: Write an introduction from reviews:

In [9]:
prefix = "Write an introduction from reviews: "

def tokenize_function(examples):
    
    inputs = [prefix + text for text in examples["review_simple"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Introduction"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

test_dataset = test_data.map(tokenize_function, batched=True)

evaluate_dataset(test_dataset)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

100%|██████████| 64/64 [04:58<00:00,  4.67s/it]




{'rouge1': 24.3706,
 'rouge2': 4.6297,
 'rougeL': 13.6927,
 'rougeLsum': 23.0659,
 'gen_len': 1.0}

## Prompt4: "Write an introduction from reviews about " + NAME + TEXT

In [11]:
prefix = "Write an introduction from reviews about "

def tokenize_function(examples):
    
    inputs = []
    for i in range(len(examples["review_simple"])):
        inputs.append(prefix + examples["Name"][i] + " : " + examples["review_simple"][i])
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Introduction"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

test_dataset = test_data.map(tokenize_function, batched=True)

evaluate_dataset(test_dataset)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

100%|██████████| 64/64 [04:36<00:00,  4.32s/it]




{'rouge1': 24.9601,
 'rouge2': 5.1326,
 'rougeL': 14.3027,
 'rougeLsum': 23.6322,
 'gen_len': 1.0}

# Part 3: Fine-tuned model evaluation

In [6]:
model = AutoModelForSeq2SeqLM.from_pretrained("./results/checkpoint-1000")
tokenizer = AutoTokenizer.from_pretrained("./results/checkpoint-1000")

prefix = "Write an introduction from reviews about "

def tokenize_function(examples):
    
    #inputs = [prefix + text for text in examples["review_simple"]]
    inputs = []
    for i in range(len(examples["review_simple"])):
        inputs.append(prefix + examples["Name"][i] + " : " + examples["review_simple"][i])
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["Introduction"], max_length=512, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

test_dataset = test_data.map(tokenize_function, batched=True)

evaluate_dataset(test_dataset)

Map:   0%|          | 0/505 [00:00<?, ? examples/s]

100%|██████████| 64/64 [07:06<00:00,  6.67s/it] 


['The Pinball Hall of Fame is located in Las Vegas, Nevada. The Pin Ball Hall of fame is a museum dedicated to pinball. The museum is open from 10am to 5pm daily. The pinball hall of fame was founded in 1939 by the late Countess of Las Vegas. The hall of Fame was built in 1931 and is located at 10801 Las Vegas Boulevard. The building was built by the Las Vegas Board of Supervisors. The Hall of the Fame is open to the public from 10 am to 5 pm daily. There are a number of pinball machines and arcade games available. The arcade games are based on the arcade games of the 1980s. There is sandboxes, arcades, and arcades. Several of the pinball games are available for purchase. The main attraction is the Pinball Museum. The Museum is located on the Las Las Vegas Strip. The exhibit is based in the Pin Ball Museum. There were a total of 108 pinball machine and arcade machines. The center of the museum is located near the Las vegas strip. The Center of the Pinbow Hall of Excellence is located n

{'rouge1': 37.5372,
 'rouge2': 10.7559,
 'rougeL': 20.609,
 'rougeLsum': 36.193,
 'gen_len': 1.0}