In [1]:
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from datasets import load_dataset, load_metric
from rouge import Rouge
import numpy as np

In [2]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()
# checkpoint = torch.load("./gpt-2-original/pytorch_model.bin")
# model.load_state_dict(checkpoint)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model.to(device)

print("Preparing Model ...")

Preparing Model ...


In [3]:
def evaluate_sst2(model, tokenizer, device=device):
    possible_classes = ["positive", "negative"]
    def classify_text(text, possible_outputs):
        framed_texts = [f"{text} This text is {output}." for output in possible_outputs]
        encoded_inputs = [tokenizer.encode(t, return_tensors="pt").to(device) for t in framed_texts]
        
        logits_for_outputs = []

        for encoded_input in encoded_inputs:
            with torch.no_grad():
                outputs = model(encoded_input)
                logits = outputs.logits
            logits_for_outputs.append(logits[0, -1, :].squeeze().cpu().numpy())
        
        token_ids = [tokenizer.encode(output)[0] for output in possible_outputs]
        class_logits = [logits[token_id] for logits, token_id in zip(logits_for_outputs, token_ids)]
        return possible_outputs[class_logits.index(max(class_logits))]
    dataset = load_dataset("glue", "sst2")
    train_data = dataset['train']

    ground_truth = []
    pred_labels = []

    for data in tqdm(train_data):
        logits = data['label']
        prediction = classify_text(data['sentence'], possible_classes)
        if prediction == "positive":
            pred_labels.append(1)
        else:
            pred_labels.append(0)
        ground_truth.append(logits)
    return accuracy_score(ground_truth, pred_labels)

def evaluate_summarization(model, tokenizer, dataset="cnn_dailymail", max_tokens=150, device=device):
    def summarize(text, max_length=max_tokens):
        prompt = f"{text} TL;DR: "

        input_ids = tokenizer.encode(prompt, return_tensors='pt')
        input_ids = input_ids.to(device)

        with torch.no_grad():
            max_length = max_length + len(input_ids[0])
            output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        summary = generated_text.split("TL;DR: ")[-1]
        return summary

    dataset = load_dataset("cnn_dailymail", "3.0.0")
    test_data = dataset['test']
    rouge_1, rouge_2, rouge_l = [], [], []
    for data in tqdm(test_data):
        article = data['article']
        summary = summarize(article)
        ground_truth = data['highlights']
        rouge = Rouge()
        scores = rouge.get_scores(summary, ground_truth, avg=True)
        score_1, score_2, score_l = scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f']
        rouge_1.append(score_1)
        rouge_2.append(score_2)
        rouge_l.append(score_l)
    return np.mean(rouge_1), np.mean(rouge_2), np.mean(rouge_l)

In [10]:
evaluate_summarization(model, tokenizer)

Found cached dataset cnn_dailymail (/home/minhaoj2/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/11490 [00:00<?, ?it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [4]:
def summarize(text, max_length=150):
    prompt = f"{text} TL;DR: "

    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    input_ids = input_ids.to(device)
    
    with torch.no_grad():
        max_length = max_length + len(input_ids[0])
        output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    summary = generated_text.split("TL;DR: ")[-1]
    return summary

In [5]:
text = "The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed"
print(summarize(text))

 The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed
The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted


In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load pre-trained model and tokenizer
model_name = "gpt2-large"  # You can choose other sizes as well
model = GPT2LMHeadModel.from_pretrained(model_name).eval().cuda()
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
sum(p.numel() for p in model.parameters())
# def generate_summary(article):
#     article = article + "\nTL;DR: "
#     tokens = tokenizer.tokenize(article)
#     if len(tokens) > 924:
#         tokens = tokens[-924:]
#     input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tokens)]).cuda()
#     with torch.no_grad():
#         output = model.generate(
#             input_ids, 
#             max_length=len(input_ids[0]) + 150, 
#             num_return_sequences=1, 
#             do_sample=True, 
#             top_k=2
#         )
#     generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)

#     sentences = generated_summary.split('.')
#     if len(sentences) > 3:
#         return '.'.join(sentences[:3]) + '.'
#     else:
#         return generated_summary

# # Test the function
# article = "(CNN)The Palestinian Authority officially became the 123rd member of the International Criminal Court on Wednesday, a step that gives the court jurisdiction over alleged crimes in Palestinian territories. The formal accession was marked with a ceremony at The Hague, in the Netherlands, where the court is based. The Palestinians signed the ICC's founding Rome Statute in January, when they also accepted its jurisdiction over alleged crimes committed"
# print(generate_summary(article))

774030080