## GPT -- Text Summarization

In [3]:
# Import modules
from transformers import pipeline, GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
import pandas as pd
from rouge_score import rouge_scorer

ImportError: cannot import name 'pipeline' from 'transformers' (/Users/cartermondy/Library/Python/3.9/lib/python/site-packages/transformers/__init__.py)

In [2]:
# Import dataset 
dataset = load_dataset("cnn_dailymail", "3.0.0")
dataset

Generating train split: 100%|██████████| 287113/287113 [00:02<00:00, 129382.58 examples/s]
Generating validation split: 100%|██████████| 13368/13368 [00:00<00:00, 140180.65 examples/s]
Generating test split: 100%|██████████| 11490/11490 [00:00<00:00, 141722.79 examples/s]


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})

In [4]:
print(dataset['train'].to_pandas().head())

                                             article  ...                                        id
0  LONDON, England (Reuters) -- Harry Potter star...  ...  42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1  Editor's note: In our Behind the Scenes series...  ...  ee8871b15c50d0db17b0179a6d2beab35065f1e9
2  MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...  ...  06352019a19ae31e527f37f7571c6dd7f0c5da37
3  WASHINGTON (CNN) -- Doctors removed five small...  ...  24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4  (CNN)  -- The National Football League has ind...  ...  7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a

[5 rows x 3 columns]


In [5]:
# Access the dataset splits
train_data = dataset['train']
validation_data = dataset['validation']
test_data = dataset['test']

# View a sample
sample = train_data[0]
print("Original Text:", sample['article'])
print("Human Summary:", sample['highlights'])

Original Text: LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK box office chart.

In [None]:
# Load the summarization pipeline with a pretrained model
summarization_pipeline = pipeline("summarization", model="facebook/bart-large-cnn")

# Perform abstractive summarization on a single sample
article = sample['article']  # Use the article text from the dataset
abstractive_summary = summarization_pipeline(article, max_length=130, min_length=30, do_sample=False)

print("Abstractive Summary:", abstractive_summary[0]['summary_text'])

In [6]:
# Load GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add padding token (GPT-2 doesn't have one by default)
tokenizer.pad_token = tokenizer.eos_token

### Abstractive

In [34]:
def generate_abstractive_summary(article, max_new_tokens=150):
    # Create a summarization prompt
    prompt = f"Summarize the following article:\n\n{article}\n\nSummary:"
    
    # Tokenize the prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt", max_length=1024, truncation=True)
    
    # Generate summary
    outputs = model.generate(
        inputs, 
        max_new_tokens=max_new_tokens,  # Specify the number of tokens to generate
        num_return_sequences=1, 
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Decode the generated summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary.split("Summary:")[-1].strip()

# Test the function with a sample article
abstractive_summary = generate_abstractive_summary(sample['article'])
print("Abstractive Summary:", abstractive_summary)


### Extractive

In [59]:
def generate_extractive_summary(article, num_sentences=3):

    # Parse the input text
    parser = PlaintextParser.from_string(article, Tokenizer("english"))

    # Initialize the LexRank summarizer
    summarizer = LexRankSummarizer()

    # Generate a summary with 3 sentences
    summary = summarizer(parser.document, num_sentences)

    # Extract text from Sentence objects
    summary_text = " ".join([str(sentence) for sentence in summary])
    return summary_text

# Test the function with a sample article
extractive_summary = generate_extractive_summary(sample['article'])
print("Extractive Summary:", extractive_summary)

Extractive Summary: "As the Rome Statute today enters into force for the State of Palestine, Palestine acquires all the rights as well as responsibilities that come with being a State Party to the Statute. "What's objectionable is the attempts to undermine international justice, not Palestine's decision to join a treaty to which over 100 countries around the world are members." "As we have said repeatedly, we do not believe that Palestine is a state and therefore we do not believe that it is eligible to join the ICC," the State Department said in a statement.


In [60]:
# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluate the abstractive summary
rouge_scores = scorer.score(sample['highlights'], abstractive_summary)
print("ROUGE Scores (Abstractive):", rouge_scores)

# Evaluate the extractive summary
rouge_scores_extractive = scorer.score(sample['highlights'], extractive_summary)
print("ROUGE Scores (Extractive):", rouge_scores_extractive)


ROUGE Scores (Abstractive): {'rouge1': Score(precision=0.20754716981132076, recall=0.3235294117647059, fmeasure=0.25287356321839083), 'rouge2': Score(precision=0.019230769230769232, recall=0.030303030303030304, fmeasure=0.023529411764705882), 'rougeL': Score(precision=0.11320754716981132, recall=0.17647058823529413, fmeasure=0.13793103448275862)}
ROUGE Scores (Extractive): {'rouge1': Score(precision=0.11578947368421053, recall=0.3235294117647059, fmeasure=0.17054263565891473), 'rouge2': Score(precision=0.010638297872340425, recall=0.030303030303030304, fmeasure=0.015748031496062992), 'rougeL': Score(precision=0.07368421052631578, recall=0.20588235294117646, fmeasure=0.10852713178294572)}


In [66]:
def is_valid_input(article, max_tokens=1024):
    """
    Check if the tokenized input is valid for the model.
    """
    prompt = f"Summarize the following article:\n\n{article[:800]}\n\nSummary:"
    inputs = tokenizer.encode(prompt, truncation=True)
    return len(inputs) <= max_tokens

def truncate_article(article, max_tokens=1024):
    prompt_prefix = "Summarize the following article:\n\n"
    prompt_suffix = "\n\nSummary:"
    
    # Calculate space for the article
    prompt_length = len(tokenizer.encode(prompt_prefix + prompt_suffix, truncation=True))
    available_space = max_tokens - prompt_length
    
    # Truncate article
    return article[:available_space]

In [68]:
results = []

for i, sample in enumerate(dataset['test'].select(range(10))):  # Process the first 8 samples
    print(f"Processing sample {i + 1}...")
    article = sample['article']
    human_summary = sample['highlights']
    
    # Validate and truncate article
    if not is_valid_input(article):
        print(f"Skipping sample {i + 1}: Input too long.")
        continue
    
    truncated_article = truncate_article(article)
    
    try:
        # Generate summaries
        abstractive_summary = generate_abstractive_summary(truncated_article)
        extractive_summary = generate_extractive_summary(truncated_article, num_sentences=3)
    except IndexError as e:
        print(f"Error processing sample {i + 1}: {e}")
        abstractive_summary = "Error in abstractive summarization"
        extractive_summary = "Error in extractive summarization"
        continue  # Optionally skip adding failed samples

    # Evaluate summaries
    rouge_abstractive = scorer.score(human_summary, abstractive_summary)
    rouge_extractive = scorer.score(human_summary, extractive_summary)
    
    # Flatten ROUGE scores
    rouge_abstractive_flat = {f"abstractive_{k}": v.fmeasure for k, v in rouge_abstractive.items()}
    rouge_extractive_flat = {f"extractive_{k}": v.fmeasure for k, v in rouge_extractive.items()}
    
    # Store results
    results.append({
        "article": article,
        "human_summary": human_summary,
        "abstractive_summary": abstractive_summary,
        "extractive_summary": extractive_summary,
        **rouge_abstractive_flat,
        **rouge_extractive_flat
    })


df = pd.DataFrame(results)
# df.to_csv("gpt2_summarization_results.csv", index=False)
# print("Results saved to 'gpt2_summarization_results.csv'.")

Processing sample 1...




Processing sample 2...
Processing sample 3...
Processing sample 4...
Processing sample 5...
Processing sample 6...
Processing sample 7...
Processing sample 8...
Processing sample 9...
Processing sample 10...


In [71]:
df

Unnamed: 0,article,human_summary,abstractive_summary,extractive_summary,abstractive_rouge1,abstractive_rouge2,abstractive_rougeL,extractive_rouge1,extractive_rouge2,extractive_rougeL
0,(CNN)The Palestinian Authority officially beca...,Membership gives the ICC jurisdiction over all...,The Palestinian government has formally become...,(CNN)The Palestinian Authority officially beca...,0.139535,0.0,0.139535,0.295652,0.159292,0.226087
1,(CNN)Never mind cats having nine lives. A stra...,"Theia, a bully breed mix, was apparently hit b...","The dog was a mix of white and black, and was ...",(CNN)Never mind cats having nine lives. A stra...,0.24,0.027027,0.146667,0.390244,0.165289,0.292683
2,"(CNN)If you've been following the news lately,...",Mohammad Javad Zarif has spent more time with ...,The Islamic Republic of Iran has agreed to a d...,"(CNN)If you've been following the news lately,...",0.097561,0.012346,0.085366,0.125,0.051282,0.1
3,(CNN)Five Americans who were monitored for thr...,17 Americans were exposed to the Ebola virus w...,"The U.S. Centers and the U, the world's larges...",The others have already gone home. They were e...,0.202381,0.012048,0.119048,0.43956,0.134831,0.32967
4,(CNN)A Duke student has admitted to hanging a ...,Student is no longer on Duke University campus...,The Duke University Police Department is inves...,(CNN)A Duke student has admitted to hanging a ...,0.262295,0.033898,0.131148,0.377778,0.113636,0.222222
5,(CNN)He's a blue chip college basketball recru...,College-bound basketball star asks girl with D...,Trey was a freshman at Western High in Kentuck...,Trey -- a star on Eastern High School's basket...,0.164179,0.030303,0.074627,0.25,0.023256,0.136364
6,(CNN)Governments around the world are using th...,Amnesty's annual death penalty report catalogs...,The United States has used death sentences to ...,(CNN)Governments around the world are using th...,0.236686,0.023952,0.118343,0.396396,0.220183,0.27027
7,"(CNN)Andrew Getty, one of the heirs to billion...",Andrew Getty's death appears to be from natura...,"The family of Andrew Andrew, who was born in 1...",The coroner's preliminary assessment is there ...,0.139535,0.035294,0.093023,0.227273,0.046154,0.121212
8,(CNN)Filipinos are being warned to be on guard...,"Once a super typhoon, Maysak is now a tropical...",The Philippines is facing a major storm surge ...,(CNN)Filipinos are being warned to be on guard...,0.3125,0.032258,0.3125,0.357143,0.121951,0.190476
9,"(CNN)For the first time in eight years, a TV l...","Bob Barker returned to host ""The Price Is Righ...",The Price is Right is a show that has been aro...,"(CNN)For the first time in eight years, a TV l...",0.137931,0.070588,0.114943,0.268657,0.123077,0.238806
