# Experiment to compare OCCAMS and GPT on extractive and hybrid summarization on the CNN/DM dataset

## Design:

- Length will be determined by the human summaries. We will only look at documents that have summaries within 10% of the median. This will help us to have consistent lengths.
- Of these selected documents, we will find the median number of sentences in the human summaries (rounded up). Then we will use this as our 'sentence_budget' for GPT. It will select the most important sentences to form a summary.
- OCCAMS will then be used on the documents, given a character_budget equal to the 75th percentile of the lengths of the GPT extractive summaries. This summary will be used for the hybrid summary.
- GPT will abstractively summarize both of these, with a character budget of the target length. We will iterate to ensure that the summaries do not exceed this budget.

For now, I'm only going to aim for 250 documents, with 50 of those being assurance against errors from the API

# Load and Filter Data

In [1]:
from datasets import load_dataset
import math
import ast
import re
from nltk import sent_tokenize
import pandas as pd
import json
from tqdm import tqdm

tqdm.pandas(desc='bar')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Adapted from supervised_occams_example_bootscads.ipynb by NeilM

# (dataset name, configuration/version) pair, unpacked into load_dataset below.
DATASET = ("cnn_dailymail", "3.0.0")
# Both slices come from the official *train* split: the first 95% is bound to
# `train` and the remaining 5% is bound to `test`.
SPLITS = [
    "train[0%:95%]",
    "train[95%:100%]",
]  # take last 5% of train as validation data
# Passing a list of splits returns one dataset per entry, in the same order.
train, test = load_dataset(*DATASET, split=SPLITS)
print(f"Loaded {DATASET[0]}.   len(train)={len(train)}    len(test)={len(test)}")

Found cached dataset cnn_dailymail (/home/vaburban/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|██████████| 2/2 [00:00<00:00, 330.59it/s]

Loaded cnn_dailymail.   len(train)=272757    len(test)=14356





In [2]:
import numpy as np
from occams.summarize import SummaryUnits

# We decide to measure our documents in characters (as opposed to words)
UNITS = SummaryUnits.CHARS


def filter_data(
    data,
    min_sum_len,
    max_sum_len,
    min_doc_len,
    doc_key="article",
    sum_key="highlights",
    UNITS=UNITS,
):
    """Keep only the examples whose summary and document lengths are acceptable.

    An example survives when the length of its summary (as measured by
    ``UNITS.len``) lies in the closed interval ``[min_sum_len, max_sum_len]``
    and the length of its document is strictly greater than ``min_doc_len``.

    Args:
        data: Object exposing ``filter(predicate)``; each example is indexed
            by ``doc_key`` and ``sum_key``.
        min_sum_len: Smallest summary length to keep (inclusive).
        max_sum_len: Largest summary length to keep (inclusive).
        min_doc_len: Documents must be strictly longer than this.
        doc_key: Field holding the document; the CNN/DM data uses "article".
        sum_key: Field holding the summary; the CNN/DM data uses "highlights".
        UNITS: Length measure; only its ``len`` method is used here.

    Returns:
        Whatever ``data.filter`` returns for the predicate described above.
    """

    def _keep(example):
        # Judge the summary first; the document is only measured when the
        # summary is already within bounds.
        summary_len = UNITS.len(example[sum_key])
        if summary_len < min_sum_len or summary_len > max_sum_len:
            return False
        return UNITS.len(example[doc_key]) > min_doc_len

    return data.filter(_keep)


def get_target_length(summaries, UNITS=UNITS, length_quantile=0.5):
    """Measure every summary and pick a target length from the distribution.

    Args:
        summaries: Iterable of summary texts.
        UNITS: Length measure; only its ``len`` method is used here.
        length_quantile: Quantile of the length distribution to use as the
            target (0.5 is the median).

    Returns:
        A ``(num_units, target_length)`` tuple: the list of per-summary
        lengths and the requested quantile truncated to an ``int``.
    """
    num_units = list(map(UNITS.len, summaries))
    quantile_value = np.quantile(num_units, length_quantile)
    return num_units, int(quantile_value)


def truncate_data(data, max_length):
    """Cap ``data`` at its first ``max_length`` examples.

    Args:
        data: Sized object exposing ``select(indices)``.
        max_length: Maximum number of examples to keep.

    Returns:
        ``data`` itself when it already has at most ``max_length`` examples,
        otherwise ``data.select(range(max_length))``.
    """
    if len(data) <= max_length:
        return data
    leading_indices = range(max_length)
    return data.select(leading_indices)

In [4]:
# Filter the training data
train = filter_data(train, min_sum_len=100, max_sum_len=2500, min_doc_len=100)

# Use filtered training data to get the target length for our computed summaries
# (default quantile is 0.5, i.e. the median summary length in UNITS).
num_units, target_length = get_target_length(train["highlights"])

# Filter the testing data: keep summaries whose length is between 90% and 100%
# of the target length.
# NOTE(review): the design notes say "within 10% of the median"; this window is
# one-sided (never longer than the target) -- confirm that is intended.
test = filter_data(
    test,
    min_sum_len=int(0.9 * target_length),
    max_sum_len=int(target_length),
    min_doc_len=100,
)

# truncate data (to speed up computations); keeps the first N examples, in order
train = truncate_data(train, max_length=2000)
test = truncate_data(test, max_length=1000)

print(f"The target length is {target_length} in {UNITS.name.lower()}.")
print(f"Now, we have {len(train)} documents and summaries in our training data")
print(f"Now, we have {len(test)} documents and summaries in our testing data")

Loading cached processed dataset at /home/vaburban/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-0e575c24af0409e8.arrow
Loading cached processed dataset at /home/vaburban/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-40951ba7536e0989.arrow


The target length is 279 in chars.
Now, we have 2000 documents and summaries in our training data
Now, we have 1000 documents and summaries in our testing data


In [7]:
# Entire Dataframe
# cnn_summaries = pd.DataFrame(cnn_dataset).rename(columns={"article": "text", "highlights": "human_summary"})

# SAMPLING A SUBSET FOR EXPERIMENT
# Fixed seed so that restarting the kernel draws the same 250 documents;
# without it every run of this cell evaluates a different subset.
# NOTE(review): this samples from `train`, whose summaries were only filtered
# to 100-2500 characters; the length-matched set is `test` -- confirm which
# one the experiment is meant to use.
cnn_summaries = pd.DataFrame(train).sample(250, random_state=42)

# Sentence budget for the extractive prompt: median number of sentences per
# human summary in the sample, rounded up to a whole sentence.
sentence_budget = math.ceil(cnn_summaries['highlights'].apply(sent_tokenize).str.len().median())

# Generating GPT Summaries

In [8]:
print(f"Target summary length in sentences is {sentence_budget}")

Target summary length in sentences is 4


In [4]:
# code credit Jacob Baxter
import requests
with open('../artifacts/gpt_key.txt') as f:
    key_file = f.readlines()
# readlines() keeps the line terminator; strip it (and any stray whitespace)
# so the token can be sent as an HTTP header value unchanged.
KEY = key_file[0].strip()
def gpt_summarize(text, prompt, timeout=120):
    """Send ``prompt`` (with ``text`` substituted in) to the chat-completions proxy.

    Args:
        text: Text substituted for the ``{text}`` placeholder of ``prompt``.
        prompt: Template string containing a ``{text}`` placeholder.
        timeout: Seconds to wait for the proxy before treating the request as
            failed. Without it a stalled connection would block forever.

    Returns:
        The message content of the first returned choice, or the sentinel
        string ``"No summary possible."`` (after printing ``ERROR``) when the
        request fails or the response does not have the expected shape.
    """
    url = "https://apiproxy.ncsu-las.net/APIGateway/openai/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "LAS-API-Token": KEY
    }

    data = {
      "model": "gpt-3.5-turbo",
      "messages": [{"role": "user", "content": prompt.format(text=text)}],
      "temperature": 0.7
    }

    try:
        # The request itself is inside the try block so that a dropped
        # connection or timeout yields the sentinel instead of aborting the
        # whole DataFrame apply.
        response = requests.post(url, headers=headers, data=json.dumps(data), timeout=timeout)
        summ = response.json()["choices"][0]["message"]["content"]
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Only expected failures are handled: transport errors, a non-JSON
        # body, or a JSON body without choices[0].message.content.
        # KeyboardInterrupt is deliberately allowed to propagate.
        print("ERROR")
        summ = "No summary possible."
    return summ

In [10]:
%%time

# The sentence budget is substituted now, while text='{text}' re-inserts a
# literal "{text}" placeholder so that gpt_summarize can fill in each article
# later with its own .format(text=...) call.
extractive_prompt = """Please identify the {sentence_budget} most important sentences from the text, and present them in a Python list.

Text:
{text}

Output:
""".format(sentence_budget=sentence_budget, text='{text}')


# One API call per article. Failed calls are stored as the sentinel string
# returned by gpt_summarize; successful ones hold the raw model reply (still
# unparsed text at this point).
cnn_summaries['gpt_e_summ'] = cnn_summaries.article.progress_apply(lambda x: gpt_summarize(x, extractive_prompt))

bar:  12%|█▏        | 31/250 [02:48<19:21,  5.30s/it]

ERROR


bar:  33%|███▎      | 82/250 [07:48<37:56, 13.55s/it]

ERROR


bar:  36%|███▌      | 89/250 [08:47<36:07, 13.46s/it]

ERROR


bar:  38%|███▊      | 95/250 [09:42<35:20, 13.68s/it]

ERROR


bar:  38%|███▊      | 96/250 [09:50<30:12, 11.77s/it]

ERROR


bar:  40%|███▉      | 99/250 [10:06<19:12,  7.63s/it]

ERROR


bar:  52%|█████▏    | 129/250 [12:42<10:19,  5.12s/it]

ERROR


bar: 100%|██████████| 250/250 [23:44<00:00,  5.70s/it]

CPU times: user 15 s, sys: 176 ms, total: 15.1 s
Wall time: 23min 44s





In [18]:
def retry_e_gpt(row):
    """Re-request the extractive GPT summary for a row whose call failed.

    Args:
        row: Mapping with 'gpt_e_summ' and 'article' entries.

    Returns:
        The existing 'gpt_e_summ' value, unless it is the failure sentinel
        'No summary possible.', in which case a fresh ``gpt_summarize`` result
        for the row's article is returned.
    """
    current = row['gpt_e_summ']
    if current != 'No summary possible.':
        return current
    return gpt_summarize(row['article'], extractive_prompt)

print(len(cnn_summaries[cnn_summaries['gpt_e_summ']=='No summary possible.']))
cnn_summaries['gpt_e_summ'] = cnn_summaries.progress_apply(lambda x: retry_e_gpt(x), axis=1)
print(len(cnn_summaries[cnn_summaries['gpt_e_summ']=='No summary possible.']))

7


bar: 100%|██████████| 250/250 [00:36<00:00,  6.86it/s]

0





In [20]:
import ast
def parse_list(x):
    """Turn a GPT reply containing a Python list literal into one string.

    Args:
        x: Raw reply text, expected to be a list literal of sentence strings.

    Returns:
        The list items joined by single spaces, or the sentinel ``'ERROR'``
        (after printing ``ERROR``) when ``x`` is not a parseable list/tuple
        of strings.
    """
    try:
        parsed = ast.literal_eval(x)
        # A bare quoted string is a valid literal too, but joining it would
        # interleave spaces between its characters, so it is rejected.
        if not isinstance(parsed, (list, tuple)):
            raise TypeError("expected a list literal")
        return " ".join(parsed)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the failures literal_eval / join produce on malformed
        # replies. There is no `finally: return`, so anything unexpected
        # (e.g. KeyboardInterrupt) propagates.
        print('ERROR')
        return 'ERROR'
    
cnn_summaries['gpt_e_summ'] = cnn_summaries.gpt_e_summ.progress_apply(lambda x: parse_list(x))
len(cnn_summaries[cnn_summaries['gpt_e_summ']=='ERROR'])

bar: 100%|██████████| 250/250 [00:00<00:00, 41457.16it/s]

ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR
ERROR





46

In [30]:
# Retrying the summaries that generated faulty Python lists
# We can rerun this cell as many times as we want to get the most summaries possible

def retry_extractive(row):
    """Re-request and re-parse the extractive summary for a row that failed parsing.

    Args:
        row: Mapping with 'gpt_e_summ' and 'article' entries.

    Returns:
        The existing 'gpt_e_summ' value when it is neither 'ERROR' nor empty;
        otherwise the parsed result of a fresh ``gpt_summarize`` call on the
        row's article.
    """
    existing = row['gpt_e_summ']
    if existing != 'ERROR' and existing != "":
        return existing
    fresh_reply = gpt_summarize(row['article'], extractive_prompt)
    return parse_list(fresh_reply)

    
print(len(cnn_summaries[cnn_summaries['gpt_e_summ']=='ERROR']))
cnn_summaries['gpt_e_summ'] = cnn_summaries.progress_apply(lambda x: retry_extractive(x), axis=1)

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' (3158815999.py, line 14)

In [33]:
cnn_summaries = cnn_summaries[(cnn_summaries['gpt_e_summ']!='ERROR') & (cnn_summaries['gpt_e_summ']!='') ]
len(cnn_summaries)

247

In [34]:
cnn_summaries.to_parquet('datasets/cnn_experiment.parquet')

# Generating OCCAMS Summaries

## Final Extractive Summary
OCCAMS summary not meant for hybrid methods, target length is target_length

In [35]:
from occams.nlp import TermOrder
from occams.nlp import process_document
from occams.summarize import SummaryUnits, extract_summary
UNITS = SummaryUnits.CHARS

In [36]:
# avg_len = sum(cnn_summaries['gpt_e_summ'].str.len())/len(cnn_summaries)
char_budget = target_length
# char_budget = math.floor(cnn_summaries.gpt_e_summ.str.len().quantile(.75))

from nltk.tokenize import sent_tokenize

# Adapted from Jacob Baxter
def occams_summarize(document):
    """Produce an OCCAMS summary of a single document.

    The budget is read from the notebook-level ``char_budget`` at call time,
    so the result depends on the most recent assignment to that name.

    Args:
        document: Text handed to ``process_document``.

    Returns:
        The value of ``.summary()`` on the ``extract_summary`` result.
    """
    processed = process_document(document, TermOrder.BIGRAMS)
    extraction = extract_summary([processed], budget=char_budget, units=UNITS)
    return extraction.summary()
    

In [37]:
%%time 
cnn_summaries["occams_summ_f"] = cnn_summaries.article.progress_apply(occams_summarize)
cnn_summaries.head()

bar: 100%|██████████| 247/247 [00:02<00:00, 84.74it/s]

CPU times: user 2.92 s, sys: 4 ms, total: 2.92 s
Wall time: 2.92 s





Unnamed: 0,article,highlights,id,gpt_e_summ,occams_summ_f
1881,"NEW YORK (CNN) -- Nonnie Dotson, a nurse in th...",Nonnie Dotson was visiting brother in suburb o...,9880666126c7836dbe99b0b42becfee3dcab88cf,"Nonnie Dotson, a nurse in the U.S. Air Force, ...","NEW YORK (CNN) -- Nonnie Dotson, a nurse in th..."
1332,(CNN) -- With a week to go before Election Day...,Obama leads by 8 points in most recent CNN nat...,c24025591e20b5d21062d1a1ec8fbf4686b963b7,The most recent national CNN poll of polls sho...,"In Florida, a CNN/USA Today/Gallup poll conduc..."
444,"BAGHDAD, Iraq (CNN) -- Four U.S. soldiers die...","NEW: As Iraq war enters sixth year, American d...",d80e6a3be826df05df00a87f49cc426fd597f085,Four U.S. soldiers died in a roadside bombing ...,"BAGHDAD, Iraq (CNN) -- Four U.S. soldiers died..."
86,"ALGIERS, Algeria (CNN) -- Rescuers are sifting...",Two bombs explode in Algerian capital near gov...,4cf51ce9372dff8ff7f44f098eab1c1d7569af7a,Rescuers are sifting through the rubble of the...,"ALGIERS, Algeria (CNN) -- Rescuers are sifting..."
958,(CNN) -- The crib in Ellen Darcy's Boston home...,"Guatemala and Vietnam say corruption, baby-ste...",7551503e7e57bd519913b0df90ca1e80d5305b05,Guatemala has announced it will conduct a case...,(CNN) -- The crib in Ellen Darcy's Boston home...


## Extractive Summary for Hybrid Summarization

In [38]:
char_budget = math.floor(cnn_summaries.gpt_e_summ.str.len().quantile(.75))

In [39]:
char_budget

695

In [40]:
%%time 
cnn_summaries["occams_summ_h"] = cnn_summaries.article.progress_apply(occams_summarize)
cnn_summaries.head()

bar: 100%|██████████| 247/247 [00:02<00:00, 86.79it/s]

CPU times: user 2.85 s, sys: 0 ns, total: 2.85 s
Wall time: 2.85 s





Unnamed: 0,article,highlights,id,gpt_e_summ,occams_summ_f,occams_summ_h
1881,"NEW YORK (CNN) -- Nonnie Dotson, a nurse in th...",Nonnie Dotson was visiting brother in suburb o...,9880666126c7836dbe99b0b42becfee3dcab88cf,"Nonnie Dotson, a nurse in the U.S. Air Force, ...","NEW YORK (CNN) -- Nonnie Dotson, a nurse in th...","NEW YORK (CNN) -- Nonnie Dotson, a nurse in th..."
1332,(CNN) -- With a week to go before Election Day...,Obama leads by 8 points in most recent CNN nat...,c24025591e20b5d21062d1a1ec8fbf4686b963b7,The most recent national CNN poll of polls sho...,"In Florida, a CNN/USA Today/Gallup poll conduc...",(CNN) -- With a week to go before Election Day...
444,"BAGHDAD, Iraq (CNN) -- Four U.S. soldiers die...","NEW: As Iraq war enters sixth year, American d...",d80e6a3be826df05df00a87f49cc426fd597f085,Four U.S. soldiers died in a roadside bombing ...,"BAGHDAD, Iraq (CNN) -- Four U.S. soldiers died...","BAGHDAD, Iraq (CNN) -- Four U.S. soldiers died..."
86,"ALGIERS, Algeria (CNN) -- Rescuers are sifting...",Two bombs explode in Algerian capital near gov...,4cf51ce9372dff8ff7f44f098eab1c1d7569af7a,Rescuers are sifting through the rubble of the...,"ALGIERS, Algeria (CNN) -- Rescuers are sifting...","ALGIERS, Algeria (CNN) -- Rescuers are sifting..."
958,(CNN) -- The crib in Ellen Darcy's Boston home...,"Guatemala and Vietnam say corruption, baby-ste...",7551503e7e57bd519913b0df90ca1e80d5305b05,Guatemala has announced it will conduct a case...,(CNN) -- The crib in Ellen Darcy's Boston home...,(CNN) -- The crib in Ellen Darcy's Boston home...


In [41]:
cnn_summaries.to_parquet('datasets/cnn_experiment.parquet')

In [3]:
cnn_summaries = pd.read_parquet('datasets/cnn_experiment.parquet')

# Hybrid Summarization

To ensure that the length of these summaries is appropriate, we will need to iterate on the length and force GPT to not give us character counts. For the sake of experimentation, I will keep the full length summary too.

## Hybrid Summary: OCCAMS

In [6]:
%%time

# Note that we are using the SAME prompt for both OCCAMS and ChatGPT based extractive summaries
# .format(text='{text}') simply re-inserts the "{text}" placeholder, so the
# template is unchanged; gpt_summarize fills it in per row.
hybrid_prompt = """You are a  summarizer that follows the output pattern. Please summarize this text.

Text:
{text}

Summary:
""".format(text='{text}')

# Abstractive pass over the longer OCCAMS extract ('occams_summ_h'); failed
# calls are stored as the sentinel string returned by gpt_summarize.
cnn_summaries['occams_gpt_summ'] = cnn_summaries.occams_summ_h.progress_apply(lambda x: gpt_summarize(x, hybrid_prompt))

bar:   6%|▋         | 16/247 [01:23<46:06, 11.98s/it]

ERROR


bar:  32%|███▏      | 78/247 [05:08<32:51, 11.66s/it]

ERROR


bar:  60%|█████▉    | 148/247 [09:31<19:15, 11.67s/it]

ERROR


bar:  74%|███████▎  | 182/247 [11:55<12:22, 11.43s/it]

ERROR


bar:  84%|████████▍ | 208/247 [13:52<07:27, 11.47s/it]

ERROR


bar:  85%|████████▌ | 210/247 [14:02<05:04,  8.24s/it]

ERROR


bar: 100%|██████████| 247/247 [16:20<00:00,  3.97s/it]

CPU times: user 15.6 s, sys: 198 ms, total: 15.8 s
Wall time: 16min 20s





In [8]:
# Too many errors, retrying
# Run this cell as many times as needed to get the desired number of good summaries
def retry_occams_gpt(row):
    """Re-request the OCCAMS-based hybrid summary for a row whose call failed.

    Args:
        row: Mapping with 'occams_gpt_summ' and 'occams_summ_h' entries.

    Returns:
        The existing 'occams_gpt_summ' value, unless it is the failure
        sentinel 'No summary possible.', in which case a fresh
        ``gpt_summarize`` result for the row's OCCAMS extract is returned.
    """
    current = row['occams_gpt_summ']
    if current != 'No summary possible.':
        return current
    return gpt_summarize(row['occams_summ_h'], hybrid_prompt)

print(len(cnn_summaries[cnn_summaries['occams_gpt_summ']!='No summary possible.']))
                             
cnn_summaries['occams_gpt_summ'] = cnn_summaries.progress_apply(lambda x: retry_occams_gpt(x), axis=1)
print(len(cnn_summaries[cnn_summaries['occams_gpt_summ']!='No summary possible.']))

241


bar: 100%|██████████| 247/247 [00:20<00:00, 12.17it/s]

247





In [9]:
cnn_summaries = cnn_summaries[cnn_summaries['occams_gpt_summ']!='No summary possible.']
len(cnn_summaries)

247

## Hybrid Summary: GPT_E

In [19]:
%%time
cnn_summaries['gpt_gpt_summ'] = cnn_summaries.gpt_e_summ.progress_apply(lambda x: gpt_summarize(x, hybrid_prompt))
print(len(cnn_summaries[cnn_summaries['gpt_gpt_summ']!='No summary possible.']))

bar:   5%|▍         | 12/247 [01:05<46:25, 11.85s/it]

ERROR


bar:   6%|▌         | 15/247 [01:47<59:05, 15.28s/it]

ERROR


bar:  26%|██▋       | 65/247 [04:27<09:45,  3.22s/it]

ERROR


bar:  39%|███▉      | 97/247 [06:40<28:07, 11.25s/it]

ERROR


bar:  72%|███████▏  | 179/247 [11:41<13:01, 11.49s/it]

ERROR


bar:  78%|███████▊  | 193/247 [12:53<10:44, 11.93s/it]

ERROR


bar:  87%|████████▋ | 214/247 [14:32<06:09, 11.19s/it]

ERROR


bar:  91%|█████████ | 225/247 [15:36<04:15, 11.63s/it]

ERROR


bar:  94%|█████████▎| 231/247 [16:17<03:07, 11.72s/it]

ERROR


bar:  95%|█████████▍| 234/247 [16:27<01:19,  6.12s/it]

ERROR


bar:  99%|█████████▉| 245/247 [17:31<00:23, 11.61s/it]

ERROR


bar: 100%|██████████| 247/247 [17:42<00:00,  4.30s/it]

236
CPU times: user 14.9 s, sys: 168 ms, total: 15 s
Wall time: 17min 42s





In [20]:
# Too many errors, retrying
# Run this cell as many times as needed to get the desired number of good summaries
def retry_gpt(row):
    """Re-request the GPT-extract-based hybrid summary for a row whose call failed.

    Args:
        row: Mapping with 'gpt_gpt_summ' and 'gpt_e_summ' entries.

    Returns:
        The existing 'gpt_gpt_summ' value, unless it is the failure sentinel
        'No summary possible.', in which case a fresh ``gpt_summarize`` result
        for the row's GPT extractive summary is returned.
    """
    current = row['gpt_gpt_summ']
    if current != 'No summary possible.':
        return current
    return gpt_summarize(row['gpt_e_summ'], hybrid_prompt)
    
print(len(cnn_summaries[cnn_summaries['gpt_gpt_summ']!='No summary possible.']))
cnn_summaries['gpt_gpt_summ'] = cnn_summaries.progress_apply(lambda x: retry_gpt(x), axis=1)
print(len(cnn_summaries[cnn_summaries['gpt_gpt_summ']!='No summary possible.']))

236


bar: 100%|██████████| 247/247 [00:33<00:00,  7.37it/s]

247





In [21]:
cnn_summaries = cnn_summaries[cnn_summaries['gpt_gpt_summ']!='No summary possible.']
len(cnn_summaries)

247

In [29]:
cnn_summaries.head()

In [38]:
cnn_summaries.to_parquet('datasets/cnn_experiment.parquet')

## Controlling Length of Hybrid Summaries

In [39]:
# Renaming the hybrid summaries since we want to get shorter summaries. Avoiding confusion

cnn_summaries = cnn_summaries.rename(columns={"gpt_gpt_summ": "gpt_gpt_summ_full", "occams_gpt_summ": "occams_gpt_summ_full"})

print(cnn_summaries['occams_gpt_summ_full'].str.len().describe())
print(cnn_summaries['gpt_gpt_summ_full'].str.len().describe())

count    247.000000
mean     375.995951
std       91.514871
min      114.000000
25%      318.000000
50%      366.000000
75%      434.500000
max      638.000000
Name: occams_gpt_summ_full, dtype: float64
count    247.000000
mean     367.821862
std      109.896998
min      160.000000
25%      295.000000
50%      358.000000
75%      432.000000
max      783.000000
Name: gpt_gpt_summ_full, dtype: float64


In [47]:
compress_prompt = """Make this summary less than {target_length} characters, following the output format:

Text:
{text}

Summary:
""".format(text='{text}', target_length=target_length)

def compress_summary(x, max_attempts=25):
    """Ask GPT to shorten ``x`` until it is at most ``target_length`` characters.

    Args:
        x: Summary text to compress.
        max_attempts: Upper bound on API calls for this summary. Without a
            bound the loop never ends if the API keeps failing or the model
            never returns a short enough text.

    Returns:
        The first candidate no longer than ``target_length``; ``x`` itself if
        it already fits. If ``max_attempts`` is exhausted, the most recent
        candidate is returned even though it is still too long, and a message
        is printed.
    """
    attempts = 0
    while len(x) > target_length:
        if attempts >= max_attempts:
            print("ERROR: compress_summary gave up after", max_attempts, "attempts")
            break
        attempts += 1
        new_x = gpt_summarize(x, compress_prompt)
        # A failed call keeps the previous text so the next attempt retries it.
        if new_x != 'No summary possible.':
            # Drop character-count annotations the model sometimes appends.
            x = re.sub(r"\s\(\d+ characters\)|Characters\)", "", new_x)
    return x

cnn_summaries['occams_gpt_summ'] = cnn_summaries.occams_gpt_summ_full.progress_apply(lambda x: compress_summary(x))
print(len(cnn_summaries[cnn_summaries['occams_gpt_summ']!='No summary possible.']))

bar:  40%|███▉      | 98/247 [04:38<11:27,  4.62s/it]

ERROR
ERROR


bar:  42%|████▏     | 104/247 [05:27<12:19,  5.17s/it]

ERROR


bar:  60%|█████▉    | 147/247 [08:50<07:32,  4.52s/it]

ERROR


bar: 100%|██████████| 247/247 [14:20<00:00,  3.49s/it]

247





In [50]:
cnn_summaries = cnn_summaries[cnn_summaries['occams_gpt_summ']!='No summary possible.']
cnn_summaries.head()

Unnamed: 0,article,highlights,id,gpt_e_summ,occams_summ_f,occams_summ_h,occams_gpt_summ_full,gpt_gpt_summ_full,occams_gpt_summ
1881,"NEW YORK (CNN) -- Nonnie Dotson, a nurse in th...",Nonnie Dotson was visiting brother in suburb o...,9880666126c7836dbe99b0b42becfee3dcab88cf,"Nonnie Dotson, a nurse in the U.S. Air Force, ...","NEW YORK (CNN) -- Nonnie Dotson, a nurse in th...","NEW YORK (CNN) -- Nonnie Dotson, a nurse in th...","Nonnie Dotson, a nurse in the U.S. Air Force, ...","Nonnie Dotson, a nurse in the U.S. Air Force, ...",Nurse Nonnie Dotson disappeared amid a child s...
1332,(CNN) -- With a week to go before Election Day...,Obama leads by 8 points in most recent CNN nat...,c24025591e20b5d21062d1a1ec8fbf4686b963b7,The most recent national CNN poll of polls sho...,"In Florida, a CNN/USA Today/Gallup poll conduc...",(CNN) -- With a week to go before Election Day...,Recent national polls show Democratic presiden...,The most recent national CNN poll of polls sho...,Recent national polls show Obama leading McCai...
444,"BAGHDAD, Iraq (CNN) -- Four U.S. soldiers die...","NEW: As Iraq war enters sixth year, American d...",d80e6a3be826df05df00a87f49cc426fd597f085,Four U.S. soldiers died in a roadside bombing ...,"BAGHDAD, Iraq (CNN) -- Four U.S. soldiers died...","BAGHDAD, Iraq (CNN) -- Four U.S. soldiers died...",Four U.S. soldiers were killed in a roadside b...,Four U.S. soldiers were killed in a roadside b...,4 US soldiers and 8 Pentagon contractors were ...
86,"ALGIERS, Algeria (CNN) -- Rescuers are sifting...",Two bombs explode in Algerian capital near gov...,4cf51ce9372dff8ff7f44f098eab1c1d7569af7a,Rescuers are sifting through the rubble of the...,"ALGIERS, Algeria (CNN) -- Rescuers are sifting...","ALGIERS, Algeria (CNN) -- Rescuers are sifting...",A powerful bomb destroyed the United Nations h...,A bomb attack has caused significant damage to...,A powerful bomb by al Qaeda destroyed the UN h...
958,(CNN) -- The crib in Ellen Darcy's Boston home...,"Guatemala and Vietnam say corruption, baby-ste...",7551503e7e57bd519913b0df90ca1e80d5305b05,Guatemala has announced it will conduct a case...,(CNN) -- The crib in Ellen Darcy's Boston home...,(CNN) -- The crib in Ellen Darcy's Boston home...,Guatemala is conducting a review of all pendin...,Guatemala is conducting a review of all pendin...,"Guatemala reviews pending foreign adoptions, m..."


In [51]:
cnn_summaries['gpt_gpt_summ'] = cnn_summaries.gpt_gpt_summ_full.progress_apply(lambda x: compress_summary(x))
print(len(cnn_summaries[cnn_summaries['gpt_gpt_summ']!='No summary possible.']))

bar:  11%|█▏        | 28/247 [01:45<14:07,  3.87s/it]

ERROR


bar:  33%|███▎      | 81/247 [04:28<09:52,  3.57s/it]

ERROR


bar:  36%|███▋      | 90/247 [05:34<11:37,  4.44s/it]

ERROR
ERROR


bar:  40%|████      | 99/247 [06:43<13:47,  5.59s/it]

ERROR


bar:  40%|████      | 100/247 [06:48<13:40,  5.58s/it]

ERROR


bar:  62%|██████▏   | 152/247 [10:01<04:03,  2.57s/it]

ERROR


bar:  76%|███████▌  | 187/247 [11:59<01:39,  1.65s/it]

ERROR


bar:  78%|███████▊  | 193/247 [12:15<02:36,  2.90s/it]

ERROR


bar:  80%|███████▉  | 197/247 [13:01<05:48,  6.97s/it]

ERROR


bar: 100%|██████████| 247/247 [15:11<00:00,  3.69s/it]

247





In [52]:
cnn_summaries = cnn_summaries[cnn_summaries['gpt_gpt_summ']!='No summary possible.']

In [54]:
print(cnn_summaries['occams_gpt_summ'].str.len().describe())
print(cnn_summaries['gpt_gpt_summ'].str.len().describe())

count    247.000000
mean     208.291498
std       32.692893
min      114.000000
25%      188.000000
50%      212.000000
75%      233.000000
max      257.000000
Name: occams_gpt_summ, dtype: float64
count    247.000000
mean     209.153846
std       32.955634
min       97.000000
25%      187.000000
50%      214.000000
75%      238.000000
max      257.000000
Name: gpt_gpt_summ, dtype: float64


In [55]:
cnn_summaries.to_parquet('datasets/cnn_experiment.parquet')

# BONUS: Pure Abstractive

In [7]:
%%time
# Recomputes the target length from the sampled documents' human summaries.
# NOTE(review): this overwrites the notebook-level `target_length` set when the
# data was filtered, and compress_summary reads that same name at call time --
# confirm the execution order that produced the saved results.
num_units, target_length = get_target_length(cnn_summaries["highlights"])

# Prompt adapted from Extractive Summarization via ChatGPT for Faithful Summary Generation (Zhang et al)
abstractive_prompt = """You are an abstractive summarizer that follows the output pattern. Please write a summary for the following text in {char_budget} characters or less.

Text:
{text}

Summary:
""".format(char_budget=target_length, text='{text}')

cnn_summaries['gpt_a_summ'] = cnn_summaries.article.progress_apply(lambda x: gpt_summarize(x, abstractive_prompt))
print(len(cnn_summaries[cnn_summaries['gpt_a_summ']!='No summary possible.']))


bar:   1%|          | 2/247 [00:31<1:03:19, 15.51s/it]

ERROR


bar:  17%|█▋        | 43/247 [04:14<41:04, 12.08s/it] 

ERROR


bar:  23%|██▎       | 58/247 [05:18<14:04,  4.47s/it]

ERROR


bar: 100%|██████████| 247/247 [19:54<00:00,  4.84s/it]

244
CPU times: user 16.3 s, sys: 189 ms, total: 16.5 s
Wall time: 19min 54s





In [9]:
def retry_gpt(row):
    """Re-request the purely abstractive summary for a row whose call failed.

    This reuses the name ``retry_gpt`` and therefore replaces the earlier
    definition that operated on the 'gpt_gpt_summ' column.

    Args:
        row: Mapping with 'gpt_a_summ' and 'article' entries.

    Returns:
        The existing 'gpt_a_summ' value, unless it is the failure sentinel
        'No summary possible.', in which case a fresh ``gpt_summarize`` result
        for the row's article is returned.
    """
    current = row['gpt_a_summ']
    if current != 'No summary possible.':
        return current
    return gpt_summarize(row['article'], abstractive_prompt)
    
print(len(cnn_summaries[cnn_summaries['gpt_a_summ']!='No summary possible.']))
cnn_summaries['gpt_a_summ'] = cnn_summaries.progress_apply(lambda x: retry_gpt(x), axis=1)
print(len(cnn_summaries[cnn_summaries['gpt_a_summ']!='No summary possible.']))

244


bar: 100%|██████████| 247/247 [00:08<00:00, 28.80it/s]

247





In [18]:
cnn_summaries = cnn_summaries.rename(columns={"gpt_a_summ": "gpt_a_summ_full"})

In [22]:

compress_prompt = """Make this summary less than {target_length} characters, following the output format:

Text:
{text}

Summary:
""".format(text='{text}', target_length=target_length)

def compress_summary(x, max_attempts=25):
    """Ask GPT to shorten ``x`` until it is at most ``target_length`` characters.

    Args:
        x: Summary text to compress.
        max_attempts: Upper bound on API calls for this summary. Without a
            bound the loop never ends if the API keeps failing or the model
            never returns a short enough text.

    Returns:
        The first candidate no longer than ``target_length``; ``x`` itself if
        it already fits. If ``max_attempts`` is exhausted, the most recent
        candidate is returned even though it is still too long, and a message
        is printed.
    """
    attempts = 0
    while len(x) > target_length:
        if attempts >= max_attempts:
            print("ERROR: compress_summary gave up after", max_attempts, "attempts")
            break
        attempts += 1
        new_x = gpt_summarize(x, compress_prompt)
        # A failed call keeps the previous text so the next attempt retries it.
        if new_x != 'No summary possible.':
            # Drop character-count annotations the model sometimes appends.
            x = re.sub(r"\s\(\d+ characters\)|Characters\)", "", new_x)
    return x

cnn_summaries['gpt_a_summ'] = cnn_summaries.gpt_a_summ_full.progress_apply(lambda x: compress_summary(x))
print(len(cnn_summaries[cnn_summaries['gpt_a_summ']!='No summary possible.']))

bar:   9%|▊         | 21/247 [01:59<16:07,  4.28s/it]

ERROR


bar:  39%|███▉      | 96/247 [07:32<10:43,  4.26s/it]

ERROR


bar:  48%|████▊     | 119/247 [09:33<07:49,  3.67s/it]

ERROR


bar:  54%|█████▍    | 134/247 [10:32<05:09,  2.74s/it]

ERROR


bar:  55%|█████▌    | 136/247 [11:12<18:49, 10.17s/it]

ERROR
ERROR


bar:  60%|█████▉    | 148/247 [12:19<05:08,  3.12s/it]

ERROR


bar:  69%|██████▉   | 170/247 [14:09<06:42,  5.23s/it]

ERROR


bar:  83%|████████▎ | 206/247 [17:35<02:33,  3.75s/it]

ERROR


bar: 100%|██████████| 247/247 [20:54<00:00,  5.08s/it]

247





In [23]:
cnn_summaries.to_parquet('datasets/cnn_experiment.parquet')

In [24]:
print(cnn_summaries['gpt_a_summ'].str.len().describe())

count    247.000000
mean     213.453441
std       30.756532
min      109.000000
25%      193.000000
50%      217.000000
75%      240.000000
max      257.000000
Name: gpt_a_summ, dtype: float64
