## Clickbait spoiling notebook

In [38]:
import pandas as pd
import os
from dotenv import load_dotenv
from openai import OpenAI
import tiktoken
from typing import List
import string
import re
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# VARIABLES
# load_dotenv() is expected to populate the process environment from a local
# .env file; the OpenAI key is then read from the OPEN_AI_KEY variable.
# os.getenv returns None when the variable is not set.
load_dotenv()
api_key = os.getenv("OPEN_AI_KEY")

In [3]:
def calculate_cost(
    prompt: str,
    data: str,
    out: str,
    model: str = "gpt-4o-mini",
    cost_per_1M_input_tokens: float = 0.15,
    cost_per_1M_output_tokens: float = 0.60,
) -> float:
    """
    Estimates the cost of a request to an OpenAI model from the prompt, data, and output.

    The request text is ``str(prompt) + str(data)`` and the response text is
    ``str(out)``; both are tokenized with the tiktoken encoding for ``model``.
    Only the raw text is counted, so any per-message overhead added by the
    chat API is not part of the estimate. A short summary is printed as a
    side effect.

    Args:
        prompt (str): prompt sent to the LLM
        data (str): additional data sent to the LLM.
        out (str): response received from the LLM.
        model (str): model identifier used to determine the tokenizer, defaults to "gpt-4o-mini"
        cost_per_1M_input_tokens (float): price in USD per 1 million input tokens,
            defaults to 0.15. Override it when ``model`` is priced differently.
        cost_per_1M_output_tokens (float): price in USD per 1 million output tokens,
            defaults to 0.60. Override it when ``model`` is priced differently.

    Returns:
        float: total cost of the tokens used in USD
    """
    tokenizer = tiktoken.encoding_for_model(model)

    input_tokens = len(tokenizer.encode(str(prompt) + str(data)))
    output_tokens = len(tokenizer.encode(str(out)))

    # Prices are quoted per 1 million tokens, hence the division by 10**6.
    input_cost = (input_tokens / 10**6) * cost_per_1M_input_tokens
    output_cost = (output_tokens / 10**6) * cost_per_1M_output_tokens
    total_cost = input_cost + output_cost

    print(f"Input tokens: {input_tokens}")
    print(f"Output tokens: {output_tokens}")
    print(f"Total tokens: {input_tokens + output_tokens}")
    print(f"Cost: ${total_cost:.5f}")
    return total_cost

In [4]:
# Load the spoiling dataset and list the distinct values of its "tags" column.
spoil_df = pd.read_csv("../data/spoiling_data.csv")
print(spoil_df.shape)
tags = spoil_df["tags"].unique().tolist()
print(tags)

(3358, 6)
['passage', 'phrase', 'multi']


In [5]:
# Preview the first rows to check the column layout.
spoil_df.head()

Unnamed: 0,targetTitle,targetParagraphs,humanSpoiler,spoiler,tags,spoilerPositions
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",It’ll be just like old times this weekend for ...,They Threw A Football,how about that morning we go throw?,passage,"[[[3, 151], [3, 186]]]"
1,Hole In Ozone Layer Expected To Make Full Reco...,2070 is shaping up to be a great year for Moth...,2070,2070,phrase,"[[[0, 0], [0, 4]]]"
2,Intellectual Stimulation Trumps Money For Empl...,"Despite common belief, money isnt the key to e...",Intellectual stimulation,intellectual stimulation,phrase,"[[[1, 186], [1, 210]]]"
3,Heres what happens if your Apple AirPods get l...,One of the biggest surprise announcements at A...,No,"Apple says that if AirPods are lost or stolen,...",passage,"[[[4, 0], [4, 110]]]"
4,The Reason Why Gabor Kiraly Wears THOSE Tracki...,"June 14th 2016 3.3K Shares, They may look like...",Its a lucky charm to him,"The more good games I had in them, the more I ...",passage,"[[[5, 0], [5, 64]]]"


In [6]:
# Shared OpenAI client; the batch helper functions below use this module-level name.
client = OpenAI(api_key=api_key)

In [7]:
# Character length of every article body, followed by its mean and median.
spoil_df["paragraphsLength"] = spoil_df["targetParagraphs"].apply(len)
mean_length, median_length = (
    spoil_df["paragraphsLength"].mean(),
    spoil_df["paragraphsLength"].median(),
)

print("Mean length of the paragraph:", mean_length)
print("Median length of the paragraph:", median_length)

Mean length of the paragraph: 2868.0226325193566
Median length of the paragraph: 1973.0


### Direct Prompt

In [166]:
# Show the row(s) whose article body is the shortest in the dataset.
spoil_df.loc[spoil_df["paragraphsLength"] == spoil_df["paragraphsLength"].min()]

Unnamed: 0,targetTitle,targetParagraphs,humanSpoiler,spoiler,tags,spoilerPositions,paragraphsLength
696,The Grand Tour launch date,November 18 The Grand Tour: Launch Date,November 18,November 18,phrase,"[[[0, 0], [0, 11]]]",39


In [174]:
# Pick, for every tag, the first article shorter than 1000 characters so it
# can serve as a compact few-shot example.
pd.set_option('display.max_colwidth', None)
tags = spoil_df["tags"].unique()
examples = {}

for tag in tags:
    short_with_tag = (spoil_df["tags"] == tag) & (spoil_df["paragraphsLength"] < 1000)
    spoiler = spoil_df.loc[short_with_tag].iloc[0]
    examples[tag] = dict(
        tag=spoiler["tags"],
        spoiler=spoiler["spoiler"],
        title=spoiler["targetTitle"],
        paragraph=spoiler["targetParagraphs"],
        human_spoiler=spoiler["humanSpoiler"],
        length=spoiler["paragraphsLength"],
    )

print(examples["multi"]["spoiler"])
pd.set_option('display.max_colwidth', 50)

pravastatin, paroxetine


In [217]:
# Few-shot instruction prompt: task description, the meaning of each tag, the
# expected answer format, and one worked example per tag.
# NOTE(review): the text is kept byte-identical because any edit changes the
# model input and the token count. Known wording issues to fix in a
# deliberate prompt revision:
#   - "parahraphs" is a typo for "paragraphs";
#   - "directly answer or the topic" is missing a word;
#   - the third example is labelled "Example 2:" a second time;
#   - the example Input/Output objects omit commas between fields, so they
#     are not valid JSON although the prompt asks for a JSON object.
prompt = """
You are a model designed to generate concise spoilers from articles. Your task is to analyze the main question or topic posed by the article’s title and generate a spoiler based on the content provided in the article's parahraphs. Your response must include a JSON object with keys "humanSpoiler", and "spoiler".
Use "targetTitle" to determine the central question or topic the spoiler should address. Review "targetParagraphs" to find the most relevant details that directly answer or the topic stated in the title. Consider provided tag: 
"phrase": Short, direct answer or single phrase.
"passage": More detailed, informative spoiler.
"multi": Multiple sentences or complex information for a full response, provide a longer, structured spoiler.

Answer:{
"humanSpoiler": <A model-generated spoiler>,
"spoiler": <A concise, extracted from the article’s content but remaining logical>
}

Example 1:
Input{ 
"tag":"phrase"
"targetTitle":"Teen Mom 2 Star Jenelle Evans Reveals Sex Of Her Second Child"
"targetParagraphs":"Teen Mom 2 star Jenelle Evans took to Twitter and Instagram Monday (Feb. 3) to announce that she and boyfriend Nathan Griffin are expecting a baby boy., Evans has faced criticism for the pregnancy, largely because she does not currently have custody of her first child, 4-year-old Jace. In the Season 5 premiere of Teen Mom 2, Evans had an abortion, which led to even more speculation about whether or not she is ready to have a second child. The reality TV star has a long history battling drug addiction, and has been arrested several times., Evans told In Touch that she had the abortion at the beginning of Season 5 because she was too skinny and unhealthy to continue with the unexpected pregnancy while in recovery for heroin addiction., Evans is legally married to Courtland Rogers, though the couple is separated. The father of her first son, Jace, is ex-boyfriend Andrew Lewis."}
Output{
"humanSpoiler":"Boy"
"spoiler":"boy"}

Example 2:
Input{
"tag":"passage"
"targetTitle":"You’ll Never Believe What This Family Saw in the Sky Outside Their House in Finland."
"targetParagraphs":"It was cold and very foggy, the temperature was around -10 degrees Celsius, said Hänninen. When the clouds began to break, there were rainbow colours in the sky and a halo spanning 360 degrees! It was worth taking a picture or two. If I ever stepped outside and saw this in my backyard, I might think the aliens were invading!"}
Output{
"humanSpoiler":"Sun Halos Which are caused by ice crystals in cirrostratus clouds."
"spoiler":"rainbow colours in the sky and a halo spanning 360 degrees"}

Example 2:
Input{
"tag":"multi"
"targetTitle":"Mixing These Two Common Medications Could Be Life-Threatening"
"targetParagraphs":"It’s common knowledge that mixing your medication isn’t recommended without a doctor’s permission. However, an estimated 1 million people in the United States are on two drugs that together can cause a seriously adverse reaction., This video from Business Insider recalls a study from Harvard, Stanford, and Vanderbilt University on a specific drug interaction. Their study looked at the interaction between pravastatin – a cholesterol-reducing drug – and paroxetine, a widely used antidepressant known under the brand name of Seroxat. If taken together, they can cause complications that are so severe, they could become life-threatening., Watch the video below to find out why."}
Output{
"humanSpoiler":"Pravastatin and Paroxetine (AKA Seroxat) iflscience.com"
"spoiler":"pravastatin, paroxetine"}
"""

In [179]:
# Rough cost estimate: price one request that contains only the instruction
# prompt (data="" means the article text itself is not counted) and a short
# sample answer, then scale it up.
# NOTE(review): 4000 looks like a rounded-up request count (the dataset shape
# printed above is 3358 rows) — confirm the intended multiplier.
cost_of_one = calculate_cost(prompt=prompt, data="", out="Intellectual stimulation")
print(f"Total cost: ${cost_of_one * 4000}")

Input tokens: 761
Output tokens: 3
Total tokens: 764
Cost: $0.00012
Total cost: $0.46379999999999993


In [42]:
# Draw the same number of articles per tag (fixed seed for repeatability) and
# stack them into one evaluation frame with a fresh 0..n-1 index.
num_samples = 10
passage_sample, phrase_sample, multi_sample = (
    spoil_df.loc[spoil_df['tags'] == tag_name].sample(n=num_samples, random_state=1)
    for tag_name in ('passage', 'phrase', 'multi')
)
test_df = pd.concat([passage_sample, phrase_sample, multi_sample], ignore_index=True)
test_df.shape

(30, 7)

### Call the model and save the response to a file

In [181]:
def create_batch_request_to_open_ai_direct_prompt(
    list_of_prompts: List[str],
    name_of_request_file: str,
    description_of_request: str,
    temperature: float = 1, #between [0,2]. Default = 1, higher temperature leads to more random, while lower to more deterministic,
    model: str = "gpt-4o-mini"
):
    """
    Writes one chat-completion request per prompt to a JSONL file, uploads it,
    and creates a batch job for it.

    Each prompt becomes a request with custom_id "request<i>", where <i> is the
    prompt's position in ``list_of_prompts``. The request file is written to
    ``../request_files_openai/<name_of_request_file>.jsonl``; that directory
    must already exist.

    Args:
        list_of_prompts (List[str]): user messages, one per request.
        name_of_request_file (str): file name (without extension) of the JSONL request file.
        description_of_request (str): stored in the batch metadata under "description".
        temperature (float): sampling temperature placed in every request body, defaults to 1.
        model (str): model name placed in every request body, defaults to "gpt-4o-mini".

    Returns:
        The object returned by ``client.batches.create`` for the new batch.
    """
    request_path = f"../request_files_openai/{name_of_request_file}.jsonl"

    batch_requests = [{
        "custom_id": f"request{i}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {"model": model,
                 "messages": [
                     {"role": "system", "content": "assistant"},
                     {"role": "user", "content": prompt__}],
                 "max_tokens": 1000,
                 "temperature": temperature}
    } for i, prompt__ in enumerate(list_of_prompts)]

    # One JSON document per line, as required for a JSONL batch input file.
    with open(request_path, "w") as outfile:
        for request__ in batch_requests:
            json.dump(request__, outfile)
            outfile.write('\n')

    # Upload inside a context manager so the handle is closed even if the
    # upload raises (the handle used to be opened inline and never closed).
    with open(request_path, "rb") as request_file:
        batch_input_file = client.files.create(
            file=request_file,
            purpose="batch"
        )

    return client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
          "description": description_of_request
        }
    )

In [20]:
def save_direct_prompt_to_file_and_get_predictions(
    result_file_id: str,
    result_file_name: str,
    save_to_file: bool = True
):
    """
    Reads the model answers from a batch result file, optionally downloading it first.

    Args:
        result_file_id (str): id of the batch output file; only used when
            ``save_to_file`` is True.
        result_file_name (str): when ``save_to_file`` is True, a bare name that is
            expanded to ``../request_files_openai/<name>.jsonl`` and written;
            otherwise the path of an existing JSONL result file, used as given.
        save_to_file (bool): download the result file before parsing, defaults to True.

    Returns:
        list: one entry per non-blank line of the file, in file order. The entry
        is the message content as a string, or ``0`` when the line could not be
        parsed or does not contain a completion.
    """
    if save_to_file:
        result = client.files.content(result_file_id).content
        result_file_name = f"../request_files_openai/{result_file_name}.jsonl"

        with open(result_file_name, 'wb') as file:
            file.write(result)

    pred = []
    errors = 0
    with open(result_file_name, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines are not requests; skipping them keeps positions aligned.
                continue
            try:
                # The parsing and indexing are what can actually fail (malformed
                # JSON, or a failed request whose "response" is null), so they
                # must live inside the try block.
                prediction_str = json.loads(line)['response']['body']['choices'][0]['message']['content']
                pred.append(str(prediction_str))
            except (json.JSONDecodeError, KeyError, IndexError, TypeError):
                print("there was problem with this request. Output:")
                print(line)
                print()
                # Placeholder keeps the list aligned with the order of requests.
                pred.append(0)
                errors += 1
    print(f"number of errors = {errors}")
    return pred

In [183]:
# One full prompt per test article: the shared instructions followed by the
# article's tag, title and body.
prompt_1 = [
    prompt + f'Article to spoil: "tag":{tag_value}. "targetTitle":{title}. "targetParagraphs":{paragraphs}'
    for tag_value, title, paragraphs in zip(
        test_df["tags"], test_df["targetTitle"], test_df["targetParagraphs"]
    )
]

In [185]:
# Submit the test prompts as a batch job. Re-running this cell writes the
# request file again and creates another batch.
test_batch = create_batch_request_to_open_ai_direct_prompt(
    list_of_prompts = prompt_1,
    name_of_request_file = "request_spoiling_test_2",
    description_of_request = "request_spoiling_12_11_2024",
)

In [212]:
# Re-fetch the batch to read its current state; the last expression displays
# the id of its output file.
# NOTE(review): presumably output_file_id stays None until the batch has
# completed — confirm before using it in the cells below.
in_progress_batch = client.batches.retrieve(test_batch.id)
in_progress_batch.output_file_id

'file-VB4AAmjC4hLfPvwsIedldiO7'

In [213]:
# Download the raw bytes of the batch output file and print them for a quick look.
# NOTE(review): this prints the whole response; consider printing only a
# prefix to keep the notebook output small.
file_response = client.files.content(in_progress_batch.output_file_id).content
print(file_response)

b'{"id": "batch_req_67339f1283f8819092af387450a2f4b0", "custom_id": "request0", "response": {"status_code": 200, "request_id": "03d2691eee91c5f2885c0e36ea94ad44", "body": {"id": "chatcmpl-ASpfAq9vnY73RY2rR3EkxopjevpXK", "object": "chat.completion", "created": 1731435140, "model": "gpt-4o-mini-2024-07-18", "choices": [{"index": 0, "message": {"role": "assistant", "content": "{\\n  \\"humanSpoiler\\": \\"30% earnings increase\\",\\n  \\"spoiler\\": \\"nearly 30 percent rise in earnings\\"\\n}", "refusal": null}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 989, "completion_tokens": 29, "total_tokens": 1018, "prompt_tokens_details": {"cached_tokens": 0, "audio_tokens": 0}, "completion_tokens_details": {"reasoning_tokens": 0, "audio_tokens": 0, "accepted_prediction_tokens": 0, "rejected_prediction_tokens": 0}}, "system_fingerprint": "fp_0ba0d124f1"}}, "error": null}\n{"id": "batch_req_67339f129f308190ae79b50a9af948ec", "custom_id": "request1", "response": {"statu

In [214]:
# Download the batch output, store it under ../request_files_openai/ and
# collect the model answers as a list of strings.
pred = save_direct_prompt_to_file_and_get_predictions(
    result_file_id = in_progress_batch.output_file_id,
    result_file_name = "spoiling_test_2"
)

number of errors = 0


### Spoiling analysis from file

In [22]:
# Re-read the answers from the previously saved result file without calling
# the API. With save_to_file=False the file id is not used and the path is
# taken as given, so the full relative path is passed here.
result_file_name = "../request_files_openai/spoiling_test_2.jsonl"
pred = save_direct_prompt_to_file_and_get_predictions(
    result_file_id = None,
    result_file_name = result_file_name,
    save_to_file = False
)

number of errors = 0


In [None]:
def extract_data_regex(pred):
    """
    Extracts the "humanSpoiler" and "spoiler" values from raw model answers with regexes.

    For each answer the quoted value following the key is taken. If the key is
    present but not in the quoted ``"key": "value"`` form, everything after the
    key name is returned instead as a best-effort fallback. Values containing a
    double quote are cut at that quote.

    Args:
        pred: iterable of model answers. Entries that are not strings (for
            example the ``0`` placeholder used for failed requests) yield a row
            of missing values instead of raising.

    Returns:
        pd.DataFrame: one row per entry of ``pred``, in order, with columns
        "humanSpoiler" and "spoiler"; missing values are None.
    """
    human_spoiler_pattern = r'"humanSpoiler"\s*:\s*"([^"]+)"'
    spoiler_pattern = r'"spoiler"\s*:\s*"([^"]+)"'
    data = []

    for text in pred:
        human_spoiler_value = spoiler_value = None

        if isinstance(text, str):
            human_spoiler_match = re.search(human_spoiler_pattern, text)
            if human_spoiler_match:
                human_spoiler_value = human_spoiler_match.group(1)
            else:
                # Fallback: keep whatever follows the bare key name.
                start_index = text.find('humanSpoiler')
                if start_index != -1:
                    human_spoiler_value = text[start_index + len('humanSpoiler'):]

            spoiler_match = re.search(spoiler_pattern, text)
            if spoiler_match:
                spoiler_value = spoiler_match.group(1)
            else:
                # Lower-case "spoiler" does not occur inside "humanSpoiler",
                # so this only finds the stand-alone key.
                start_index = text.find('spoiler')
                if start_index != -1:
                    spoiler_value = text[start_index + len('spoiler'):]

        data.append({
            "humanSpoiler": human_spoiler_value,
            "spoiler": spoiler_value
        })

    # Explicit columns keep the schema stable even when pred is empty.
    return pd.DataFrame(data, columns=["humanSpoiler", "spoiler"])


In [32]:
def extract_data(pred):
    """
    Parses the first ``{...}`` object in every model answer as JSON.

    Args:
        pred: iterable of model answers (strings). Non-string entries are
            treated like answers without a JSON object.

    Returns:
        pd.DataFrame: one row per entry of ``pred``, in the same order, with
        columns "humanSpoiler" and "spoiler". An answer with no JSON object,
        invalid JSON, or a missing key gets None in the affected column(s), so
        rows stay positionally aligned with the inputs.
    """
    data = []
    for spoiler in pred:
        # Non-greedy match: the object is assumed to be flat; a "}" inside a
        # value would truncate it and surface as invalid JSON below.
        match = re.search(r'\{.*?\}', spoiler, re.DOTALL) if isinstance(spoiler, str) else None
        json_spoiler = {}
        if match:
            try:
                json_spoiler = json.loads(match.group(0))
            except json.JSONDecodeError:
                print("Invalid JSON found.")
        else:
            print("No match found.")
        data.append({
            "humanSpoiler": json_spoiler.get("humanSpoiler"),
            "spoiler": json_spoiler.get("spoiler")
        })
    # Previously returned pd.DataFrame(json_data), a name that is never
    # defined in this function.
    return pd.DataFrame(data, columns=["humanSpoiler", "spoiler"])

In [36]:
# Parse the answers and count the rows in which any extracted field is missing.
extracted_df = extract_data(pred)
na_rows = extracted_df[extracted_df.isna().any(axis=1)]
print("Extracted data size: ", extracted_df.shape)
print("Rows with NaN values: ", na_rows.shape)

Extracted data size:  (30, 2)
Rows with NaN values:  (0, 2)


### Calculating metrics
- BLEU
- BERTScore (BSc)
- METEOR (MET)
- Cosine Similarity

#### Cosine Similarity for spoiler, humanSpoiler

In [53]:
# Suffix the columns so reference ("_org") and model ("_ext") values can sit
# side by side, then join them column-wise.
# NOTE: both renames mutate the frames in place, so cells that expect the
# original column names in test_df / extracted_df no longer work after this
# cell. The axis=1 concat aligns rows on the index, so both frames must share
# the same 0..n-1 index and ordering.
extracted_df.rename(columns={'humanSpoiler': 'humanSpoiler_ext', 'spoiler': 'spoiler_ext'}, inplace=True)
test_df.rename(columns={'humanSpoiler': 'humanSpoiler_org', 'spoiler': 'spoiler_org'}, inplace=True)
cosine_df = pd.concat([test_df[['humanSpoiler_org', 'spoiler_org']], extracted_df[['humanSpoiler_ext', 'spoiler_ext']]], axis=1)

In [65]:
vectorizer = TfidfVectorizer()


def calculate_cosine_similarity_row(row, columns):
    """
    Returns the TF-IDF cosine similarity between two text fields of one row.

    The shared module-level ``vectorizer`` is re-fitted on just the two texts
    of the row, so the vocabulary and weights are specific to that pair.

    Args:
        row: mapping-like row (e.g. a DataFrame row) holding the texts.
        columns: sequence whose first two items name the fields to compare.

    Returns:
        The similarity score of the two texts.
    """
    text_pair = [row[columns[0]], row[columns[1]]]
    pair_matrix = vectorizer.fit_transform(text_pair)
    similarity = cosine_similarity(pair_matrix[0], pair_matrix[1])
    return similarity[0][0]

In [69]:
# Row-wise similarity between the model output and the reference, once for the
# human-style spoiler and once for the extracted spoiler. The `columns`
# keyword is forwarded by DataFrame.apply to the row function.
cosine_df['humanSpoiler_cos_sim'] = cosine_df.apply(calculate_cosine_similarity_row, axis=1, columns=['humanSpoiler_ext', 'humanSpoiler_org'])
cosine_df['spoiler_cos_sim'] = cosine_df.apply(calculate_cosine_similarity_row, axis=1, columns=['spoiler_ext', 'spoiler_org'])

In [71]:
# Inspect the first rows of references, predictions and their similarity scores.
cosine_df.head()

Unnamed: 0,humanSpoiler_org,spoiler_org,humanSpoiler_ext,spoiler_ext,humanSpoiler_cos_sim,spoiler_cos_sim
0,Nearly 30% rise in earnings sending shares up ...,"nearly 30 percent rise in earnings, sending sh...",30% earnings increase,nearly 30 percent rise in earnings,0.190663,0.6194
1,"On RSVP under entree options, Child (12 and un...","the caption, I\ll have the 10-year-old, please...",A menu mistake on a wedding RSVP card hilariou...,children have apparently been added to the menu,0.105992,0.176728
2,No,But maybe the biggest thing is that if it were...,Rap music has maintained its cultural identity...,Rap music has avoided becoming predominantly w...,0.0,0.406965
3,Hes not going to star in the Freddie Mercury b...,it grows and grows until its Dan Radcliffe is ...,Neither Daniel Radcliffe nor Sacha Baron Cohen...,Sacha Baron Cohen will no longer star in the f...,0.16044,0.105992
4,Its not. Attached here.,Ford looked as good during his carpentry phase...,Harrison Ford looked great as a carpenter befo...,Fans will be delighted to know that Ford looke...,0.0,0.76892
