## Clickbait spoiling notebook

In [50]:
import pandas as pd
import os
from dotenv import load_dotenv
from openai import OpenAI
import tiktoken

In [8]:
# VARIABLES
load_dotenv()
api_key = os.getenv("OPEN_AI_KEY")

In [None]:
def calculate_cost(prompt: str, data: str, out: str, model: str = "gpt-4o-mini") -> float:
    """
    Calculates the cost of a request to a OPEN AI model based on the prompt, data, and output

    Args:
        prompt (str): prompt sent to the LLM
        data (str): additional data sent to the LLM.
        out (str): response received from the LLM.
        model (str): model identifier used to determine the tokenizer, defaults to "gpt-4o-mini"

    Returns:
        float: total cost of the tokens used in USD
    """
    tokenizer = tiktoken.encoding_for_model(model)  
    request = str(prompt) + str(data)
    response = str(out)

    request_tokens = tokenizer.encode(request)
    response_tokens = tokenizer.encode(response)

    input_tokens = len(request_tokens)
    output_tokens = len(response_tokens)

    # costs per 1 million tokens
    cost_per_1M_input_tokens = 0.15  # $0.15 per 1M input tokens
    cost_per_1M_output_tokens = 0.60  # $0.60 per 1M output tokens

    input_cost = (input_tokens / 10**6) * cost_per_1M_input_tokens
    output_cost = (output_tokens / 10**6) * cost_per_1M_output_tokens
    total_cost = input_cost + output_cost

    print(f"Input tokens: {input_tokens}")
    print(f"Output tokens: {output_tokens}")
    print(f"Total tokens: {input_tokens + output_tokens}")
    print(f"Cost: ${total_cost:.5f}")
    return total_cost

In [104]:
spoil_df = pd.read_csv("../data/spoiling_data.csv")
print(spoil_df.shape)
tags = spoil_df["tags"].unique().tolist()
print(tags)

(3358, 6)
['passage', 'phrase', 'multi']


In [None]:
spoil_df.head()

Unnamed: 0,targetTitle,targetParagraphs,humanSpoiler,spoiler,tags,spoilerPositions
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",It’ll be just like old times this weekend for ...,They Threw A Football,how about that morning we go throw?,passage,"[[[3, 151], [3, 186]]]"
1,Hole In Ozone Layer Expected To Make Full Reco...,2070 is shaping up to be a great year for Moth...,2070,2070,phrase,"[[[0, 0], [0, 4]]]"
2,Intellectual Stimulation Trumps Money For Empl...,"Despite common belief, money isnt the key to e...",Intellectual stimulation,intellectual stimulation,phrase,"[[[1, 186], [1, 210]]]"
3,Heres what happens if your Apple AirPods get l...,One of the biggest surprise announcements at A...,No,"Apple says that if AirPods are lost or stolen,...",passage,"[[[4, 0], [4, 110]]]"
4,The Reason Why Gabor Kiraly Wears THOSE Tracki...,"June 14th 2016 3.3K Shares, They may look like...",Its a lucky charm to him,"The more good games I had in them, the more I ...",passage,"[[[5, 0], [5, 64]]]"


In [11]:
client = OpenAI(api_key=api_key)

completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": "Write a haiku about recursion in programming."
        }
    ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='In code’s endless loop,  \nFunctions call themselves again,  \nDepths of logic reign.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None)


In [107]:
spoil_df["paragraphsLength"] = spoil_df["targetParagraphs"].apply(lambda x: len(x))
mean_length = spoil_df["paragraphsLength"].mean()
median_length = spoil_df["paragraphsLength"].median()

print("Mean length of the paragraph:", mean_length)
print("Median length of the paragraph:", median_length)

Mean length of the paragraph: 2868.0226325193566
Median length of the paragraph: 1973.0


In [61]:
spoil_df[spoil_df["paragraphsLength"] == min(spoil_df["paragraphsLength"])]

Unnamed: 0,targetTitle,targetParagraphs,humanSpoiler,spoiler,tags,spoilerPositions,paragraphsLength
696,The Grand Tour launch date,November 18 The Grand Tour: Launch Date,November 18,November 18,phrase,"[[[0, 0], [0, 11]]]",39


In [137]:
pd.set_option('display.max_colwidth', None)
examples = {}
tags = spoil_df["tags"].unique()

for tag in tags:
    spoiler = spoil_df.loc[(spoil_df["tags"] == tag) & (spoil_df["paragraphsLength"] < 1000)].iloc[0]
    examples[tag] = {
        "tag": spoiler["tags"],
        "title": spoiler["targetTitle"],
        "paragraph": spoiler["targetParagraphs"],
        "human_spoiler": spoiler["humanSpoiler"],
        "length": spoiler["paragraphsLength"]
    }

print(examples["multi"]["human_spoiler"])
pd.set_option('display.max_colwidth', 50)

Pravastatin and Paroxetine (AKA Seroxat) iflscience.com


In [133]:
prompt = """
You are the model which is used to generate the spoiler from the article. Spoiler should show the answer for the question stated in the title or should describe the topic mentioned in the title based on the article body. You have "targetTitle" which is the title of the article and the "targetParagraph" which is the body of the article. The spoilers are tagged by: "phrase", "passage", "multi".
Example 1: 
"tag":"phrase"
"targetTitle":"Taylor Swift Reveals a Favorite Reality TV Show! You Wont Believe Which One It Is"
"targetParagraphs":"Taylor Swift just gave us another reason to love her. The Grammy winner revealed one of her guilty pleasures last night while receiving the Taylor Swift Award (yes, the award is named after her) during the BMI Pop Awards at the Beverly Wilshire Hotel., After praising the night\s BMI Icon Award winners, songwriters Cynthia Weil and Barry Mann, Swift said, I recently have had a lot of time off. I\ve been watching a lot of reality TV., Barry and Cynthia\s daughter Jenn is the couples therapist on a show called Couples Therapy that I watch a lot and I was really really excited to meet her, she said."
"humanSpoiler":"Couples Therapy"

Example 2:
"tag":"passage"
"targetTitle":"You’ll Never Believe What This Family Saw in the Sky Outside Their House in Finland."
"targetParagraphs":"It was cold and very foggy, the temperature was around -10 degrees Celsius, said Hänninen. When the clouds began to break, there were rainbow colours in the sky and a halo spanning 360 degrees! It was worth taking a picture or two. If I ever stepped outside and saw this in my backyard, I might think the aliens were invading!"
"humanSpoiler":"Sun Halos Which are caused by ice crystals in cirrostratus clouds."

Example 2:
"tag":"multi"
"targetTitle":"Mixing These Two Common Medications Could Be Life-Threatening"
"targetParagraphs":"It’s common knowledge that mixing your medication isn’t recommended without a doctor’s permission. However, an estimated 1 million people in the United States are on two drugs that together can cause a seriously adverse reaction., This video from Business Insider recalls a study from Harvard, Stanford, and Vanderbilt University on a specific drug interaction. Their study looked at the interaction between pravastatin – a cholesterol-reducing drug – and paroxetine, a widely used antidepressant known under the brand name of Seroxat. If taken together, they can cause complications that are so severe, they could become life-threatening., Watch the video below to find out why."
"humanSpoiler":"Pravastatin and Paroxetine (AKA Seroxat) iflscience.com"

Article to spoil:
"tag":{}
"targetParagraphs":{}
"targetParagraphs":{}
"""

  prompt = """


In [134]:
cost_of_one = calculate_cost(prompt=prompt, data="", out="Intellectual stimulation")
print(f"Total cost: ${cost_of_one * 4000}")

Input tokens: 595
Output tokens: 3
Total tokens: 598
Cost: $0.00009
Total cost: $0.3642


In [138]:
spoil_df.head()

Unnamed: 0,targetTitle,targetParagraphs,humanSpoiler,spoiler,tags,spoilerPositions,paragraphsLength
0,"Wes Welker Wanted Dinner With Tom Brady, But P...",It’ll be just like old times this weekend for ...,They Threw A Football,how about that morning we go throw?,passage,"[[[3, 151], [3, 186]]]",1663
1,Hole In Ozone Layer Expected To Make Full Reco...,2070 is shaping up to be a great year for Moth...,2070,2070,phrase,"[[[0, 0], [0, 4]]]",2099
2,Intellectual Stimulation Trumps Money For Empl...,"Despite common belief, money isnt the key to e...",Intellectual stimulation,intellectual stimulation,phrase,"[[[1, 186], [1, 210]]]",1773
3,Heres what happens if your Apple AirPods get l...,One of the biggest surprise announcements at A...,No,"Apple says that if AirPods are lost or stolen,...",passage,"[[[4, 0], [4, 110]]]",1086
4,The Reason Why Gabor Kiraly Wears THOSE Tracki...,"June 14th 2016 3.3K Shares, They may look like...",Its a lucky charm to him,"The more good games I had in them, the more I ...",passage,"[[[5, 0], [5, 64]]]",1087
