# Imports and API Connections

In [62]:
# ! pip install openai requests bs4 google-generativeai python-dotenv

In [3]:
from openai import OpenAI
import requests
from bs4 import BeautifulSoup
import os
import pandas as pd

In [4]:
from dotenv import load_dotenv
import google.generativeai as genai

In [None]:
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment so the API keys can be read with os.getenv below.
load_dotenv()

# Read the OpenAI key from the environment rather than embedding it in the
# notebook, then build the client used by classify_with_gpt.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

In [None]:
# Read the Gemini key from the environment. The previous code passed the
# literal text "GEMINI_API_KEY" as the key itself, which can never authenticate.
gemini_api_key = os.getenv("GEMINI_API_KEY")
if not gemini_api_key:
    # Fail here with a clear message instead of on the first generate_content call.
    raise RuntimeError(
        "GEMINI_API_KEY is not set; add it to your environment or .env file."
    )
genai.configure(api_key=gemini_api_key)
# Shared Gemini model handle used by the classify_*_with_gemini functions.
model = genai.GenerativeModel("models/gemini-2.0-flash")

# Getting Taxonomy

In [5]:
# Location of the Content Taxonomy 3.1 TSV file. Set the TAXONOMY_PATH
# environment variable to run the notebook on another machine; the original
# local path is kept as the fallback so existing setups keep working.
taxonomy_path = os.getenv(
    "TAXONOMY_PATH",
    "/Users/daniel.ben-david/Documents/data/Content Taxonomy 3.1.tsv",
)
taxonomy = pd.read_csv(taxonomy_path, sep="\t")
# The first row pandas reads as data holds the real column names: promote it
# to the header, then drop it from the data and renumber the rows.
taxonomy.columns = taxonomy.iloc[0]
taxonomy = taxonomy.drop(taxonomy.index[0]).reset_index(drop=True)
# One "<id>: <name> (Parent ID: <parent>)" line per taxonomy row, used to list
# the allowed categories inside the prompts.
taxonomy_lines = [
    f"{unique_id}: {name} (Parent ID: {parent_id})"
    for unique_id, name, parent_id in zip(
        taxonomy["Unique ID"], taxonomy["Name"], taxonomy["Parent"]
    )
]

In [7]:
taxonomy

Unnamed: 0,Unique ID,Parent,Name,Tier 1,Tier 2,Tier 3,Tier 4,NaN
0,150,150,Attractions,Attractions,,,,
1,151,150,Amusement and Theme Parks,Attractions,Amusement and Theme Parks,,,
2,179,150,Bars & Restaurants,Attractions,Bars & Restaurants,,,
3,181,150,Casinos & Gambling,Attractions,Casinos & Gambling,,,
4,153,150,Historic Site and Landmark Tours,Attractions,Historic Site and Landmark Tours,,,
...,...,...,...,...,...,...,...,...
699,687,685,Role-Playing Video Games,Video Gaming,Video Game Genres,Role-Playing Video Games,,
700,688,685,Simulation Video Games,Video Gaming,Video Game Genres,Simulation Video Games,,
701,689,685,Sports Video Games,Video Gaming,Video Game Genres,Sports Video Games,,
702,690,685,Strategy Video Games,Video Gaming,Video Game Genres,Strategy Video Games,,


# Few-Shot Examples

In [74]:
# Few-shot conversation for the chat-completions classifier: alternating
# user / assistant turns that demonstrate the four-line answer format
# (Unique ID, Parent ID, Category Name, Explanation).
# NOTE(review): the IDs used in these sample answers (101/100, 202/200) should
# be checked against the loaded taxonomy — TODO confirm.
few_shots_gpt = [
    dict(
        role="user",
        content="URL: https://www.bbc.com/news/technology-65091083\n\nContent: The article describes a significant cybersecurity incident involving a global tech company. Hackers gained access to user accounts and internal systems. The company is now working with authorities to address the breach.",
    ),
    dict(
        role="assistant",
        content="Unique ID: 101\nParent ID: 100\nCategory Name: Information Security\nExplanation: The article discusses a cyberattack, which falls directly under information security in the IAB taxonomy.",
    ),
    dict(
        role="user",
        content="URL: https://www.espn.com/nba/story/_/id/34567890/lebron-james-leads-lakers-comeback\n\nContent: LeBron James led the Lakers to a comeback win over the Warriors in a critical playoff push. The game was marked by outstanding performances and fan engagement.",
    ),
    dict(
        role="assistant",
        content="Unique ID: 202\nParent ID: 200\nCategory Name: Basketball\nExplanation: The article covers a basketball game and a player's performance, which aligns with the Basketball category.",
    ),
]

In [75]:
# Few-shot examples (URL + content -> answer) interpolated verbatim into the
# prompt built by classify_url_with_gemini. The text, including its leading
# indentation, is sent to the model as-is.
# NOTE(review): the IDs in the sample answers (101/100, 202/200) should be
# checked against the loaded taxonomy — TODO confirm.
few_shots_url_gemini = """
                    Example 1:
                    URL: https://www.bbc.com/news/technology-65091083

                    Content: The article describes a significant cybersecurity incident involving a global tech company. Hackers gained access to user accounts and internal systems. The company is now working with authorities to address the breach.

                    Answer:
                    Unique ID: 101
                    Parent ID: 100
                    Category Name: Information Security
                    Explanation: The article discusses a cyberattack, which falls under Information Security in the IAB taxonomy.

                    ---

                    Example 2:
                    URL: https://www.espn.com/nba/story/_/id/34567890/lebron-james-leads-lakers-comeback

                    Content: LeBron James led the Lakers to a comeback win over the Warriors in a critical playoff push. The game was marked by outstanding performances and fan engagement.

                    Answer:
                    Unique ID: 202
                    Parent ID: 200
                    Category Name: Basketball
                    Explanation: This article is about an NBA basketball game, relevant to the Basketball category.
                    """

In [77]:
# Few-shot examples (title -> answer) interpolated verbatim into the prompt
# built by classify_content_title_with_gemini. The text, including its leading
# indentation, is sent to the model as-is.
# NOTE(review): the IDs in the sample answers (235/200, 110/100, 304/300)
# should be checked against the loaded taxonomy — TODO confirm.
few_shots_content_title_gemini = """
                                Example 1:
                                Title: The Walking Dead

                                Answer:
                                Unique ID: 235
                                Parent ID: 200
                                Category Name: Horror and Suspense
                                Explanation: The Walking Dead is a TV show set in a post-apocalyptic world with zombies, which fits the horror and suspense category.

                                ---

                                Example 2:
                                Title: Top Gear

                                Answer:
                                Unique ID: 110
                                Parent ID: 100
                                Category Name: Automotive
                                Explanation: Top Gear is a car-focused TV show, placing it under Automotive content.

                                ---

                                Example 3:
                                Title: The Daily Show

                                Answer:
                                Unique ID: 304
                                Parent ID: 300
                                Category Name: Politics
                                Explanation: The Daily Show is a satirical news show discussing political and current events, relevant to Politics.
                                """

# Functions

In [51]:
def fetch_article_text(url):
    """Download ``url`` and return the text of its ``<p>`` elements.

    The paragraph texts are joined with newlines and the result is stripped of
    leading/trailing whitespace. The request uses a 10-second timeout.

    Returns:
        The extracted text (possibly an empty string when the page has no
        paragraph text), or ``None`` if the request or parsing raised; the
        error is printed in that case.
    """
    try:
        page = requests.get(url, timeout=10)
        # Treat HTTP error statuses as failures rather than parsing an error page.
        page.raise_for_status()
        document = BeautifulSoup(page.text, "html.parser")
        # Only <p> elements are treated as article content.
        paragraph_texts = [tag.get_text() for tag in document.find_all("p")]
    except Exception as err:
        print(f"Error fetching article: {err}")
        return None
    return "\n".join(paragraph_texts).strip()

In [52]:
def classify_with_gpt(url, taxonomy_str):
    """Classify the page at ``url`` against the taxonomy with the OpenAI chat API.

    Args:
        url: Address of the page to fetch and classify.
        taxonomy_str: The allowed categories, either as a single string or as
            an iterable of lines (e.g. ``taxonomy_lines``), which are joined
            with newlines.

    Returns:
        The model's reply text, or the string
        "Could not retrieve article content." when no article text could be
        fetched.
    """
    article_text = fetch_article_text(url)
    if not article_text:
        return "Could not retrieve article content."

    # Accept both a ready-made string and a list of taxonomy lines.
    if isinstance(taxonomy_str, str):
        taxonomy_block = taxonomy_str
    else:
        taxonomy_block = "\n".join(str(line) for line in taxonomy_str)

    # Built locally: the previous version referenced SYSTEM_PROMPT / FEW_SHOTS,
    # which are not defined anywhere in the notebook, and ignored the taxonomy.
    # The answer format matches the one demonstrated in few_shots_gpt.
    system_prompt = (
        "You are a content classification expert using the IAB Content Taxonomy 3.1.\n"
        "Given the content of a web page, assign the single most relevant category "
        "from the taxonomy below.\n"
        "Respond in this format:\n"
        "Unique ID: <from taxonomy>\n"
        "Parent ID: <from taxonomy>\n"
        "Category Name: <exact name from taxonomy>\n"
        "Explanation: <short reason why the page belongs to this category>\n\n"
        "Only use the categories from the list below:\n"
        f"{taxonomy_block}"
    )

    messages = [
        {"role": "system", "content": system_prompt},
        *few_shots_gpt,
        {"role": "user", "content": f"URL: {url}\n\nContent:\n{article_text[:2000]}"}  # Trim to stay within context length
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        temperature=0.1
    )
    # The v1 client returns message objects, not dicts: use attribute access.
    return response.choices[0].message.content
# result = classify_with_gpt(test_url, taxonomy_lines)

In [102]:
def classify_url_with_gemini(url, taxonomy_lines):
    """Classify the page at ``url`` against the taxonomy using the Gemini model.

    Fetches the page text with ``fetch_article_text``, builds one prompt that
    contains the instructions, the taxonomy lines, the URL few-shot examples
    and the page content, and sends it to the module-level ``model``.

    Args:
        url: Address of the page to fetch and classify.
        taxonomy_lines: Sequence of taxonomy entries; its first 704 items are
            interpolated into the prompt.

    Returns:
        The text of the model's response, or the string
        "Could not retrieve article content." when no article text could be
        fetched.
    """
    article_text = fetch_article_text(url)
    if not article_text:
        return "Could not retrieve article content."
    # The slice is formatted by the f-string as a Python list (its repr), and
    # the article text is cut to its first 3000 characters.
    SYSTEM_PROMPT = f""" You are a content classification expert using the IAB Content Taxonomy 3.1.
                Given the content of a web page, your task is to analyze the content and assign the most relevant category using the taxonomy below.
                Respond with:
                1. Unique ID
                2. Parent ID
                3. Category Name
                4. A short explanation of why this page is associated with the chosen category.

                Only use the categories from the list below:
                {taxonomy_lines[:704]}
                ---
                {few_shots_url_gemini}
                Now classify the following article:

                URL: {url}

                Content:
                {article_text[:3000]}
                """

    # The whole prompt is sent as a single piece of content.
    response = model.generate_content(SYSTEM_PROMPT)
    return response.text


In [79]:
def classify_content_title_with_gemini(title, taxonomy_lines):
    """Classify a media title against the taxonomy using the Gemini model.

    Builds one prompt that contains the instructions, the taxonomy lines, the
    title few-shot examples and the title itself, and sends it to the
    module-level ``model``.

    Args:
        title: Name of the movie / show (or other content) to classify.
        taxonomy_lines: Sequence of taxonomy entries; its first 704 items are
            interpolated into the prompt (formatted as a Python list).

    Returns:
        The text of the model's response.
    """
    # Rule 4 previously contained misspellings ("classifiy", "ASSING",
    # "explantion", "filed") that garbled the instruction given to the model.
    prompt = f"""
                You are a content classification expert using the IAB Content Taxonomy 3.1.

                Your task is to classify a piece of media content based **only** on its title (e.g., a movie or TV show name). You should:

                1. Identify the **likely subject matter** and **genre** of the title based on cultural and entertainment knowledge.
                2. Use your understanding of popular TV shows, films, and trends.
                3. Assume the title refers to a well-known show/movie unless stated otherwise.
                4. If the title is obscure or ambiguous, classify it as unknown. DO NOT ASSIGN IDs in the response if the likelihood is very low. Please provide in such cases the explanation inside the proper field.
                5. Use the IAB Content Taxonomy to assign the **most appropriate category**.

                Respond in this format:

                Unique ID: <from taxonomy>
                Parent ID: <from taxonomy>
                Category Name: <exact name from taxonomy>
                Explanation: <why this category fits based on the title alone>

                Only use the categories from the list below:
                {taxonomy_lines[:704]}
                ---
                {few_shots_content_title_gemini}
                Now classify the following title:

                Title: {title}
                """

    # The whole prompt is sent as a single piece of content.
    response = model.generate_content(prompt)
    return response.text


# URL

In [103]:
# Page to classify: a Google Play store listing (app id tv.atmosphere.atmospheretv).
url = "https://play.google.com/store/apps/details?id=tv.atmosphere.atmospheretv&hl=en_US"

In [104]:
# Fetch the page text and ask Gemini for its taxonomy category.
result = classify_url_with_gemini(url, taxonomy_lines)

In [105]:
print(result)

Here's the classification of the provided content based on the IAB Content Taxonomy 3.1:

1.  Unique ID: 602
2.  Parent ID: 599
3.  Category Name: Software and Applications
4.  Explanation: The content is a listing of information, terms, and services for Google Play, which distributes software applications.



# TITLE

In [95]:
# Media title to classify from its name alone.
title = "Donnie Darko"

In [96]:
# Ask Gemini to categorize the title; note this overwrites the URL result above.
result = classify_content_title_with_gemini(title, taxonomy_lines)

In [97]:
print(result)

Unique ID: 331
Parent ID: SPSHQ5
Category Name: Mystery
Explanation: "Donnie Darko" is a film known for its complex and enigmatic plot, involving time travel and philosophical questions, which aligns with the Mystery genre.

