In [None]:
import os

In [None]:
from pydantic import BaseModel
import time
from typing import Literal
from itertools import batched
from rich import print
from rich.progress import track
from huggingface_hub import InferenceClient
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Read the Hugging Face API token from the environment (None when the variable is unset).
token = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

In [3]:

# Build the Hugging Face inference client, passing it the token loaded from the environment.
client = InferenceClient(token=token)

In [4]:
# Load the payee records (CSV hosted in the palewire/first-llm-classifier repository).
df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/palewire/first-llm-classifier/refs/heads/main/_notebooks/Form460ScheduleESubItem.csv",
)

In [5]:
# Schema for the model's structured reply: a single "answers" list holding one
# category label per submitted payee name.
# NOTE: documented with comments rather than a class docstring on purpose —
# pydantic copies a model's docstring into the "description" of
# model_json_schema(), which would alter the schema sent to the API.
class PayeeList(BaseModel):
    # Each entry must be exactly one of the four allowed category labels.
    answers: list[Literal["Restaurant", "Bar", "Hotel", "Other"]]

In [6]:
def classify_payees(name_list, model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"):
    """Classify business names as Restaurant, Bar, Hotel or Other with the LLM.

    The names are joined with newlines and sent as the final user message,
    after a system prompt and two worked few-shot exchanges. The reply is
    requested as JSON matching the ``PayeeList`` schema and validated against
    that model before being paired with the input names.

    Args:
        name_list: Iterable of payee-name strings. It is materialized into a
            list once, so one-shot iterables such as generators are safe.
        model: Identifier of the chat model to query. Defaults to the model
            this notebook was written against.

    Returns:
        dict mapping each name to its category. An empty input returns ``{}``
        without calling the API. Names are paired with answers positionally
        via ``zip``: if the model returns fewer answers than names, the
        trailing names are absent from the result, and repeated names
        collapse into a single key.

    Raises:
        Whatever ``PayeeList.model_validate_json`` raises when the reply does
        not fit the schema (pydantic's ``ValidationError``), plus any error
        raised by the inference client itself.
    """
    # Materialize once: the names are consumed twice (join + zip) below.
    names = list(name_list)

    # Nothing to classify — skip the API round trip entirely.
    if not names:
        return {}

    prompt = """
You are an AI model trained to categorize businesses based on their names.

You will be given a list of business names, each separated by a new line.

Your task is to analyze each name and classify it into one of the following categories: Restaurant, Bar, Hotel, or Other.

If a business does not clearly fall into Restaurant, Bar, or Hotel categories, you should classify it as "Other".

Even if the type of business is not immediately clear from the name, it is essential that you provide your best guess based on the information available to you. If you can't make a good guess, classify it as Other.

For example, if given the following input:

"Intercontinental Hotel\nPizza Hut\nCheers\nWelsh's Family Restaurant\nKTLA\nDirect Mailing"

Your output should be a JSON object in the following format:

{"answers": ["Hotel", "Restaurant", "Bar", "Restaurant", "Other", "Other"]}

This means that you have classified "Intercontinental Hotel" as a Hotel, "Pizza Hut" as a Restaurant, "Cheers" as a Bar, "Welsh's Family Restaurant" as a Restaurant, and both "KTLA" and "Direct Mailing" as Other.
"""

    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": prompt,
            },
            # Few-shot example 1
            {
                "role": "user",
                "content": "Intercontinental Hotel\nPizza Hut\nCheers\nWelsh's Family Restaurant\nKTLA\nDirect Mailing",
            },
            {
                "role": "assistant",
                "content": '{"answers": ["Hotel", "Restaurant", "Bar", "Restaurant", "Other", "Other"]}',
            },
            # Few-shot example 2
            {
                "role": "user",
                "content": "Subway Sandwiches\nRuth Chris Steakhouse\nPolitical Consulting Co\nThe Lamb's Club",
            },
            {
                "role": "assistant",
                "content": '{"answers": ["Restaurant", "Restaurant", "Other", "Bar"]}',
            },
            # The names we actually want classified
            {
                "role": "user",
                "content": "\n".join(names),
            },
        ],
        model=model,
        # Constrain the reply to the JSON schema generated from PayeeList.
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "PayeeList",
                "schema": PayeeList.model_json_schema()
            }
        },
        temperature=0,
    )

    # Parse and validate the JSON reply before trusting it.
    result = PayeeList.model_validate_json(response.choices[0].message.content)
    return dict(zip(names, result.answers))

In [7]:
# Draw ten payees to smoke-test the classifier. The fixed seed makes the draw
# repeatable across re-runs instead of changing on every execution.
sample_list = list(df.sample(10, random_state=42).payee)

In [8]:
# Classify the small sample; as the cell's last expression, the resulting dict is displayed.
classify_payees(name_list=sample_list)

{'THE FARM': 'Restaurant',
 'SOUTWEST AIRLINES': 'Other',
 'CULINARY SPECIALTIES': 'Other',
 'MARRIOTT HOTEL SAN DIEGO MARINA': 'Hotel',
 'CHOPS STEAKHOUSE AND SEAFOOD': 'Restaurant',
 'PRELUDE KITCHEN & BAR': 'Bar',
 'JOANN WILSON': 'Other',
 'COMFORT SUITES': 'Hotel',
 "BALLY'S HOTEL": 'Hotel',
 'DOORDASH, INC.': 'Other'}

In [60]:
def classify_batches(name_list, batch_size=10, wait=1):
    """Split the provided names into batches and classify them with our LLM, one batch at a time.

    Each distinct name is sent to the model only once. The answers are then
    mapped back onto the original input, so the returned frame has exactly one
    row per input name, in input order — repeated names included. This keeps
    the result positionally aligned with whatever the names were drawn from.

    Args:
        name_list: Iterable of payee-name strings.
        batch_size: Maximum number of names sent in a single request.
        wait: Seconds to pause between consecutive requests.

    Returns:
        pandas.DataFrame with the columns "payee" and "category".

    Raises:
        AssertionError: If a batch comes back with a different number of
            results than the number of names that were sent.
    """
    # Materialize the input so it can be walked twice (de-duplicate, then rebuild).
    names = list(name_list)

    # dict.fromkeys drops repeats while preserving first-seen order. The
    # per-batch results are dicts keyed by name, so a repeated name inside one
    # batch would otherwise shrink the result and trip the length check below.
    unique_names = list(dict.fromkeys(names))

    # Create a list that splits the unique names into batches
    batch_list = list(batched(unique_names, batch_size))

    # Create a place to store the results
    all_results = {}

    # Loop through the list in batches
    for position, batch in enumerate(track(batch_list, description="Classifying batches...")):
        # Tap the brakes between requests to avoid overloading Hugging Face's
        # API; there is nothing to wait for before the first request.
        if position:
            time.sleep(wait)

        # Classify it with the LLM
        batch_results = classify_payees(list(batch))

        # Verify that we got back the same number of results as we sent in.
        # Raised explicitly so the check survives `python -O`, which strips
        # assert statements.
        if len(batch_results) != len(batch):
            raise AssertionError(f"Expected {len(batch)} results but got back {len(batch_results)}.")

        # Add what we get back to the results
        all_results.update(batch_results)

    # Rebuild one row per input name, in the original order
    return pd.DataFrame(
        [(name, all_results[name]) for name in names],
        columns=["payee", "category"],
    )

In [50]:
# Draw a larger sample of 100 payees for the batch run. The fixed seed makes
# the draw repeatable across re-runs.
bigger_sample = list(df.sample(100, random_state=42).payee)

In [51]:
# Classify the larger sample batch by batch, using the default batch size and wait.
results_df = classify_batches(name_list=bigger_sample)

Output()

In [29]:
# Show ten randomly chosen classified payees as a Markdown table, renumbered from zero.
print(
    results_df.sample(n=10)
    .reset_index(drop=True)
    .to_markdown()
)

In [34]:
# Show ten random rows of the raw data as a Markdown table; the original row
# labels are kept as an extra column by reset_index().
print(
    df.sample(n=10)
    .reset_index()
    .to_markdown()
)

In [35]:
# Load the sample that carries both a "payee" and a "category" column, used below to score the LLM.
sample_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/palewire/first-llm-classifier/refs/heads/main/_notebooks/sample.csv",
)

In [38]:
# Split the labelled sample: 67% is held out as the test set, the rest is the training set.
(
    training_input,
    test_input,
    training_output,
    test_output,
) = train_test_split(
    sample_df[["payee"]],  # inputs: a one-column frame of payee names
    sample_df["category"],  # outputs: the category recorded for each payee
    random_state=42,  # Remember Jackie Robinson. Remember Douglas Adams.
    test_size=0.67,
)

In [57]:
# Ask the LLM to classify every payee in the held-out test set.
llm_df = classify_batches(list(test_input["payee"]))

Output()

In [58]:
# Score the LLM's categories against the test set's recorded categories.
print(
    classification_report(
        y_true=test_output,
        y_pred=llm_df["category"],
    )
)