In [None]:
import os

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv

# Show complete product titles instead of truncating wide text columns.
pd.set_option("display.max_colwidth", None)

# Load variables from a local .env file into the process environment; the API key is kept there.
load_dotenv()

GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
genai.configure(api_key=GOOGLE_API_KEY)

# Only the product id, title and category columns are read from the CSV.
data = pd.read_csv(
    "../data/amazon_products.csv",
    usecols=["asin", "title", "category_id"],
)
data.head(5)

In [None]:
# Fixed seed so the split is identical after every Restart & Run All; without it each run
# draws different rows, so embeddings must be recomputed and metrics are not comparable.
SEED = 42

# Five training rows per category (pandas raises if a category has fewer than five rows).
train = data.groupby("category_id").sample(n=5, random_state=SEED)

# One held-out row per category, drawn only from products whose asin is not in the training set.
valid = data[~data.asin.isin(train.asin.values)].groupby("category_id").sample(n=1, random_state=SEED)

# Sanity check shown as the cell output: True when both splits contain the same number of categories.
train.category_id.nunique() == valid.category_id.nunique()

In [33]:
from google.api_core import retry
from tqdm.rich import tqdm

# Register tqdm with pandas so that `progress_apply` (used by `create_embeddings`) is available.
tqdm.pandas()


@retry.Retry(timeout=300.0)
def embed_fn(text: str) -> list[float]:
    """Embed one piece of text with the `models/text-embedding-004` model.

    The request uses `task_type="classification"` because the vectors are fed to a
    classifier. The call is wrapped in `retry.Retry` with a 300 second timeout.

    Args:
        text: The text to embed.

    Returns:
        The value stored under the "embedding" key of the API response.
    """
    return genai.embed_content(
        model="models/text-embedding-004",
        content=text,
        task_type="classification",
    )["embedding"]


def create_embeddings(df: pd.DataFrame) -> pd.DataFrame:
    """Attach an "Embeddings" column holding the embedding of each row's title.

    `embed_fn` is applied to every value of the "title" column, with a progress bar.
    The column is written onto `df` itself, so the caller's frame is modified in place.

    Args:
        df: Frame with a "title" column.

    Returns:
        The same `df` object, now carrying the "Embeddings" column.
    """
    title_vectors = df["title"].progress_apply(embed_fn)
    df["Embeddings"] = title_vectors
    return df


In [None]:
# Embed the titles of both splits (training first, then validation); `embed_fn` is called once per row.
train, valid = create_embeddings(train), create_embeddings(valid)

In [43]:
# Naming used here: "id" is the dataset's category_id, "label" is its position (0, 1, 2, ...)
# among the unique training categories, i.e. the integer class the model predicts.
# id2label: category_id -> class index.
id2label = {category_id: class_index for class_index, category_id in enumerate(train.category_id.unique())}
# label2id: class index -> category_id (the inverse of id2label).
label2id = {class_index: category_id for category_id, class_index in id2label.items()}

In [None]:
# Translate each row's category_id into its integer class index via id2label.
train["label"] = train["category_id"].map(id2label)
valid["label"] = valid["category_id"].map(id2label)

In [46]:
import keras
from keras import layers


def build_classification_model(input_size: int, num_classes: int) -> keras.Model:
    """Build a one-hidden-layer classifier over embedding vectors.

    Args:
        input_size: Length of each input vector; also used as the hidden layer width.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A `keras.Sequential` model (not compiled) made of the input, a ReLU hidden
        layer and a softmax output layer.
    """
    embedding_inputs = layers.Input([input_size], name="embedding_inputs")
    hidden = layers.Dense(input_size, activation="relu", name="hidden")
    output_probs = layers.Dense(num_classes, activation="softmax", name="output_probs")
    return keras.Sequential([embedding_inputs, hidden, output_probs])

In [None]:
# The input width is read off the first stored training embedding. If a smaller vector is
# needed, `embed_content` accepts an `output_dimensionality` parameter to reduce it.
embedding_size = len(train["Embeddings"].iloc[0])

# One output unit per distinct training label.
classifier = build_classification_model(
    input_size=embedding_size,
    num_classes=len(train["label"].unique()),
)
classifier.summary()

# Labels are integer class indices, hence the sparse categorical cross-entropy loss.
classifier.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)


In [None]:
import numpy as np

# Training hyperparameters.
NUM_EPOCHS = 20
BATCH_SIZE = 32

# Features are the per-row embeddings stacked into a single array; targets are the integer labels.
x_train, y_train = np.stack(train["Embeddings"]), train["label"]
x_val, y_val = np.stack(valid["Embeddings"]), valid["label"]

# Stop before NUM_EPOCHS once the monitored metric has not improved for 3 epochs in a row.
# NOTE(review): "accuracy" is the training metric; "val_accuracy" would follow the
# validation split instead — confirm which one is intended.
early_stop = keras.callbacks.EarlyStopping(monitor="accuracy", patience=3)

# Fit the classifier; the validation split is passed so it is scored during training.
history = classifier.fit(
    x_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=(x_val, y_val),
    callbacks=[early_stop],
)

In [None]:
# Report the loss and compiled metrics as a dict, computed on the validation split
# (the same arrays that were passed as `validation_data` during fit).
classifier.evaluate(x=x_val, y=y_val, return_dict=True)