In [None]:
import os

import google.generativeai as genai
import pandas as pd
from dotenv import load_dotenv

# Never truncate text columns, so complete product titles show up in outputs.
pd.set_option("display.max_colwidth", None)

# Load environment variables from the local .env file (the API key lives there).
load_dotenv()

GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
SEED = int(os.getenv("SEED", "1337"))  # falls back to 1337 when SEED is not set

genai.configure(api_key=GOOGLE_API_KEY)

# This notebook uses the Amazon products dataset rather than newsgroups.
# The product title is renamed to "Text" and is what the models receive as input.
data = pd.read_csv(
    "../data/amazon_products.csv",
    usecols=["asin", "title", "category_id"],
)
data = data.rename(columns={"title": "Text"})

# Category lookup table; "id" is renamed so it can act as the join key.
categories = pd.read_csv("../data/amazon_categories.csv")
categories = categories.rename(columns={"id": "category_id"})

# Left join: every product row is kept, with the category columns attached.
data = data.merge(categories, on="category_id", how="left")
data

In [None]:
# Training set: two randomly drawn products from every category.
train = (
    data.groupby("category_id")
    .sample(n=2, random_state=SEED)
    .reset_index(drop=True)
)

# Validation set: one product per category, drawn only from rows whose asin
# is not already part of the training set.
in_train = data.asin.isin(train.asin.values)
valid = (
    data[~in_train]
    .groupby("category_id")
    .sample(n=1, random_state=SEED)
    .reset_index(drop=True)
)

# Both splits must cover the same number of distinct categories.
assert train.category_id.nunique() == valid.category_id.nunique()

# Dense integer labels: the position of each category_id in sorted order.
#   id2label: category_id -> integer label
#   label2id: integer label -> category_id
sorted_category_ids = sorted(train.category_id.unique())
id2label = {category_id: label for label, category_id in enumerate(sorted_category_ids)}
label2id = dict(enumerate(sorted_category_ids))

train["Label"] = train["category_id"].map(id2label)
valid["Label"] = valid["category_id"].map(id2label)
train

In [None]:
# Pick one training example to use as a running illustration in later cells.
idx = 100
sample_row = train.at[idx, "Text"]
label = train.at[idx, "Label"]
category_name = train.at[idx, "category_name"]

(sample_row, label, category_name)

In [None]:
# Untuned model without any instructions: send it the bare product title.
baseline_model = genai.GenerativeModel(model_name="gemini-1.5-flash-001")
response = baseline_model.generate_content(contents=sample_row)
print(response.text)

In [None]:
# Prepend a short task description to the product title so the baseline model
# knows it is being asked for a category. (Fixed the "amazom" typo and the
# grammar of the question, since this text is sent to the model verbatim.)
prompt = "Using Amazon product taxonomy information, which category does this product belong to:"

response = baseline_model.generate_content([prompt, sample_row])
print(response.text)

In [None]:
from google.api_core import retry

# A system instruction allows more direct prompting than an inline question
# and produces a more succinct answer.

system_instruct = """
You are a classification service. You will be passed input that represents
an amazon product title and you must respond with the amazon taxonomy (deep category name) leaf.
"""

instructed_model = genai.GenerativeModel(
    model_name="gemini-1.5-flash-001",
    system_instruction=system_instruct,
)

# Request options shared by the prediction helpers: retry on transient errors.
retry_policy = dict(retry=retry.Retry(predicate=retry.if_transient_error))


# If you want to evaluate your own technique, replace this function with your
# model, prompt and other code and return the predicted answer.
def predict_label(post: str) -> str:
    """Predict a category name for one product title with the instructed model.

    Args:
        post: The text to classify (a product title in this notebook).

    Returns:
        The model's answer with surrounding whitespace removed, or the
        sentinel string "(error)" when no usable answer was produced.
    """
    response = instructed_model.generate_content(post, request_options=retry_policy)

    # A blocked prompt yields a response without any candidates; indexing
    # into the empty list would raise, so treat it as a general error too.
    if not response.candidates:
        return "(error)"

    rc = response.candidates[0]

    # Any errors, filters, recitation, etc. are marked as a general error.
    if rc.finish_reason.name != "STOP":
        return "(error)"

    # Clean up the response.
    return response.text.strip()


prediction = predict_label(sample_row)

# Exact string match against the true category name of the sample.
if prediction == category_name:
    verdict = "Correct!"
else:
    verdict = "Incorrect."

# Prediction, an empty line, then the verdict.
print(prediction, verdict, sep="\n\n")

In [None]:
from tqdm.rich import tqdm

# Register tqdm with pandas so that .progress_apply is available.
tqdm.pandas()

# Run the instructed model over every validation title.
valid["Prediction"] = valid["Text"].progress_apply(predict_label)

# Accuracy is the share of predictions that exactly match the category name.
is_correct = valid["category_name"].eq(valid["Prediction"])
accuracy = is_correct.sum() / len(valid)
print(f"Accuracy: {accuracy:.2%}")

In [None]:
# Validation rows where the instructed model's answer matched exactly.
valid.loc[valid["category_name"].eq(valid["Prediction"])]

In [None]:
import random

# A random numeric suffix gives each run its own tuned-model id.
model_id = "amazon-product-classification-" + str(random.randint(10000, 9999999))

# Start the tuning job: product titles ("Text") are the inputs and the
# category names are the expected outputs.
tuning_op = genai.create_tuned_model(
    "models/gemini-1.5-flash-001-tuning",
    id=model_id,
    display_name="Amazon Product Classification",
    training_data=train,
    input_key="Text",
    output_key="category_name",
    epoch_count=2,
    batch_size=16,
)
print(model_id)

In [None]:
import time

import seaborn as sns

# Poll the tuning job once a minute until the tuned model is ready to use.
while True:
    tuned_model = genai.get_tuned_model(f"tunedModels/{model_id}")
    state_name = tuned_model.state.name
    if state_name == "ACTIVE":
        break
    if state_name == "FAILED":
        # A failed job never becomes ACTIVE; stop instead of polling forever.
        raise RuntimeError(f"Tuning failed for tunedModels/{model_id}")
    print(tuned_model.state)
    time.sleep(60)

# Report both the model's resource name and its final state.
print(f"Done! Model {tuned_model.name} is {tuned_model.state.name}")

# Plot the mean loss recorded in the tuning task's snapshots, per step.
snapshots = pd.DataFrame(tuned_model.tuning_task.snapshots)
sns.lineplot(data=snapshots, x="step", y="mean_loss")

In [None]:
# Handle for the tuned model created above.
your_model = genai.GenerativeModel(model_name=f"tunedModels/{model_id}")

# Pull a single validation example for a quick sanity check.
idx = 10
valid_sample = valid.at[idx, "Text"]
label = valid.at[idx, "Label"]
category_name = valid.at[idx, "category_name"]

# Show the title and its true category, followed by the tuned model's answer.
print(valid_sample, category_name, sep="\n")

response = your_model.generate_content(valid_sample)
print(response.text)

In [None]:
def classify_text(text: str) -> str:
    """Classify a product title into a category name with the tuned model.

    Args:
        text: The text to classify (a product title in this notebook).

    Returns:
        The tuned model's answer with surrounding whitespace removed, or the
        sentinel string "(error)" when no usable answer was produced.
    """
    response = your_model.generate_content(text, request_options=retry_policy)

    # A blocked prompt yields a response without any candidates; indexing
    # into the empty list would raise, so treat it as a general error too.
    if not response.candidates:
        return "(error)"

    rc = response.candidates[0]

    # Any errors, filters, recitation, etc. are marked as a general error,
    # as is a candidate that carries no content parts.
    if rc.finish_reason.name != "STOP" or not rc.content.parts:
        return "(error)"

    # Strip whitespace, as predict_label does, so that the exact-match
    # accuracy is not thrown off by a stray trailing newline.
    return rc.content.parts[0].text.strip()


# Run the tuned model over the same validation titles.
valid["Prediction_fine_tuned"] = valid["Text"].progress_apply(classify_text)

# Share of tuned-model predictions that exactly match the category name.
is_correct_tuned = valid["category_name"].eq(valid["Prediction_fine_tuned"])
accuracy = is_correct_tuned.sum() / len(valid)
print(f"Accuracy: {accuracy:.2%}")

In [None]:
# Calculate the *input* cost of the baseline model with system instructions.
sysint_tokens = instructed_model.count_tokens(sample_row).total_tokens
print(f"System instructed baseline model: {sysint_tokens} (input)")

# Calculate the input cost of the tuned model.
tuned_tokens = your_model.count_tokens(sample_row).total_tokens
print(f"Tuned model: {tuned_tokens} (input)")

# Savings are expressed relative to the baseline cost: the fraction of the
# baseline's input tokens that the tuned model no longer needs. Dividing by
# the tuned count instead would over-report (e.g. 40 -> 20 tokens is a 50%
# saving, not 100%).
savings = (sysint_tokens - tuned_tokens) / sysint_tokens if sysint_tokens else 0.0
print(f"Token savings: {savings:.2%}")  # Note that this is only n=1.

In [None]:
# Query the baseline model explicitly instead of relying on whatever the
# global `response` last held: by this point an earlier cell has reassigned
# it to a tuned-model response, which would mislabel the baseline figure.
baseline_response = baseline_model.generate_content(sample_row)
baseline_token_output = baseline_response.usage_metadata.candidates_token_count
print("Baseline (verbose) output tokens:", baseline_token_output)

# Same input through the tuned model, so the two counts are comparable.
tuned_model_output = your_model.generate_content(sample_row)
tuned_tokens_output = tuned_model_output.usage_metadata.candidates_token_count
print("Tuned output tokens:", tuned_tokens_output)
