# Token Usage Estimator for Review Analysis

This notebook estimates the token usage and cost for analyzing a large dataset of 50,000 comments by:
1. Processing a small sample of comments using different OpenAI models
2. Tracking token usage for both input and output
3. Calculating average tokens per comment
4. Projecting the total cost for analyzing 50,000 comments

This approach allows for accurate cost planning without processing the entire dataset.

In [1]:
# Import required libraries
import pandas as pd
import duckdb
import json
import time
import os
import openai
import tiktoken
from typing import Optional, Tuple, Dict, List, Any
import matplotlib.pyplot as plt
import seaborn as sns

## Token Counting Utilities

The following functions will help us count tokens and estimate costs for different OpenAI models.

In [2]:
def count_tokens(text: str, model: str = "gpt-4") -> int:
    """Count the number of tokens in a text string."""
    try:
        # Try direct model mapping first
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # For models not directly supported, use cl100k_base encoding (used by gpt-4 and newer models)
        print(f"Model {model} not directly supported by tiktoken, using cl100k_base encoding instead")
        encoding = tiktoken.get_encoding("cl100k_base")
    
    return len(encoding.encode(text))

def estimate_cost(input_tokens: int, output_tokens: int, model: str) -> float:
    """Estimate the cost based on input and output tokens for different models."""
    # Pricing per 1K tokens (as of July 2025)
    model_pricing = {
        "gpt-4o": {"input": 0.005, "output": 0.015},
        "gpt-4-turbo": {"input": 0.01, "output": 0.03},
        "gpt-4": {"input": 0.03, "output": 0.06},
        "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
        "gpt-4-mini": {"input": 0.0015, "output": 0.0060},
        "gpt-4.1-nano": {"input": 0.0015, "output": 0.0060},
    }
    
    # Default to gpt-4 pricing if model not found
    pricing = model_pricing.get(model, model_pricing["gpt-4"])
    
    cost = (input_tokens / 1000 * pricing["input"]) + (output_tokens / 1000 * pricing["output"])
    return cost

def ai_analyze_comments_with_tokens(client, prompt: str, df: pd.DataFrame, model: str, debug: bool = True) -> Tuple[str, Dict[str, Any]]:
    """
    Enhanced version of ai_analyze_comments that also tracks token usage.
    
    Returns:
        Tuple containing the model's response and a dictionary with token counts and estimated cost
    """
    df_json = df.to_json(orient="records")
    
    if debug:
        print(f"Processing {len(df)} comments with model {model}")
    
    messages = [
        {"role": "system", "content": (
            "You are an expert linguistic analyst specializing in extracting and scoring themes from customer return comments. "
            "You always return your output as a single JSON array of objects, one per input record, using exactly the keys and structure specified in the user's instructions. "
            "Do not include any explanations, extra text, or formatting outside the required JSON array. "
            "Be precise, consistent, and strictly follow the output schema and scoring rules provided."
        )},
        {"role": "user", "content": prompt},
        {"role": "user", "content": df_json}
    ]
    
    # Count input tokens
    input_text = "".join([msg["content"] for msg in messages])
    input_tokens = count_tokens(input_text, model)
    
    # Make API call
    start_time = time.time()
    resp = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.1,
    )
    elapsed_time = time.time() - start_time
    
    content = resp.choices[0].message.content.strip()
    
    # Count output tokens
    output_tokens = count_tokens(content, model)
    
    # Calculate usage stats
    usage_stats = {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": input_tokens + output_tokens,
        "cost": estimate_cost(input_tokens, output_tokens, model),
        "time": elapsed_time,
        "tokens_per_comment": (input_tokens + output_tokens) / len(df)
    }
    
    if debug:
        print(f"Token usage stats:")
        print(f"  Input tokens: {input_tokens:,}")
        print(f"  Output tokens: {output_tokens:,}")
        print(f"  Total tokens: {input_tokens + output_tokens:,}")
        print(f"  Cost: ${usage_stats['cost']:.4f}")
        print(f"  Time: {elapsed_time:.2f} seconds")
        print(f"  Tokens per comment: {usage_stats['tokens_per_comment']:.1f}")
    
    return content, usage_stats

## 1. Fetch a Small Sample of Comments

We'll use the existing functions to load data from the database and fetch a sample of comments.

In [7]:
# Database connection and import functions
def fetch_return_comments(con, tname, is_sample=True, sample_size=100, comment_only=True) -> pd.DataFrame:
    """
    Fetch return comments from the DuckDB table.
    If is_sample is True, fetch a sample of 'sample_size' rows.
    If comment_only is True, only returns the RETURN_COMMENT column.
    If comment_only is False, returns all columns.
    """
    if is_sample: 
        sample_query = f"ORDER BY RANDOM() LIMIT {sample_size}"
    else:
        sample_query = ""
        
    if comment_only:
        query = f"""
        SELECT "RETURN COMMENT"
        FROM {tname}
        {sample_query}
        """
    else:
        query = f"""
        SELECT *
        FROM {tname}
        {sample_query}
        """
    
    return con.execute(query).df()

# Connect to the database
db_path = 'return_coomment_group'
tname = 'staging_table'

# If database doesn't exist yet, import the data
file_exists = os.path.exists(db_path)
if not file_exists:
    file_path = r'data\RETURN_COMMENTS_GROUP.xlsx'
    print(f"Database not found. Importing data from {file_path}...")
    
    # Define import function
    def import_data(fname, clear=True, db_path='temp_db', tname='staging_table', ftype=None):
        import os
        con = duckdb.connect(db_path)
        if ftype is None:
            ext = os.path.splitext(fname)[1].lower()
            if ext == '.csv':
                ftype = 'csv'
            elif ext == '.parquet':
                ftype = 'parquet'
            elif ext in ('.xlsx', '.xls'):
                ftype = 'excel'
            else:
                con.close()
                raise ValueError("Unsupported file extension.")
        if ftype == 'excel':
            df = pd.read_excel(fname)
            if clear:
                con.execute(f"DROP TABLE IF EXISTS {tname}")
            con.register('temp_excel_df', df)
            con.execute(f"CREATE TABLE {tname} AS SELECT * FROM temp_excel_df")
            con.unregister('temp_excel_df')
        return con
    
    con = import_data(file_path, db_path=db_path, tname=tname, clear=True)
    print(f"Import completed.")
else:
    con = duckdb.connect(db_path)
    print(f"Connected to existing database at {db_path}")

# Fetch a small sample of comments for testing (change sample_size as needed)
con = duckdb.connect(db_path)
sample_size = 200
df_sample = fetch_return_comments(con, tname, is_sample=True, sample_size=sample_size, comment_only=True)
print(f"Fetched {len(df_sample)} sample comments")
df_sample.head()

Connected to existing database at return_coomment_group
Fetched 200 sample comments


Unnamed: 0,RETURN COMMENT
0,I loved the dress and the fit I just dont have...
1,Tight
2,The straps are too long on this and the medium.
3,Irritated my skin
4,Its not a good fit for me and the material was...


## 2. Set up OpenAI Client

We need to set up the OpenAI client with an API key. You can use the existing functions to access the API key from Google Secret Manager or set it directly.

In [4]:
# Set up OpenAI client (choose one of the options below)

# Option 1: Use the access_secret function (uncomment if using Google Secret Manager)
try:
    from google.cloud import secretmanager
    def access_secret(secret_path):
        """Establishes connection to GCP secret manager and retrieves secret value."""
        client = secretmanager.SecretManagerServiceClient()
        response = client.access_secret_version(name=secret_path)
        secret_payload = response.payload.data.decode("UTF-8")
        return secret_payload
    
    # Replace with your secret path
    secret_path = "projects/572292574132/secrets/openai_monday_status_alerts/versions/latest"
    api_key = access_secret(secret_path)
    print("API key retrieved from Google Secret Manager")
except Exception as e:
    print(f"Could not retrieve API key from Google Secret Manager: {e}")
    # Fall back to environment variable
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        api_key = input("Enter your OpenAI API key: ")

# Option 2: Use environment variable (uncomment if not using Google Secret Manager)
# api_key = os.getenv("OPENAI_API_KEY")
# if not api_key:
#     api_key = input("Enter your OpenAI API key: ")

# Initialize the OpenAI client
client = openai.OpenAI(api_key=api_key)

# Load prompts
prompt_paths = {
    'customer': 'prompts/customer_sentiment_prompt.txt',
    'product': 'prompts/product_prompt.txt',
    'function': 'v1/function_calling_prompt.txt'
}

prompts = {}
for key, path in prompt_paths.items():
    try:
        with open(path, 'r', encoding='utf-8') as f:
            prompts[key] = f.read()
        print(f"Loaded {key} prompt from {path}")
    except Exception as e:
        print(f"Could not load {key} prompt from {path}: {e}")
        prompts[key] = None

# Choose which prompt to use for the test
active_prompt_key = 'customer'  # Change to 'customer' or 'product' as needed
active_prompt = prompts[active_prompt_key]

print(f"Using '{active_prompt_key}' prompt")

API key retrieved from Google Secret Manager
Loaded customer prompt from prompts/customer_sentiment_prompt.txt
Loaded product prompt from prompts/product_prompt.txt
Could not load function prompt from v1/function_calling_prompt.txt: [Errno 2] No such file or directory: 'v1/function_calling_prompt.txt'
Using 'customer' prompt


## 3. Run Sentiment Analysis on Sample with Token Tracking

Now we'll run the sentiment analysis on our sample comments and track token usage for different models.

In [8]:
# Define models to test
models_to_test = [
    "gpt-3.5-turbo",     # Most affordable option
    "gpt-4o",            # Best performance/price ratio
    "gpt-4-turbo",        # High-quality results
    "gpt-4.1-nano"
    # Add other models you have access to
]

# Run analysis on each model and collect usage stats
model_results = {}

for model in models_to_test:
    print(f"\n==== Testing model: {model} ====")
    try:
        # Run analysis and track token usage
        response_str, usage_stats = ai_analyze_comments_with_tokens(
            client, 
            active_prompt, 
            df_sample, 
            model=model, 
            debug=True
        )
        
        # Parse response
        try:
            results = json.loads(response_str)
            if isinstance(results, dict):
                results = [results]
            
            # Store results and usage stats
            model_results[model] = {
                "response": results,
                "usage_stats": usage_stats
            }
            
            print(f"Successfully processed {len(results)} comments with {model}")
        except json.JSONDecodeError:
            print(f"Error parsing JSON response from {model}")
            print(f"Raw response: {response_str[:500]}...")
            model_results[model] = {
                "response": None,
                "usage_stats": usage_stats
            }
    except Exception as e:
        print(f"Error processing with {model}: {e}")
        model_results[model] = {
            "response": None,
            "usage_stats": {"error": str(e)}
        }

# Display a summary of all model results
print("\n==== Model Comparison Summary ====")
summary_data = []

for model, data in model_results.items():
    if "usage_stats" in data and "error" not in data["usage_stats"]:
        stats = data["usage_stats"]
        summary_data.append({
            "Model": model,
            "Input Tokens": stats["input_tokens"],
            "Output Tokens": stats["output_tokens"],
            "Total Tokens": stats["total_tokens"],
            "Tokens/Comment": stats["tokens_per_comment"],
            "Time (sec)": stats["time"],
            "Cost ($)": stats["cost"]
        })

summary_df = pd.DataFrame(summary_data)
summary_df


==== Testing model: gpt-3.5-turbo ====
Processing 200 comments with model gpt-3.5-turbo
Token usage stats:
  Input tokens: 3,823
  Output tokens: 4,095
  Total tokens: 7,918
  Cost: $0.0081
  Time: 35.87 seconds
  Tokens per comment: 39.6
Error parsing JSON response from gpt-3.5-turbo
Raw response: [
    {
        "IDENTIFIER": "C1",
        "RETURN_NO": "1",
        "RETURN_COMMENT": "I loved the dress and the fit I just dont have anywhere to where it",
        "Theme 1": "Loved the dress",
        "Sentiment 1": 4,
        "Theme 2": "Good fit",
        "Sentiment 2": 4,
        "Theme 3": "",
        "Sentiment 3": 0,
        "Theme 4": "",
        "Sentiment 4": 0,
        "Pos_mean": 4.0,
        "Neg_mean": 0.0,
        "Total_sentiment": 4.0
    },
    {
        "IDENTIFIER": "C2",
...

==== Testing model: gpt-4o ====
Processing 200 comments with model gpt-4o
Token usage stats:
  Input tokens: 3,791
  Output tokens: 16,384
  Total tokens: 20,175
  Cost: $0.2647
  Time: 318.94 s

Unnamed: 0,Model,Input Tokens,Output Tokens,Total Tokens,Tokens/Comment,Time (sec),Cost ($)
0,gpt-3.5-turbo,3823,4095,7918,39.59,35.866342,0.008054
1,gpt-4o,3791,16384,20175,100.875,318.940446,0.264715
2,gpt-4-turbo,3823,1403,5226,26.13,46.814677,0.08032
3,gpt-4.1-nano,3823,18671,22494,112.47,157.78433,0.117761


## 4. Estimate Cost for 50,000 Comments

Now we'll use our sample results to estimate the cost of processing 50,000 comments with each model.

In [9]:
# Calculate projections for 50,000 comments
target_comment_count = 50000
projection_data = []

for model, data in model_results.items():
    if "usage_stats" in data and "error" not in data["usage_stats"]:
        stats = data["usage_stats"]
        
        # Calculate projected token usage
        proj_input_tokens = stats["input_tokens"] / len(df_sample) * target_comment_count
        proj_output_tokens = stats["output_tokens"] / len(df_sample) * target_comment_count
        proj_total_tokens = proj_input_tokens + proj_output_tokens
        
        # Calculate projected cost
        proj_cost = estimate_cost(proj_input_tokens, proj_output_tokens, model)
        
        # Calculate projected time (rough estimate)
        proj_time_seconds = stats["time"] / len(df_sample) * target_comment_count
        proj_time_minutes = proj_time_seconds / 60
        proj_time_hours = proj_time_minutes / 60
        
        # Add to projection data
        projection_data.append({
            "Model": model,
            "Projected Input Tokens": int(proj_input_tokens),
            "Projected Output Tokens": int(proj_output_tokens),
            "Projected Total Tokens": int(proj_total_tokens),
            "Projected Cost ($)": proj_cost,
            "Projected Time (min)": proj_time_minutes,
            "Projected Time (hours)": proj_time_hours
        })

# Create projection DataFrame
projection_df = pd.DataFrame(projection_data)
projection_df

Unnamed: 0,Model,Projected Input Tokens,Projected Output Tokens,Projected Total Tokens,Projected Cost ($),Projected Time (min),Projected Time (hours)
0,gpt-3.5-turbo,955749,1023750,1979500,2.0135,149.443092,2.490718
1,gpt-4o,947749,4096000,5043750,66.17875,1328.918525,22.148642
2,gpt-4-turbo,955749,350750,1306500,20.08,195.061153,3.251019
3,gpt-4.1-nano,955749,4667750,5623500,29.440125,657.43471,10.957245


In [1]:
# Visualize cost projections
plt.figure(figsize=(12, 6))

# Cost comparison
plt.subplot(1, 2, 1)
sns.barplot(x='Model', y='Projected Cost ($)', data=projection_df)
plt.title(f'Projected Cost for {target_comment_count:,} Comments')
plt.xticks(rotation=45)
plt.tight_layout()

# Token usage comparison
plt.subplot(1, 2, 2)
token_data = []
for _, row in projection_df.iterrows():
    token_data.append({
        'Model': row['Model'],
        'Tokens': row['Projected Input Tokens'],
        'Type': 'Input'
    })
    token_data.append({
        'Model': row['Model'],
        'Tokens': row['Projected Output Tokens'],
        'Type': 'Output'
    })

token_df = pd.DataFrame(token_data)
sns.barplot(x='Model', y='Tokens', hue='Type', data=token_df)
plt.title(f'Projected Token Usage for {target_comment_count:,} Comments')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

# Create a formatted summary table
summary_table = projection_df.copy()
summary_table['Projected Cost ($)'] = summary_table['Projected Cost ($)'].apply(lambda x: f"${x:,.2f}")
summary_table['Projected Total Tokens'] = summary_table['Projected Total Tokens'].apply(lambda x: f"{x:,}")
summary_table['Projected Time (hours)'] = summary_table['Projected Time (hours)'].apply(lambda x: f"{x:.1f}")

print(f"\n==== Cost Projection Summary for {target_comment_count:,} Comments ====")
print(summary_table[['Model', 'Projected Total Tokens', 'Projected Cost ($)', 'Projected Time (hours)']])

NameError: name 'plt' is not defined

## 5. Display and Export Results

Let's examine the sentiment analysis results and save our findings.

In [None]:
# Choose a model to examine the results
model_to_examine = models_to_test[0]  # Change index to examine different models

# Display sample of analysis results
if model_results[model_to_examine]["response"]:
    results_df = pd.DataFrame(model_results[model_to_examine]["response"])
    print(f"\n==== Sample Analysis Results from {model_to_examine} ====")
    display(results_df.head())
else:
    print(f"No valid results available for {model_to_examine}")

# Export the results
timestamp = time.strftime("%Y%m%d_%H%M%S")
export_filename = f"token_usage_estimate_{timestamp}.xlsx"

# Create Excel writer
with pd.ExcelWriter(export_filename) as writer:
    # Save the model comparison summary
    summary_df.to_excel(writer, sheet_name='Model Comparison', index=False)
    
    # Save the projection data
    projection_df.to_excel(writer, sheet_name='50k Projection', index=False)
    
    # Save sample results for each model
    for model in models_to_test:
        if model_results[model]["response"]:
            results_df = pd.DataFrame(model_results[model]["response"])
            results_df.to_excel(writer, sheet_name=f'{model[:10]} Results', index=False)

print(f"\nResults exported to {export_filename}")

# Close the database connection
con.close()
print("Database connection closed")

## Conclusion and Recommendations

Based on the token usage analysis and cost projections, you can make informed decisions about:

1. **Which model to use for the full 50,000 comment analysis**:
   - Balance between cost, speed, and quality of analysis
   - Consider gpt-4.1-nano for cost-efficiency or gpt-4o for best quality

2. **Batch size optimization**:
   - Adjust batch sizes to optimize API calls
   - Smaller batches allow for better error recovery but more API calls

3. **Budget planning**:
   - Use the projected costs to set an appropriate budget
   - Consider splitting the analysis across multiple days if needed

4. **Error handling strategy**:
   - Implement robust error handling for the full analysis
   - Save progress regularly to allow for resuming after errors

This notebook provides a reliable estimate of token usage and costs without having to process the entire dataset.