In [20]:
# Import necessary libraries
import openai
import pandas as pd
import tiktoken  # For token counting
import math
import time

In [21]:
# Import the API key from config.py
from config import get_openai_api_key

# Set the OpenAI API key
openai.api_key = get_openai_api_key()

# Initialize the encoder for the gpt-4o model
encoding = tiktoken.encoding_for_model("gpt-4o")

In [23]:
def estimate_tokens(text):
    return len(encoding.encode(text))

def estimate_cost(total_tokens, model="gpt-4o"):
    # OpenAI pricing per 1,000 tokens
    pricing = {
        "gpt-4o": {
            "input": 2.50 / 1000,     # $0.0025 per 1K input tokens
            "output": 2.50 / 1000,    # $0.0025 per 1K output tokens
        },
        # Add other models and their pricing
    }

    model_pricing = pricing.get(model, {"input": 0.002, "output": 0.002})
    cost = {
        "input": (total_tokens['input'] / 1000) * model_pricing['input'],
        "output": (total_tokens['output'] / 1000) * model_pricing['output'],
        "total": 0
    }
    cost['total'] = cost['input'] + cost['output']
    return cost

In [24]:
def load_data(input_data):
    if isinstance(input_data, list):
        # If input is a list, convert it to a DataFrame
        fund_df = pd.DataFrame({'fund_name': input_data})
        fund_df['context'] = ''
    elif isinstance(input_data, pd.DataFrame):
        # Ask user for column names
        print("Available columns:", list(input_data.columns))
        name_column = input("Enter the column name for fund names: ").strip()
        context_columns = input("Enter additional context column names separated by commas (or leave blank): ").split(',')

        # Clean up context columns list
        context_columns = [col.strip() for col in context_columns if col.strip()]

        # Ensure specified columns exist
        required_columns = [name_column] + context_columns
        for col in required_columns:
            if col not in input_data.columns:
                raise ValueError(f"Column '{col}' not found in the DataFrame.")

        # Create a DataFrame with necessary columns
        fund_df = input_data[required_columns].copy()

        # Combine context columns into one if any
        if context_columns:
            fund_df['context'] = fund_df[context_columns].astype(str).agg(' '.join, axis=1)
        else:
            fund_df['context'] = ''
        
        fund_df.rename(columns={name_column: 'fund_name'}, inplace=True)
    else:
        raise ValueError("Input data must be a list or a pandas DataFrame.")
    
    return fund_df

In [27]:
def create_prompt(fund_df, classifications=None):
    prompt = (
        "Classify the following funds into their respective fund types. "
        "Use the provided classifications if applicable. "
        "For each fund, return a JSON array with 'fund_name', 'classification', "
        "and a 'reason' (less than 5 words) for the classification.\n\n"
    )
    if classifications:
        prompt += f"Possible Classifications: {', '.join(classifications)}\n\n"
    prompt += "Funds:\n"

    for idx, row in fund_df.iterrows():
        line = f"{row['fund_name']}"
        if row['context']:
            line += f" | Context: {row['context']}"
        prompt += f"{idx+1}. {line}\n"

    return prompt

def classify_funds(fund_df, model="gpt-4o", classifications=None):
    # Create the prompt
    prompt = create_prompt(fund_df, classifications)

    # Estimate tokens for the prompt
    prompt_tokens = estimate_tokens(prompt)

    # Maximum allowed tokens for the model
    max_context_length = 128000  # For gpt-4o
    max_response_tokens = min(max_context_length - prompt_tokens, 2000)  # Limit max tokens for response

    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=max_response_tokens,
            temperature=0.0,
        )

        # Extract the assistant's reply
        reply = response['choices'][0]['message']['content']
        usage = response['usage']
        return reply, usage

    except Exception as e:
        print(f"Error processing request: {e}")
        return None, {'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0}

def parse_response(reply):
    import json

    try:
        # Clean up the reply if necessary
        reply = reply.strip()
        # Load JSON from the reply
        results = pd.read_json(reply)
        return results
    except ValueError as e:
        print(f"Error parsing JSON: {e}")
        print("Raw reply:", reply)
        return pd.DataFrame()

In [28]:
# Example usage
if __name__ == "__main__":
    # Input data can be a list or a DataFrame

    input_data = ["Global Equity Fund", "Emerging Markets Bond Fund", "Tech Growth Fund"]

    # Load and preprocess data
    fund_df = load_data(input_data)

    # Provide classifications if available
    classifications = [
        'Equity', 'Fixed Income', 'Balanced', 'Money Market',
        'Commodity', 'Real Estate', 'Alternative Investments', 'Technology', 'Emerging Markets'
    ]

    # Estimate tokens and cost before proceeding
    prompt = create_prompt(fund_df, classifications)
    prompt_tokens = estimate_tokens(prompt)
    # Estimate expected output tokens (adjust as needed)
    estimated_output_tokens = 500
    total_estimated_tokens = {'input': prompt_tokens, 'output': estimated_output_tokens}
    estimated_cost = estimate_cost(total_estimated_tokens, model="gpt-4o")

    print(f"Estimated input tokens: {total_estimated_tokens['input']}")
    print(f"Estimated output tokens: {total_estimated_tokens['output']}")
    print(f"Estimated total cost: ${estimated_cost['total']:.4f}")
    print(f" - Input cost: ${estimated_cost['input']:.4f}")
    print(f" - Output cost: ${estimated_cost['output']:.4f}")

    # Decide whether to proceed based on estimated cost
    proceed = False  # Set to False if you want to halt based on cost
    if proceed:
        # Classify funds
        reply, usage = classify_funds(fund_df, model="gpt-4o", classifications=classifications)

        # Update actual token usage
        actual_tokens_used = {'input': usage['prompt_tokens'], 'output': usage['completion_tokens']}
        actual_cost = estimate_cost(actual_tokens_used, model="gpt-4o")
        print(f"Actual input tokens used: {actual_tokens_used['input']}")
        print(f"Actual output tokens used: {actual_tokens_used['output']}")
        print(f"Actual total cost: ${actual_cost['total']:.4f}")
        print(f" - Input cost: ${actual_cost['input']:.4f}")
        print(f" - Output cost: ${actual_cost['output']:.4f}")

        # Parse the response
        final_df = parse_response(reply)

        # Display the results
        if not final_df.empty:
            display(final_df)
        else:
            print("No results to display.")
    else:
        print("Process halted due to estimated cost.")

Estimated input tokens: 98
Estimated output tokens: 500
Estimated total cost: $0.0015
 - Input cost: $0.0002
 - Output cost: $0.0013
Process halted due to estimated cost.
