In [77]:
import openai
import os
import pandas as pd
import numpy as np
#pd.set_option('display.max_colwidth', None)
pd.set_option("max_colwidth", None)
#pd.set_option("max_rows", None)

In [3]:
from dotenv import load_dotenv, find_dotenv
found = load_dotenv(find_dotenv())
if found:
    openai.api_key  = os.getenv('OPENAI_API_KEY')
else:
    print("couldn't find the key")

## Utility functions

In [50]:
def get_completion(transaction_prompt,
                   prompt_template,
                   system_prompt=None,
                   model="gpt-3.5-turbo",
                   max_tokens=250,
                   temperature=0,
                   top_p=1,
                   frequency_penalty=0,
                   presence_penalty=0):
    ''' openai chat completion object '''
    
    prompt_template = prompt_template.format(tran_desc=transaction_prompt)
    messages = [{"role": "user", "content": prompt_template}] 
    if system_prompt is not None:
        messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": prompt_template},
            ]
    
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty
    )
    return response.choices[0].message["content"]

In [16]:
# adopted from https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of messages."""
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

## Temporary sample data

In [11]:
df_ref = pd.read_csv('./100_labeled_consumer_transactions.csv')

In [95]:
for i in df_ref.sort_values(by='description', ascending=False).iterrows():
    if 'Wal' in i[1]['description'] or 'WAL' in i[1]['description']:
        print(i[0], i[1]['description'])

82 WAL-MART  Wal-  03/24 #000981637 PURCHASE                   WAL-MART  Wal-Mar  SAN DIEGO (C) CA


In [81]:
for i in df_ref.sort_values(by='description', ascending=False)[['description']]:
    print(i)
    if 'Wal' in i or 'WAL' in i:
        print(i)
        break

description


## PoC: get transaction category from transaction statement

In [47]:
system_prompt = "You are a financial assistant. You have to enrich transactions."

prompt_template = """
Given a bank transaction statement, extract the transaction category, Merchant and website.

Note: Use ' | ' as a seperator between transaction category and Merchant.

Examples:
Transaction: BEVERAGES & MOR GILROY CA 
Category: Food & Drink | Mechant: BevMo | website: bevmo.com
Transaction: EARLY PAY: PETCO ANIMAL SUP DIRECT DEP 661021916777KVV 012243
Category: Paycheck | Mechant: Petco | website: petco.com
Transaction: Acorns Investing Transfer 123980 JIKCR7 Cristiano Pombal
Category: Investment | Mechant: Aconrs | website: acorns.com
Transaction: USAA CREDIT CARD PAYMENT 9926283729231 WEB ID: 12987121
Category: Credit Card Bill | Merchant: USAA | usaa.com

user_prompt:
{tran_desc}
"""

sample_transaction_statement = 'BILL PAY BMW R18 CLASSIC RECURRING xxxxxx9876 ON 01-04'

In [51]:
get_completion(transaction_prompt= sample_transaction_statement,
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          model="gpt-3.5-turbo", max_tokens=500)

In [52]:
response

'Category: Bill Payment | Merchant: BMW | Website: N/A'

In [54]:
prompt_template = """
Given a bank transaction statement, extract the transaction category, Merchant and website.

Note: Use ' | ' as a seperator between transaction category and Merchant.

Examples:
Transaction: BEVERAGES & MOR GILROY CA 
Category: Food & Drink | Mechant: BevMo | website: bevmo.com
Transaction: EARLY PAY: PETCO ANIMAL SUP DIRECT DEP 661021916777KVV 012243
Category: Paycheck | Mechant: Petco | website: petco.com
Transaction: Acorns Investing Transfer 123980 JIKCR7 Cristiano Pombal
Category: Investment | Mechant: Aconrs | website: acorns.com
Transaction: USAA CREDIT CARD PAYMENT 9926283729231 WEB ID: 12987121
Category: Credit Card Bill | Merchant: USAA | usaa.com
Transaction: BILL PAY BMW R18 CLASSIC RECURRING xxxxxx9876 ON 01-04
Category: Auto Payment | Merchant: BMW | bmw.com

user_prompt:
{tran_desc}
"""

sample_transaction_statement = '0173 AMC MESQUITE 23 MESQUITE TX 01/06'

In [55]:
get_completion(transaction_prompt= sample_transaction_statement,
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          model="gpt-3.5-turbo", max_tokens=500)

'Category: Entertainment | Merchant: AMC | Website: amctheatres.com'

In [56]:
sample_transaction_statement = "Trader Joe's #012 Qps Sanfrancisco Ca"
get_completion(transaction_prompt= sample_transaction_statement,
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          model="gpt-3.5-turbo", max_tokens=500)

"Category: Food & Drink | Merchant: Trader Joe's | Website: traderjoes.com"

In [60]:
prompt_template = """
Given a bank transaction statement, extract the transaction category, Merchant and website.

Note: Use ' | ' as a seperator between transaction category and Merchant.

Examples:
Transaction: BEVERAGES & MOR GILROY CA 
Category: Food & Drink | Mechant: BevMo | website: bevmo.com
Transaction: EARLY PAY: PETCO ANIMAL SUP DIRECT DEP 661021916777KVV 012243
Category: Paycheck | Mechant: Petco | website: petco.com
Transaction: Acorns Investing Transfer 123980 JIKCR7 Cristiano Pombal
Category: Investment | Mechant: Aconrs | website: acorns.com
Transaction: USAA CREDIT CARD PAYMENT 9926283729231 WEB ID: 12987121
Category: Credit Card Bill | Merchant: USAA | usaa.com
Transaction: BILL PAY BMW R18 CLASSIC RECURRING xxxxxx9876 ON 01-04
Category: Auto Payment | Merchant: BMW | bmw.com
Transaction: Trader Joe's #012 Qps Sanfrancisco Ca
Category: Grocery | Merchant: Trader Joe's | traderjoes.com

user_prompt:
{tran_desc}
"""

In [61]:
sample_transaction_statement = "WAL-MART  Wal-  03/24 #000981637 PURCHASE"
get_completion(transaction_prompt= sample_transaction_statement,
                          prompt_template=prompt_template,
                          system_prompt=system_prompt,
                          model="gpt-3.5-turbo", max_tokens=500)

'Category: Retail | Merchant: Wal-Mart | Website: walmart.com'

In [None]:
def extract_response(self, pred: str) -> dict:
        pred = pred.replace(self.QUESTION_PROMPT, "").strip()
        if pred[-1] in [".", ";"]:  # Case when LLM add a punctuation point at the end
            pred = pred[:-1]
        # If output doesn't match the list of possible labels,
        # replace with "Not enough information"
        if pred not in self.ALL_LABELS:
            warnings.warn(
                f"LLM answer: '{pred}' not in label list, replacing by '{NOT_ENOUGH_INFO_LABEL}'."
            )
            pred = NOT_ENOUGH_INFO_LABEL
        return {"labels": pred}