In [1]:
import openai
import os
import pandas as pd
import numpy as np
# Show full cell contents instead of truncating long transaction descriptions.
# The fully qualified option name is used because pandas warns that abbreviated
# option names can break when similarly named options are added.
# To also lift the row limit: pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [2]:
from dotenv import load_dotenv, find_dotenv
# Merge variables from a .env file (if one exists) into the environment.
# `found` only reports whether a .env file supplied variables; the key itself
# may be absent from that file or may already be exported in the shell, so the
# check below looks at the key rather than at `found`.
found = load_dotenv(find_dotenv())
if os.getenv('OPENAI_API_KEY'):
    openai.api_key = os.getenv('OPENAI_API_KEY')
else:
    print("couldn't find the key")

## Utility functions

In [3]:
def get_completion(
    prompt_template,
    input_text=None,
    system_prompt=None,
    model="gpt-3.5-turbo",
    max_tokens=250,
    temperature=0,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0,
    verbose=False,
):
    """Send one user prompt to the chat-completion endpoint and return the reply text.

    Args:
        prompt_template: Prompt text. When ``input_text`` is given it is filled in
            with ``prompt_template.format(input_text=input_text)``; otherwise it is
            sent unchanged (no formatting is applied).
        input_text: Optional value substituted for ``{input_text}``.
        system_prompt: Optional system message placed before the user message.
        model, max_tokens, temperature, top_p, frequency_penalty, presence_penalty:
            Forwarded unchanged to ``openai.ChatCompletion.create``.
        verbose: When true, print the system prompt and the final user prompt
            before the request is made.

    Returns:
        The ``"content"`` of the first choice's message.
    """
    if input_text is not None:
        prompt_template = prompt_template.format(input_text=input_text)

    # The system message (if any) always precedes the single user message.
    messages = []
    if system_prompt is not None:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt_template})

    if verbose:
        print(f'system prompt:\n{system_prompt}\n')
        print(f'prompt template:\n {prompt_template}\n')

    sampling_options = {
        "temperature": temperature,
        "top_p": top_p,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
        "max_tokens": max_tokens,
    }
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        **sampling_options,
    )
    first_choice = response.choices[0]
    return first_choice.message["content"]

In [36]:
# Adapted from https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken
def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
    """Return the number of tokens used by a list of chat messages.

    Args:
        messages: Iterable of message dicts (e.g. ``{"role": ..., "content": ...}``).
            Every value of every message is encoded and counted, so all values
            must be strings.
        model: Model name used to pick the tokenizer and the per-message overhead.
            Unversioned ``gpt-3.5-turbo*`` / ``gpt-4*`` names are counted as their
            ``-0613`` snapshots, after printing a warning.

    Returns:
        Total token count, including a fixed 3-token allowance for the reply primer.

    Raises:
        NotImplementedError: If ``model`` matches none of the known model families.
    """
    # Imported here rather than relying on a later cell: the notebook only
    # imports tiktoken further down, so a module-level reference would raise
    # NameError when the cells are run top to bottom on a fresh kernel.
    import tiktoken

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")
    if model in {
        "gpt-3.5-turbo-0613",
        "gpt-3.5-turbo-16k-0613",
        "gpt-4-0314",
        "gpt-4-32k-0314",
        "gpt-4-0613",
        "gpt-4-32k-0613",
        }:
        tokens_per_message = 3
        tokens_per_name = 1
    elif model == "gpt-3.5-turbo-0301":
        tokens_per_message = 4  # every message follows <|start|>{role/name}\n{content}<|end|>\n
        tokens_per_name = -1  # if there's a name, the role is omitted
    elif "gpt-3.5-turbo" in model:
        print("Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0613.")
        return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613")
    elif "gpt-4" in model:
        print("Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.")
        return num_tokens_from_messages(messages, model="gpt-4-0613")
    else:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                # Adjustment applied once per message that carries a "name" field.
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

## Temporary sample data

In [37]:
# Load the reference transactions (labeled, going by the file name); the cells
# below read its 'description' column.
df_ref = pd.read_csv('./100_labeled_consumer_transactions.csv')

In [None]:
# Print index label and description for every row whose description contains
# 'Wal' or 'WAL', visiting rows in descending description order.
for row_label, row in df_ref.sort_values(by='description', ascending=False).iterrows():
    description = row['description']
    has_match = 'Wal' in description or 'WAL' in description
    if has_match:
        print(row_label, description)

In [None]:
# Print the first description (in descending order) containing 'Wal' or 'WAL'.
# The loop runs over the column's values: iterating a DataFrame, which is what
# df[['description']] returns, yields its column labels rather than its rows,
# so the match could never fire.
for description in df_ref.sort_values(by='description', ascending=False)['description']:
    if 'Wal' in description or 'WAL' in description:
        print(description)
        break

## PoC: get transaction category from transaction statement

In [38]:
system_prompt = "You are a financial assistant. You have to enrich transactions."

# Few-shot prompt. Every example answer uses the same three labelled fields
# (Category | Merchant | website) so the model sees one consistent output
# format to imitate. {input_text} is filled in by get_completion().
prompt_template = """
Given a bank transaction statement, extract the transaction category, Merchant and website.

Note: Use ' | ' as a separator between transaction category and Merchant.

Examples:
Transaction: BEVERAGES & MOR GILROY CA
Category: Food & Drink | Merchant: BevMo | website: bevmo.com
Transaction: EARLY PAY: PETCO ANIMAL SUP DIRECT DEP 661021916777KVV 012243
Category: Paycheck | Merchant: Petco | website: petco.com
Transaction: Acorns Investing Transfer 123980 JIKCR7 Cristiano Pombal
Category: Investment | Merchant: Acorns | website: acorns.com
Transaction: USAA CREDIT CARD PAYMENT 9926283729231 WEB ID: 12987121
Category: Credit Card Bill | Merchant: USAA | website: usaa.com

user_prompt:
{input_text}
"""

input_text = 'BILL PAY BMW R18 CLASSIC RECURRING xxxxxx9876 ON 01-04'

In [39]:
# Classify the current input_text with the few-shot prompt; the returned string
# is the cell's displayed value.
get_completion(
    prompt_template=prompt_template,
    input_text=input_text,
    system_prompt=system_prompt,
    model="gpt-3.5-turbo",
    max_tokens=500,
)

'Category: Bill Payment | Merchant: BMW | Website: N/A'

In [40]:
# Few-shot prompt, now including the BMW bill-pay transaction as an example.
# Every example answer uses the same three labelled fields
# (Category | Merchant | website) so the model sees one consistent format.
prompt_template = """
Given a bank transaction statement, extract the transaction category, Merchant and website.

Note: Use ' | ' as a separator between transaction category and Merchant.

Examples:
Transaction: BEVERAGES & MOR GILROY CA
Category: Food & Drink | Merchant: BevMo | website: bevmo.com
Transaction: EARLY PAY: PETCO ANIMAL SUP DIRECT DEP 661021916777KVV 012243
Category: Paycheck | Merchant: Petco | website: petco.com
Transaction: Acorns Investing Transfer 123980 JIKCR7 Cristiano Pombal
Category: Investment | Merchant: Acorns | website: acorns.com
Transaction: USAA CREDIT CARD PAYMENT 9926283729231 WEB ID: 12987121
Category: Credit Card Bill | Merchant: USAA | website: usaa.com
Transaction: BILL PAY BMW R18 CLASSIC RECURRING xxxxxx9876 ON 01-04
Category: Auto Payment | Merchant: BMW | website: bmw.com

user_prompt:
{input_text}
"""

input_text = '0173 AMC MESQUITE 23 MESQUITE TX 01/06'

In [41]:
# Run the current input_text through the updated few-shot prompt; the returned
# string is the cell's displayed value.
get_completion(
    prompt_template=prompt_template,
    input_text=input_text,
    system_prompt=system_prompt,
    model="gpt-3.5-turbo",
    max_tokens=500,
)

'Category: Entertainment | Merchant: AMC | Website: amctheatres.com'

In [42]:
# Try a grocery-store transaction with the current few-shot prompt.
input_text = "Trader Joe's #012 Qps Sanfrancisco Ca"
get_completion(
    prompt_template=prompt_template,
    input_text=input_text,
    system_prompt=system_prompt,
    model="gpt-3.5-turbo",
    max_tokens=500,
)

"Category: Food & Drink | Merchant: Trader Joe's | Website: traderjoes.com"

In [43]:
# Few-shot prompt, now including the Trader Joe's transaction as a "Grocery"
# example. Every example answer uses the same three labelled fields
# (Category | Merchant | website) so the model sees one consistent format.
prompt_template = """
Given a bank transaction statement, extract the transaction category, Merchant and website.

Note: Use ' | ' as a separator between transaction category and Merchant.

Examples:
Transaction: BEVERAGES & MOR GILROY CA
Category: Food & Drink | Merchant: BevMo | website: bevmo.com
Transaction: EARLY PAY: PETCO ANIMAL SUP DIRECT DEP 661021916777KVV 012243
Category: Paycheck | Merchant: Petco | website: petco.com
Transaction: Acorns Investing Transfer 123980 JIKCR7 Cristiano Pombal
Category: Investment | Merchant: Acorns | website: acorns.com
Transaction: USAA CREDIT CARD PAYMENT 9926283729231 WEB ID: 12987121
Category: Credit Card Bill | Merchant: USAA | website: usaa.com
Transaction: BILL PAY BMW R18 CLASSIC RECURRING xxxxxx9876 ON 01-04
Category: Auto Payment | Merchant: BMW | website: bmw.com
Transaction: Trader Joe's #012 Qps Sanfrancisco Ca
Category: Grocery | Merchant: Trader Joe's | website: traderjoes.com

user_prompt:
{input_text}
"""

In [44]:
# Try a Wal-Mart purchase line with the current few-shot prompt.
input_text = "WAL-MART  Wal-  03/24 #000981637 PURCHASE"
get_completion(
    prompt_template=prompt_template,
    input_text=input_text,
    system_prompt=system_prompt,
    model="gpt-3.5-turbo",
    max_tokens=500,
)

'Category: Retail | Merchant: Wal-Mart | Website: walmart.com'

In [None]:
def extract_response(self, pred: str) -> dict:
    """Normalise a raw LLM answer into a ``{"labels": <label>}`` dict.

    The question prompt (``self.QUESTION_PROMPT``) is removed from the answer,
    surrounding whitespace is stripped, and a single trailing ``.`` or ``;`` is
    dropped. An answer that is not in ``self.ALL_LABELS`` (including an empty
    answer) is replaced by ``NOT_ENOUGH_INFO_LABEL`` and a warning is emitted.

    NOTE(review): ``NOT_ENOUGH_INFO_LABEL`` is not defined in the visible
    cells; it must exist at module level before the fallback branch runs.

    Args:
        pred: Raw text returned by the model.

    Returns:
        Dict with the single key ``"labels"``.
    """
    # Local import: the notebook's import cell does not bring in `warnings`.
    import warnings

    pred = pred.replace(self.QUESTION_PROMPT, "").strip()
    # endswith() is safe on an empty answer, where indexing pred[-1] would raise.
    if pred.endswith((".", ";")):  # Case when LLM adds a punctuation mark at the end
        pred = pred[:-1]
    # If output doesn't match the list of possible labels,
    # replace with "Not enough information"
    if pred not in self.ALL_LABELS:
        warnings.warn(
            f"LLM answer: '{pred}' not in label list, replacing by '{NOT_ENOUGH_INFO_LABEL}'."
        )
        pred = NOT_ENOUGH_INFO_LABEL
    return {"labels": pred}

### Create fake transactions

In [30]:
# Prompt for generating synthetic bank-transaction descriptions. It has no
# {input_text} placeholder, so it is sent as-is (get_completion with
# input_text=None). The example lines are what the model imitates, so their
# spelling is kept clean.
prompt_template = """
Generate a list of diverse banking transactions for a fictional character named Robert King,
who is a CEO of a hedge fund in San Francisco. He is wealthy and has various banking activities.
Create transactions that reflect activities like tax payments, salary credits, payroll, loan payments,
benefits, investments, and app-related transfers, such as Bill pay, Venmo, PayPal, Apple Cash, and more.

Maintain privacy and use a standard financial transaction format.
Include different transaction types and channels. Out of every 10 transactions, 7
should be standard financial activities, and 3 should be fun references to famous movies,
but ensure they are still plausible as bank transactions.

Avoid repetition, and do not use the same transaction type more than twice.
Below are the details for each transaction type:

1. ATM transactions - withdrawals and deposits at various locations in the US mostly California and New York.
2. ACH payroll - salary and wages payments.
3. Online bill payments - payments for utilities, credit cards, and other services.
4. App-related payments - casual payments to friends or for services using apps like Venmo.
5. Tax payments - payments to the IRS and local tax authorities.
6. Loan payments - mortgage or car loan payments.
7. Investment activities - purchases or sales of stocks and other securities.
8. Cashier's checks, wire transfers, and remote deposits.
9. Point-of-Sales (POS) purchases

Examples are as follows:

'ATM WITHDRAWAL #1278 69 AWESOME AVE MIAMI FL TERMIL NH299912'.
'ATM DEPOSIT #1111 22 S MARKET STREET SANFRANCISCO CA TERMIL'.
'ACH DEBIT PAYROLL PAYROLL ROBERT KING'.
'ACH CREDIT GOOGLE DIRECT DEP , ROBERT KING'.
'POS PURCHASE #0755 UBER TECHNOLOGIES, INC, SANFRANCISCO, CA'
'POS PURCHASE #9876 7-ELEVEN FAIRFAX CA'
'ZELLE CREDIT PAYMENT FROM: HENRY FORD'
'ZELLE DEBIT PAYMENT TO: SHELDON BIG'
'ACH DEBIT IRS USATAXPYMT ROBERT KING'
'INCOMING WIRE FIDELITY NATIONAL TITLE COMPANY'
'AUTO TRANSFER TO LN TRANSFER TO SCHEDULED LOAN PAYMENT XXXXX212'
'DOMESTIC WIRE SEQUOIA MORTGAGE T R 444'
'CASHIERS CHECK WD TLR 55 BR 43 5567777 J&G INVESTMENT FUND'
'REMOTE DEPOSIT KING 40'
'ACH DEBIT PAYPAL INST XFER JULIA ROBERTS'

Now, create 10 diverse transactions for Robert King using the above instructions and the specified formats.

"""

In [19]:
# Generate synthetic transactions with gpt-4. The prompt is sent verbatim
# (no input_text, no system prompt) and echoed because verbose=True.
results_gpt_4 = get_completion(
    prompt_template=prompt_template,
    input_text=None,
    system_prompt=None,
    model="gpt-4",
    max_tokens=3000,
    verbose=True,
)

system prompt:
None

prompt template:
 
Generate a list of diverse banking transactions for a fictional character named Robert King,
who is a CEO of a hedge fund in San Francisco. He is wealthy and has various banking activities. 
Create transactions that reflect activities like tax payments, salary credits, payroll, loan payments, 
benefits, investments, and app-related transfers, such as Bill pay, Venmo, PayPal, Apple Cash, and more.

Maintain privacy and use a standard financial transaction format. 
Include different transaction types and channels. Out of every 10 transactions, 7
should be standard financial activities, and 3 should be fun references to famous movies,
but ensure they are still plausible as bank transactions. 

Avoid repetition, and do not use the same transaction type more than twice. 
Below are the details for each transaction type:

1. ATM transactions - withdrawals and deposits at various locations in the US mostly California and New York.
2. ACH payroll - salary

In [16]:
# Same generation request against gpt-3.5-turbo, for comparison with gpt-4.
results = get_completion(
    prompt_template=prompt_template,
    input_text=None,
    system_prompt=None,
    model="gpt-3.5-turbo",
    max_tokens=3000,
    verbose=True,
)

system prompt:
None

prompt template:
 
Generate a list of diverse banking transactions for a fictional character named Robert King,
who is a CEO of a hedge fund in San Francisco. He is wealthy and has various banking activities. 
Create transactions that reflect activities like tax payments, salary credits, payroll, loan payments, 
benefits, investments, and app-related transfers, such as Bill pay, Venmo, PayPal, Apple Cash, and more.

Maintain privacy and use a standard financial transaction format. 
Include different transaction types and channels. Out of every 10 transactions, 7
should be standard financial activities, and 3 should be fun references to famous movies,
but ensure they are still plausible as bank transactions. 

Avoid repetition, and do not use the same transaction type more than twice. 
Below are the details for each transaction type:

1. ATM transactions - withdrawals and deposits at various locations in the US mostly California and New York.
2. ACH payroll - salary

In [18]:
# print() renders the newlines in the gpt-3.5-turbo reply, one transaction per line.
print(results)

1. ATM WITHDRAWAL #1234 123 MAIN ST SAN FRANCISCO CA TERMIL
2. ACH CREDIT PAYROLL PAYMENT - ROBERT KING
3. ONLINE BILL PAYMENT - ELECTRICITY BILL - ROBERT KING
4. VENMO PAYMENT TO JOHN DOE - DINNER AT CHEZ PIERRE
5. TAX PAYMENT - IRS - ROBERT KING
6. LOAN PAYMENT - MORTGAGE - ROBERT KING
7. STOCK PURCHASE - APPLE INC. - ROBERT KING
8. WIRE TRANSFER - INTERNATIONAL - ROBERT KING
9. POS PURCHASE #5678 - GROCERY STORE - SAN FRANCISCO CA
10. CASHIER'S CHECK - PAYMENT TO CHARITY - ROBERT KING


In [20]:
# print() renders the newlines in the gpt-4 reply, one transaction per line.
print(results_gpt_4)

1. 'ATM WITHDRAWAL #3456 123 BROADWAY NEW YORK NY TERMIL NH345612'
2. 'ACH DEBIT PAYROLL PAYROLL ROBERT KING'
3. 'ONLINE BILL PAYMENT #7890 PG&E UTILITIES, SAN FRANCISCO, CA'
4. 'VENMO DEBIT PAYMENT TO: JOHN WICK FOR DOG SITTING'
5. 'ACH DEBIT IRS USATAXPYMT ROBERT KING'
6. 'AUTO TRANSFER TO LN TRANSFER TO SCHEDULED LOAN PAYMENT XXXXX345'
7. 'INVESTMENT ACTIVITY #5678 PURCHASE OF TESLA STOCKS'
8. 'DOMESTIC WIRE TRANSFER #1234 TO GOLDMAN SACHS FOR INVESTMENT'
9. 'POS PURCHASE #4567 STARBUCKS, SAN FRANCISCO, CA'
10. 'APPLE CASH CREDIT PAYMENT FROM: TONY STARK FOR POKER NIGHT WINNINGS'


In [31]:
# Alternative generation prompt: instead of describing each transaction type, it
# shows a numbered list of ten example transactions and asks the model to
# continue the list. It has no {input_text} placeholder.
prompt_template = """
Here are a series of banking transactions for a fictional individual named Robert King, 
who is CEO of a hedge fund in San Francisco and serves on multiple local boards. 
His transactions are diverse and include tax payments, salary credits, payroll processing, 
loan payments, benefits, investments, and various forms of online banking and app-related payments 
like Venmo, PayPal, and Apple Cash. For a touch of creativity, include occasional references to 
famous movies, with the ratio of about two movie-related transactions to every ten real-world banking transactions. 
Here are some examples:

1. ATM WITHDRAWAL #1278 69 AWESOME AVE MIAMI FL TERMIL NH299912
2. ATM DEPOSIT #1111 22 S MARKET STREET SANFRANCISCO CA TERMIL
3. ACH DEBIT PAYROLL PAYROLL ROBERT KING
4. ACH CREDIT GOOGLE DIRECT DEP, ROBERT KING
5. POS PURCHASE #0755 UBER TECHNOLOGIES, INC, SANFRANCISCO, CA
6. POS PURCHASE #9876 7-ELEVEN FAIRFAX CA
7. ZELLE CREDIT PAYMENT FROM: HENRY FORD
8. ZELLE DEBIT PAYMENT TO: SHELDON BIG
9. ACH DEBIT IRS USATAXPYMT ROBERT KING
10. INCOMING WIRE FIDELITY NATIONAL TITLE COMPANY

Please continue to create a similar list of transactions for Robert King, making sure each one is clear, 
plausible, and respects privacy standards while maintaining a standard format for financial statements. 
Remember to not repeat similar transactions more than twice and to keep the playful movie references subtle 
and infrequent.
"""

In [33]:
# Run the list-continuation prompt through gpt-3.5-turbo; the prompt is sent
# verbatim (no input_text, no system prompt) and echoed because verbose=True.
result_GPT3_new_prompt = get_completion(
    prompt_template=prompt_template,
    input_text=None,
    system_prompt=None,
    model="gpt-3.5-turbo",
    max_tokens=3000,
    verbose=True,
)

system prompt:
None

prompt template:
 
Here are a series of banking transactions for a fictional individual named Robert King, 
who is CEO of a hedge fund in San Francisco and serves on multiple local boards. 
His transactions are diverse and include tax payments, salary credits, payroll processing, 
loan payments, benefits, investments, and various forms of online banking and app-related payments 
like Venmo, PayPal, and Apple Cash. For a touch of creativity, include occasional references to 
famous movies, with the ratio of about two movie-related transactions to every ten real-world banking transactions. 
Here are some examples:

1. ATM WITHDRAWAL #1278 69 AWESOME AVE MIAMI FL TERMIL NH299912
2. ATM DEPOSIT #1111 22 S MARKET STREET SANFRANCISCO CA TERMIL
3. ACH DEBIT PAYROLL PAYROLL ROBERT KING
4. ACH CREDIT GOOGLE DIRECT DEP, ROBERT KING
5. POS PURCHASE #0755 UBER TECHNOLOGIES, INC, SANFRANCISCO, CA
6. POS PURCHASE #9876 7-ELEVEN FAIRFAX CA
7. ZELLE CREDIT PAYMENT FROM: HENRY FORD

In [35]:
# print() renders the newlines in the reply, one transaction per line.
print(result_GPT3_new_prompt)

11. ATM WITHDRAWAL #1357 123 MAIN STREET SAN FRANCISCO CA TERMIL
12. ATM DEPOSIT #2222 456 PARK AVENUE SAN FRANCISCO CA TERMIL
13. ACH DEBIT PAYROLL PAYROLL ROBERT KING
14. ACH CREDIT APPLE DIRECT DEP, ROBERT KING
15. POS PURCHASE #0987 STARBUCKS, SAN FRANCISCO, CA
16. POS PURCHASE #5432 WHOLE FOODS MARKET, SAN FRANCISCO, CA
17. ZELLE CREDIT PAYMENT FROM: LUCY WILLIAMS
18. ZELLE DEBIT PAYMENT TO: JAMES ANDERSON
19. ACH DEBIT IRS USATAXPYMT ROBERT KING
20. INCOMING WIRE JP MORGAN CHASE BANK

21. ATM WITHDRAWAL #2468 789 OCEAN AVENUE SAN FRANCISCO CA TERMIL
22. ATM DEPOSIT #3333 987 SUNSET BOULEVARD SAN FRANCISCO CA TERMIL
23. ACH DEBIT PAYROLL PAYROLL ROBERT KING
24. ACH CREDIT AMAZON DIRECT DEP, ROBERT KING
25. POS PURCHASE #6543 NETFLIX, SAN FRANCISCO, CA
26. POS PURCHASE #2109 TARGET, SAN FRANCISCO, CA
27. ZELLE CREDIT PAYMENT FROM: EMILY JOHNSON
28. ZELLE DEBIT PAYMENT TO: MICHAEL THOMPSON
29. ACH DEBIT IRS USATAXPYMT ROBERT KING
30. INCOMING WIRE BANK OF AMERICA

31. ATM WITHDRAWAL

In [5]:
import tiktoken
# Count the tokens in whatever prompt_template currently holds, using the
# tokenizer tiktoken selects for gpt-3.5-turbo. The count is the cell's output.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
len(encoding.encode(prompt_template))

188

### Predefined categories

In [None]:
# Category taxonomy: top-level group -> list of transaction labels.
# Bound to a name so later cells can reference it (for example to validate a
# model's answer against the allowed labels); a bare dict literal would be
# evaluated and thrown away. Some labels appear under more than one group.
PREDEFINED_CATEGORIES = {
    "Earned Income": [
        "Freelance",
        "Paycheck",
        "Rideshare and delivery",
    ],
    "Essential Expenses": [
        "Auto lease payment",
        "Auto loan repayment",
        "Childcare",
        "Contribution to reserve fund",
        "Council tax",
        "Credit card bill",
        "Credit card fee",
        "Credit report",
        "Debt collection",
        "Drugstores and pharmacies",
        "Education",
        "Fuel",
        "Funerals and bequests",
        "Government",
        "Groceries",
        "Insurance",
        "Interest",
        "Loan repayment",
        "Medical bill",
        "Mortgage",
        "Other transport",
        "Pets",
        "Public transport",
        "Rent and property management fee",
        "Retirement contributions",
        "Ridesharing and taxis",
        "Student loan repayment",
        "Taxes",
        "Utilities",
        "Vehicle maintenance",
    ],
    "Non-Essential Expenses": [
        "ATM/bank withdrawal",
        "App stores",
        "Bank fee",
        "Books, newsletters, newspapers",
        "Buy now, pay later",
        "Cafes and coffee shops",
        "Clothing",
        "Convenience stores",
        "Department and discount stores",
        "Donation",
        "Electronics",
        "Entertainment and recreation",
        "Firearms",
        "Food and Drink",
        "Gambling",
        "Gifts",
        "Home improvements and maintenance services",
        "Hotels and lodging",
        "Inter account transfer",
        "Intra account transfer",
        "Investment",
        "Laundry",
        "Legal services",
        "Liquor",
        "Media",
        "Other consumer services",
        "Other non-essential",
        "Pawn shops",
        "Peer to peer transfer",
        "Recreational goods",
        "Rent to own",
        "SaaS tools",
        "Self care",
        "Sport and fitness",
        "Toll charge",
        "Towing companies",
        "Trading (crypto)",
        "Trading (non-crypto)",
        "eCommerce purchase",
    ],
    "Other Incoming Transactions": [
        "ATM/bank deposit",
        "Cashback",
        "Chargeback",
        "Ecommerce",
        "Grants and stipends",
        "Insurance",
        "Inter account transfer",
        "Intra account transfer",
        "Loans",
        "Not enough information",
        "Other",
        "Pawn shops",
        "Peer to peer transfer",
        "Prenote",
        "Refund",
        "Reversal / adjustment",
        "Tax refund",
        "Trading (crypto)",
        "Trading (non-crypto)",
        "missing account holder information",
    ],
    "Other Outgoing Transactions": [
        "Non-sufficient funds / Overdraft fee",
        "Not enough information",
        "Prenote",
        "Reversal / adjustment",
        "missing account holder information",
    ],
    "Passive Income": [
        "Benefits",
        "Interest / dividend",
        "Property rental",
    ],
}
# Keep the cell's original behaviour of displaying the taxonomy.
PREDEFINED_CATEGORIES

In [25]:
# Show the second '\n'-separated line of transaction_samples.
# NOTE(review): transaction_samples is not assigned in any visible cell, so this
# presumably relies on leftover kernel state; confirm where it comes from.
transaction_samples.split('\n')[1]

'1. `ACH CREDIT SALARY KING HEDGE FUND LLC ROBERT KING`'

In [2]:
# Read the whole synthetic-transactions file into a single string.
with open('./synthetic_transactions.txt', 'r') as transactions_file:
    transaction_text = transactions_file.read()

In [4]:
# One entry per transaction line, sorted. Blank lines (such as the empty string
# a trailing newline leaves behind after split) are dropped so that random
# sampling never returns an empty "transaction".
test_trans = sorted(line for line in transaction_text.split('\n') if line.strip())

In [9]:
import random
# Spot-check five randomly drawn entries (drawn with replacement, unseeded).
for draw_number in range(5):
    picked = random.choice(test_trans)
    print(picked)

ACH DEBIT - CHARITY DONATION - SAVE THE REDWOODS LEAGUE
INCOMING PAYPAL TRANSFER - "SNAKE EYES" GAME APP PROCEEDS
INCOMING WIRE REFUND - CASABLANCA RESORT & SPA
MOBILE DEPOSIT - CHECK #2875 FROM "THE USUAL SUSPECTS" FILM ROYALTIES
VENMO PAYMENT TO: TONY STARK IRON SUITS
