# Credit Analysis 
See README.md details

In [1]:
import os
if not os.environ['OPENAI_API_KEY']:
    raise Exception('Store your OpenAI Key in your enviroment')

In [2]:
import decimal
import json

# See docsumo.ipynb for code to access DocSumo.
cached_response_path = (
    "docsumo_api_responses/docsumo_58080714c51a4764ad0181565cad604e_response.json"
)
with open(cached_response_path) as f:
    data = json.load(f)["data"]

# Docsumo gives tells us how confident it is in its predictions.
# Only use confident predictions
minimum_confidence = 0.8
basic_information_dict = {
    k: v['orig_value']
    for k, v in data['Basic Information'].items()
    if v and v['confidence'] > minimum_confidence and k
}

# Docsumo creates a seperate list for each page of the PDF
line_items_by_page = data["Transactions"]

# Merge line items from each page into a single list
raw_line_items = [item for sublist in line_items_by_page.values() for item in sublist]

# Validate Docsumo's confidence in transaction data
for row in raw_line_items:
    for k, v in row.items():
        if v["confidence"] != 1.0:
            raise Exception('Docsumo had issue, and is not sure about transaction data')


# Strip out Docsumo meta data
line_items = [{k: v["value"] for k, v in item.items()} for item in raw_line_items]

# Convert currency to Decimal
for header in ["debit", "credit", "balance"]:
    for item in line_items:
        if item[header]:
            item[header] = decimal.Decimal(str(item[header]))

# Add an index, which we'll use to enrich data 
for i, line_item in enumerate(line_items):
    line_item["index"] = i

# Remove empty columns
for line_item in line_items:
    del line_item['merchant']
    del line_item['subcategory']
    del line_item['type']


# Calculate on how many days balance fell below 200.00
days_balance_below_200 = len(
    set(t['date'] for t in line_items if t['balance'] < decimal.Decimal('200.00'))
)

# Get average daily ending balance
daily_ending_balances = {t['date']: t['balance'] for t in line_items}

average_balance = sum(daily_ending_balances.values()) / len(daily_ending_balances) 


In [3]:
from collections import defaultdict

# CEO asked for monthly deposits and withdrawals
monthly_debits = defaultdict(lambda: decimal.Decimal("0.0"))
monthly_credits = defaultdict(lambda: decimal.Decimal("0.0"))

for item in line_items:
    date_parts = item["date"].split("/")
    month = date_parts[2] + "-" + date_parts[0]
    if item["debit"]:
        monthly_debits[month] += item["debit"]
    else:
        monthly_credits[month] += item["credit"]

months = sorted(list(set(monthly_debits.keys()).intersection(monthly_credits.keys())))
print("Month       Credits    Debits")
monthly_data_str = ""
for month in months:
    monthly_data_str += f"{month:8} {monthly_credits[month]:10.2f} {monthly_debits[month]:10.2f}\n"
print(monthly_data_str)

Month       Credits    Debits
2018-04    10671.20   10671.00
2018-05     9934.06    9583.00
2018-06     2189.19    2196.60
2018-07     8234.41    8554.56
2018-08     7501.00    3540.00
2018-09    26451.63   23234.89
2018-10    68944.89   21062.61
2018-11      408.90   55479.00
2018-12    14380.33   14317.00
2019-01     5610.00    2607.60
2019-02     7567.34   10612.00
2019-03     7847.81    7859.00



In [6]:
# Pass information to Chat gpt
import re
from typing import Dict, List, Union
from openai import OpenAI


def extract_and_convert_json(input_str) -> Union[Dict, List]:
    """
    Return the first valid JSON in the text block.
    ChatGPT sometimes comments on the data, rather than returning JSON asked. 
    """
    # Regular expression to find a JSON substring.
    # This pattern looks for the JSON structure starting with either a list ([) or an object ({)
    # and continues until the corresponding closing bracket (]) or brace (}).
    json_pattern = re.compile(r"(\{.*?\}|\[.*?\])(?![^\[]*\])", re.DOTALL)

    # Search for JSON substring in the input string
    matches = json_pattern.findall(input_str)
    if matches:
        # Match first JSON valid substring found
        json_str = matches[0]
        try:
            # Convert the JSON string to a dict or list
            json_data = json.loads(json_str)
            return json_data
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return None
    else:
        raise Exception("No JSON found in input")


client = OpenAI()


def get_likelihood_of_being_lender(transactions: List[Dict]) -> List[Dict[str, Union[int, float, str]]]:
    """Return the  likelihood that a transaction is with a lender.
    Returns a list of dicts with an index for matching back to line_items.
    """

    prompt_content = """
    You analyze a list of transactions. The transactions are for a person in India. 
    Rewiew the transactions. Return a JSON, a list of dicts with three keys:
    - "index": the index of the transaction
    - "is_lender_likelihood": likelihood [0-1.0] that transaction is with a lender
    - "reason": reason for this estimate if estimate greater than 0. Limit the output of this str to 100 chars.

    - It is possible none of the transactions are with a lender. 

    For example, when processing 
    ..., {"index": 2, "debit": "1912.0", "credit": None, "balance": None, "description": 'NACH/TP ACH Bajaj Finanac/88551679'}, ...
    you would return 
    ..., {"index": 2, "is_lender_likelihood": 1.00, "reason": "Bajaj Finserve is a popular micro finance lender. NACH/TP is commonly used for loan disbursements"}, ...
    
    Your output must be JSON.  Do not add additional commentary because it will break the JSON parser.
    """

    cleaned_transactions = [
        {
            "index": t["index"],
            "debit": str(t["debit"]) if t["debit"] else None,
            "credit": str(t["credit"]) if t["credit"] else None,
            "balance": str(t["balance"]) if t["balance"] else None,
            "description": t["description"],
        }
        for t in transactions
    ]
    transactions_json = json.dumps(cleaned_transactions)
    # Print output so we can see progress
    print(f"get_likelihood_of_being_lender() called on {transactions_json[:5]}...")

    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                "content": "You are system for categorizing information in bank statements .",
            },
            {"role": "user", "content": prompt_content + transactions_json},
        ],
    )
    print("Received response from OpenAI")
    try:
        s = completion.choices[0].message.content.replace("\n", "")
        return extract_and_convert_json(s)

    except:
        print("errored out returning completion")
        return completion


# line_items[0]

In [None]:
# In batches of 10, use OpenAI to determine likelihood each transaction is with a lender
num_subsets = len(line_items) // 10 + 1
for i in range(num_subsets):
    line_item_subset = line_items[10 * i : 10 * i + 10]
    likelihoods = get_likelihood_of_being_lender(line_item_subset)
    for obj in likelihoods:
        line_items[obj["index"]]["is_lender_likelihood"] = obj["is_lender_likelihood"]
        line_items[obj["index"]]["is_lender_likelihood_reason"] = obj["reason"]
    print(f"Added items to subset ({i} / {num_subsets}")

In [None]:
# biggest transactions
biggest_transactions = [
    {
        'description': t['description'], 
        'credit': t['credit'],
        'debit': t['debit'],
        'is_lender_likelihood': t['is_lender_likelihood'],
        'size_relative_to_average_balance': float((t['credit'] if t['credit'] else t['debit']) / average_balance),
    }
for t in sorted(line_items, key=lambda t: t['credit'] if t['credit'] else t['debit'], reverse=True)
][:5]
biggest_transactions

In [None]:
from datetime import datetime

# Compute total debits, credits to lenders
credits_to_lenders = sum(i['credit'] for i in line_items if i['is_lender_likelihood'] and i['credit'])
print('Credits to lenders', credits_to_lenders)
debits_to_lenders = sum(i['debit'] for i in line_items if i['is_lender_likelihood'] and i['debit'])
print('Debits to lenders', debits_to_lenders)
non_lender_volume = sum(i['credit'] if i['credit'] else i['debit'] for i in line_items if not i['is_lender_likelihood'])
lender_volume_to_nonlender_volume = (debits_to_lenders + credits_to_lenders) / non_lender_volume if non_lender_volume else 'N/A Automatic rejection no non-lender credits'
print(f'lender_volume_to_nonlender_valume: {lender_volume_to_nonlender_volume:.4f}')

period_start_date_str = data['Basic Information']['Start Date']['orig_value']
period_start_date = datetime.strptime(period_start_date_str, '%m/%d/%Y').date()
period_end_date_str = data['Basic Information']['End Date']['orig_value']
period_end_date = datetime.strptime(period_end_date_str, '%m/%d/%Y').date()
days_in_period = (period_end_date - period_start_date).days + 1





In [None]:
def get_str_from_dict(d) -> str:
    """Convert nested dict with decimals to str.
    Example: 
    Input: {'transactions': [{'credits': decimal.Decimal("1.23")}]}
    Output {'transactions': [{'credits': 1.23}]}
    """
    
    def _str_from_dict(obj):
        if isinstance(obj, dict):
            return {k: _str_from_dict(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [_str_from_dict(x) for x in obj]
        elif isinstance(obj, decimal.Decimal):
            return round(float(obj), 4)
        elif isinstance(obj, float):
            return round(obj, 4)
        else:
            return obj
        
    processed_d = _str_from_dict(d)
    return json.dumps(processed_d, indent=4)



In [36]:
system_content = """
Your job is to write a report regarding whether to make a loan to a loan applicant.
You are only given information from a single bank statement. This is limiting and you will comment on how this is limiting,
but you will still draw a conclusions.

You are given
- the sum of debits and credits for each month
- the total credits paid to lenders
- the total debits paid to lenders
- days in period
- days with a balance below 200 dollars.

A Debit is cash flowing out of the users account.
A Credit is cash flowing into the account.

The maximum length of the report should be 500 words.
The report should be formatted as markdown.
- Display a bulletted list of largest transactions.
- Comments on the volume of transactions with lenders, relative to the total volume of transactions.
- Do not include an images are references to outside urls.

Your output must be JSON.  Do not add additional commentary because it will break the JSON parser.
"""

detail = {
        "monthly_transactions": monthly_data_str,
        "total_credits_to_lenders": credits_to_lenders,
        "total_debits_to_lenders": debits_to_lenders,
        "lender_volume_to_nonlender_volume": lender_volume_to_nonlender_volume,
        "days_in_period": days_in_period,
        "days_balance_below_200": days_balance_below_200,
        "low_balance_ratio": days_balance_below_200 / days_in_period,
        "biggest_transactions": biggest_transactions
}

detail_str = get_str_from_dict(detail)

completion = client.chat.completions.create(
    model="gpt-4",
    messages=[
        {
            "role": "system",
            "content": system_content
        },
        {
            "role": "user", "content": detail_str
        },
    ],
)
print("Received response from OpenAI")
try:
    s = completion.choices[0].message.content

except:
    print("errored out returning completion")
    s = completion
print(s)

Received response from OpenAI
# Loan Applicant Assessment Report

The following analysis has been conducted based on the data predominantly extracted from a single bank statement, which may not fully represent the financial behavior or creditworthiness of the applicant over a longer period of time. Therefore, the conclusion should be considered within this limited scope.

## Largest Transactions
- A credit of 47896.0 from JANALAKSHMI FINA with a 21.7062 size relative to average balance.
- A debit of 47800.0 paid to NANGARE with a 21.6627 size relative to average balance.
- A credit of 9000.0 from a Fund Transfer, has a 4.0787 size relative to average balance.
- A debit of 8320.0 paid towards Reliance Energy Bill, with a 3.7706 size relative to average balance.
- A debit of 8204.21 paid at FINNOVATION TECH SOLUT, with a 3.7181 size relative to average balance.

## Lenders vs Non-Lenders Transactions
The total credits received from lenders were 66080.0 while debits paid to lenders were 1

In [37]:
import markdown
# Convert Markdown to HTML

text = completion.choices[0].message.content
html_content = markdown.markdown(text)

# Convert HTML to PDF
from IPython.core.display import display, HTML
display(HTML(html_content))


  from IPython.core.display import display, HTML


In [38]:

# Use report lab to save html_content to 'credit_analysis.pdf'
# Output retains the HTML formatting.
from xhtml2pdf import pisa
import io

file_name = 'credit_analysis.pdf'

# Create a file-like object to hold the pdf data
pdf_file = io.BytesIO()

# Convert the HTML to PDF
pisa_status = pisa.CreatePDF(html_content, dest=pdf_file)

# If there was an error, print it
if pisa_status.err:
    print("There was an error converting HTML to PDF")

# Write the PDF data to a file
with open(file_name, 'wb') as output_file:
    output_file.write(pdf_file.getvalue())

# open file using system file viewer
import os
os.system(f'open "{file_name}"')

0