In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import os
import re

In [2]:
def read_excel_files_from_folder(folder_path):
    """Read every Excel workbook found directly inside a folder.

    Only the folder's direct entries are considered. Files are visited in
    sorted filename order so that the returned dict (and anything built from
    it, such as a concatenation) comes out the same on every run;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Args:
        folder_path (str): Directory to scan for ``.xlsx`` / ``.xls`` files.

    Returns:
        dict: Maps each filename (not the full path) to the DataFrame that
        ``pd.read_excel`` returned for it with default arguments. Empty when
        the folder does not exist or contains no readable workbook. A file
        that fails to load is reported on stdout and left out of the result.
    """
    # Check if the folder exists
    if not os.path.isdir(folder_path):
        print(f"Error: The folder at '{folder_path}' does not exist.")
        return {}

    excel_data = {}

    for filename in sorted(os.listdir(folder_path)):
        # Excel drops a hidden '~$<name>' lock file next to any workbook that
        # is currently open. It carries the same extension but is not a
        # workbook, so skip it instead of reporting a read error every run.
        if filename.startswith('~$'):
            continue

        # Compare case-insensitively so that e.g. 'STATEMENT.XLSX' is loaded too.
        if not filename.lower().endswith(('.xlsx', '.xls')):
            continue

        file_path = os.path.join(folder_path, filename)

        try:
            excel_data[filename] = pd.read_excel(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"Could not read {filename}: {e}")

    return excel_data

# Folder holding the exported statements, relative to the working directory.
data_folder = 'data'
all_excel_dfs = read_excel_files_from_folder(data_folder)

# pd.concat fails with an opaque "No objects to concatenate" when given
# nothing; raise the same exception type with a message that names the cause.
if not all_excel_dfs:
    raise ValueError(f"No readable Excel files found in '{data_folder}'.")

# Each file is read with its own default 0..n-1 index, so a plain concat would
# leave duplicate row labels; ignore_index=True gives every row a unique one.
transactions = pd.concat(list(all_excel_dfs.values()), ignore_index=True)

# Dates are parsed as dd/mm/yy; values that do not parse become NaT
# (errors='coerce') instead of raising.
transactions['Date'] = pd.to_datetime(transactions['Date'], format='%d/%m/%y', errors='coerce')

# Newest transactions first.
transactions = transactions.sort_values('Date', ascending=False)


In [51]:
# Keyword rules for withdrawals: category -> lowercase keywords looked for in
# the (lower-cased) transaction description. Insertion order matters, because
# categorize_withdrawal returns the first category that has a matching keyword.
withdrawal_rules = {
    'Bank Transfer': [
        'fast payment / receipt',
        'funds transfer',
        'remittance',
        'trfsb',
        'fast',
    ],
    'Cash Withdrawal': [
        'cash withdrawal',
    ],
    'Health and Fitness': [
        'ezypaysgd*anytime',
        'chew hwee ping',
    ],
    'Food': [
        'paylah!',
    ],
    'Investment': [
        'u6473439',
    ],
    # No keywords yet, so these two categories cannot be matched for now.
    'Bills': [],
    'Entertainment': [],

    # Candidate rules, not active:
    # 'Food & Groceries': ['supermarket', 'bakery', 'restaurant', 'cafe', 'food court'],
    # 'Transport': ['uber', 'grab', 'bus', 'mrt', 'petrol station'],
    # 'Bills': ['phone bill', 'electricity', 'internet'],
    # 'Shopping': ['shopee', 'lazada', 'amazon', 'department store'],
    # 'Health': ['pharmacy', 'clinic', 'hospital']
}

def categorize_withdrawal(text, rules):
    """Assign a spending category to a withdrawal description.

    Matching is case-insensitive and proceeds in this order:

    1. A description containing 'funds transfer' is 'Food' if any 'Food'
       keyword occurs in it, else 'Investment' if any 'Investment' keyword
       occurs in it, else 'Bank Transfer'.
    2. A description containing 'paynow' is 'Health and Fitness' if any
       'Health and Fitness' keyword occurs in it; otherwise it falls through
       to step 3.
    3. The first category in ``rules`` (dict order) that has a keyword
       appearing as a whole word/phrase wins.

    Steps 1 and 2 use plain substring tests; step 3 uses word boundaries.

    Args:
        text: The transaction description. Anything that is not a string
            (for example NaN from an empty spreadsheet cell) yields 'Other'.
        rules (dict): Maps category name to a list of lowercase keywords.

    Returns:
        str: The matched category, or 'Other' when nothing matches.
    """
    # Empty cells arrive as float NaN, which has no .lower(); treat any
    # non-string description as unmatched instead of crashing.
    if not isinstance(text, str):
        return 'Other'

    text_lower = text.lower()

    def contains_any(category):
        # Substring test over one category's keywords (special cases only).
        return any(keyword in text_lower for keyword in rules.get(category, []))

    def appears_as_whole_word(keyword):
        # \b is only meaningful next to a word character. For a keyword that
        # starts or ends with punctuation (e.g. 'paylah!'), a \b on that side
        # would demand an adjacent word character and so miss 'paylah! top-up';
        # anchor only the sides that are word characters.
        if not keyword:
            return False
        leading = r'\b' if re.match(r'\w', keyword[0]) else ''
        trailing = r'\b' if re.match(r'\w', keyword[-1]) else ''
        pattern = leading + re.escape(keyword) + trailing
        return re.search(pattern, text_lower) is not None

    # Special case 1: generic transfers are refined by their payee/reference.
    if 'funds transfer' in text_lower:
        if contains_any('Food'):
            return 'Food'
        if contains_any('Investment'):
            return 'Investment'
        return 'Bank Transfer'

    # Special case 2: PayNow payments to known health/fitness payees.
    if 'paynow' in text_lower and contains_any('Health and Fitness'):
        return 'Health and Fitness'

    # General rules: first category (in dict order) with a whole-word hit.
    for category, keywords in rules.items():
        if any(appears_as_whole_word(keyword) for keyword in keywords):
            return category

    # Fallback: If no rules match, return 'Other'.
    return 'Other'


# Keyword rules for deposits: category -> lowercase keywords looked for in the
# (lower-cased) transaction description. Insertion order matters, because
# categorize_deposit returns the first category that has a matching keyword.
deposit_rules = {
    'Salary': [
        'salary',
    ],
    'Cash Deposit': [
        'cash deposit',
        'cash',
    ],
    'Government': [
        'mindef',
        'gov',
        'saf',
        'gst voucher',
        'govt',
    ],
    'Bank Transfer': [
        'fast payment / receipt',
        'funds transfer',
        'remittance',
        'trfsb',
        'fast',
    ],
    'Interest': [
        'interest',
    ],
    'Refund': [
        'refund',
        'debit card transaction',
    ],
}

def categorize_deposit(text, rules):
    """
    Categorizes a deposit description based on a set of rules.

    Matching is case-insensitive and proceeds in this order:

    1. A description containing 'payments / collections' that contains none
       of the 'Government' keywords (plain substring test) is
       'Other GIRO Deposits'.
    2. Otherwise the first category in ``rules`` (dict order) that has a
       keyword appearing as a whole word/phrase wins.

    Args:
        text: The deposit description to categorize. Anything that is not a
              string (for example NaN from an empty spreadsheet cell) yields
              'Other'.
        rules (dict): A dictionary of rules where keys are categories and
                      values are lists of lowercase keywords.

    Returns:
        str: The category of the deposit, or 'Other' when nothing matches.
    """
    # Empty cells arrive as float NaN, which has no .lower(); treat any
    # non-string description as unmatched instead of crashing.
    if not isinstance(text, str):
        return 'Other'

    text_lower = text.lower()

    def appears_as_whole_word(keyword):
        # \b is only meaningful next to a word character. For a keyword that
        # starts or ends with punctuation, a \b on that side would demand an
        # adjacent word character; anchor only the sides that are word
        # characters.
        if not keyword:
            return False
        leading = r'\b' if re.match(r'\w', keyword[0]) else ''
        trailing = r'\b' if re.match(r'\w', keyword[-1]) else ''
        pattern = leading + re.escape(keyword) + trailing
        return re.search(pattern, text_lower) is not None

    # Rule 1: Check for the 'Other GIRO Deposits' condition first, as it's a
    # special case with a negative constraint: 'payments / collections'
    # present AND no government keyword present.
    if 'payments / collections' in text_lower:
        gov_keywords = rules.get('Government', [])
        if not any(keyword in text_lower for keyword in gov_keywords):
            return 'Other GIRO Deposits'

    # Rule 2: first category (in dict order) with a whole-word keyword hit.
    for category, keywords in rules.items():
        if any(appears_as_whole_word(keyword) for keyword in keywords):
            return category

    # Fallback: If no rules match, return 'Other'.
    return 'Other'

# Label every row: a row with a Deposit amount is categorized with the deposit
# rules; otherwise a row with a Withdrawal amount is categorized with the
# withdrawal rules; a row with neither amount is left as NaN.
is_deposit_row = transactions['Deposit'].notna()
deposit_labels = transactions['Description'].apply(
    lambda desc: categorize_deposit(desc, deposit_rules)
)

is_withdrawal_row = transactions['Withdrawal'].notna()
withdrawal_labels = transactions['Description'].apply(
    lambda desc: categorize_withdrawal(desc, withdrawal_rules)
)

# Both label series cover all rows; np.where then picks one value per row.
withdrawal_or_nan = np.where(is_withdrawal_row, withdrawal_labels, np.nan)
transactions['Category'] = np.where(is_deposit_row, deposit_labels, withdrawal_or_nan)


In [54]:
#transactions[transactions.Category == 'Health and Fitness']

In [55]:
# Show the full contents of every cell instead of truncating long text
# (None removes the column-width limit).
pd.options.display.max_colwidth = None
#transactions[transactions.Balance == 3268.27]
#transactions[transactions.Balance == 1227.44]

In [15]:
# Render every row of a displayed frame (None removes the row limit).
pd.options.display.max_rows = None
# pd.reset_option('all')


In [None]:
def categorize_transaction(description, rules):
    """
    Pick the category whose keywords occur in a transaction description.

    Keywords are matched as plain substrings of the lower-cased description,
    and categories are tried in the dict's iteration order; the first one
    with any matching keyword is returned.

    Args:
        description (str): The description of the transaction. A missing
            value (anything ``pd.isna`` reports as NA) is 'Uncategorized'.
        rules (dict): A dictionary of categories and their associated keywords.

    Returns:
        str: The assigned category, or 'Uncategorized' if no match is found.
    """
    if pd.isna(description):
        return 'Uncategorized'

    lowered = description.lower()

    # Lazily yield matching categories so only the first hit is computed.
    matching_categories = (
        label
        for label, terms in rules.items()
        if any(term in lowered for term in terms)
    )
    return next(matching_categories, 'Uncategorized')

In [None]:
# Apply the categorization to create new columns
transactions['Withdrawal_Category'] = transactions.apply(
    lambda row: categorize_transaction(row['descriptions'], withdrawal_rules) if row['withdrawal'] > 0 else 'N/A',
    axis=1
)

transactions['Deposit_Category'] = transactions.apply(
    lambda row: categorize_transaction(row['descriptions'], deposit_rules) if row['deposit'] > 0 else 'N/A',
    axis=1
)

# Display the first few rows with the new categories to verify
print(transactions.head())

In [33]:
#total = transactions[transactions['Withdrawal'].isin([90, 100, 110, 120, 130, 140, 150, 160])].sort_values(by = 'Date')
#total

In [34]:
#total['Withdrawal'].sum()