In [1]:
import datetime
start = datetime.date(2025,12,1)
end = datetime.date(2025,12,31)

In [2]:
import numpy as np
import pandas as pd
import re
import time
import os.path
import shutil
from dateutil.rrule import rrule, MONTHLY

In [3]:
#generated_path = 'E:\\dtuklaptop\\e\\Users\\Mat\\python\\data\\property\\generated\\'
generated_path = 'J://My Drive//NAS//My Documents//Business//Property//Statements//working//python//data//property//generated//'

def backup_file(filename):
    timestr = time.strftime('%Y%m%d-%H%M%S')
    if os.path.isfile(filename):
        shutil.copy2(filename,filename+'.bak_' + timestr)
    return filename

def categorise_old_data(dfB):
    # Categorise expenses
    # Mortgage, PropertyExpense, OurRent, BealsRent, ServiceCharge
    # SchoolFees, Car, Hilltop, PersonalExpenses, RegularPayment
    dfB['Description'] = dfB['Description'].fillna('')
    
    # Identify Mortgages
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('TOPAZ|SIBERITE|SKIPTON|MORTGAGE EXPRESS|NRAM|PLATFORM|AMBER|BHAM|CAPITAL|CHL|MORTGAGE TRUST|PARAGON')==True),"Cat"] = 'Mortgage'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*M TUCKER.*STO.*')==True) & (dfB.Amount > -200) & (dfB.Amount <-190),"Cat"] = 'Mortgage' # Kingston Rd mortgage
    
    # Identify Rents
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('^BEALS[ ]?ESTATE[ ]?AGENT.*$|^CHARTERS.*$')==True),"Cat"] = 'BealsRent'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Description.notnull())&(dfB.Description.str.match('Rent for period')==True),"Cat"] = 'OurRent'
    dfB.loc[(dfB.Cat.isnull())&((dfB.Description.str.match('.*DEPOSIT.*|.*TDS.*',case=False)==True)|(dfB.Memo.str.match('.*TDS.*|.*DEPOSIT.*',case=False)==True)),"Cat"] = 'Deposit'
    #dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('DIRECTDEP|FT|PAYMENT|Direct Debit|Funds Transfer|Bill Payment')==True)&(dfB.Memo.str.match('.*DEPOSIT.*',case=False)==True),"Cat"] = 'Deposit'

    # Identify Property Expenses
    dfB.loc[(dfB.Cat.isnull())&(dfB.Description.notnull())&(dfB.Property.notnull())&(dfB.Property.str.match('.*MILTON.*|.*BEDFORD.*')==False)&(dfB.Description.str.match('Rent for period')==False),"Cat"] = 'PropertyExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*PORTSEA.*',case=False)==True),"Cat"] = 'PropertyExpense'
    
    # Identify Service Charges and Transfers between accounts
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('23 HAMPSHIRE.*STO|4-6 ALHAMBRA RD CS|12-14 ALHAMBRA RD|16-18 ALHAMBRA RD|ALHAMBRA ROAD MANA|Alhambra Road M Tucker SC|ALHHAMBRA ROAD M Tucker SC|Lordswood Estates M Tucker SC')==True),"Cat"] = 'ServiceCharge'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*30728691.*')==True),'Cat'] = 'Funds3072'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*40406538 .*')==True),'Cat'] = 'Funds4040'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*60458872.*')==True),'Cat'] = 'Funds6045'
    
    # Identify Regular Payments
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*HARPUR TRUST.*|.*BEDFORD SCHOOL.*|.*School[ ]?Fee.*')==True),"Cat"] = 'SchoolFee'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*G[ ]?[&+][ ]?F[ ]?VALENTINO.*')==True),"Cat"] = 'Hilltop'
    
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('DIRECTDEBIT|Direct Debit')==True)&(dfB.Memo.str.match('HMRC')==True),"Cat"] = 'HMRCDD'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('PAYMENT|Bill Payment')==True)&(dfB.Memo.str.match('PROPERTY[ ]?118.*')==True),"Cat"] = 'RSACapital'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('PAYMENT|Bill Payment')==True)&(dfB.Memo.str.match('HMRC')==True),"Cat"] = 'HMRC'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('PAYMENT|Bill Payment')==True)&(dfB.Memo.str.match('CCTV|Letterbox',case=False)==True),"Cat"] = 'AlhambraPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('PAYMENT|Bill Payment')==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('CASH|Cash Withdrawal|.*Card.*',case=False)==True),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('FT|Funds Transfer',case=False)==True)&(dfB.Memo.str.match('.*Sofia.*|.*Rocco.*|.*Alessio.*',case=False)==True),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='CRE'),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='OTH'),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='BCC'),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('REFUND',case=False)==True),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='CHQ'),'Cat'] = 'PersonalExpense'

    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('FT|Funds Transfer',case=False)==True)&(dfB.Memo.str.match('.*INTERCO.*',case=False)==True),'Cat'] = 'ServiceCharge'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('FT|Funds Transfer',case=False)==True)&(dfB.Description.str.match('.*Alham.*|.*HAG.*',case=False)==True),'Cat'] = 'AlhambraPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('FT|Funds Transfer',case=False)==True)&(dfB.Memo.str.match('.*13438023.*|.*83672832.*|.*23534332.*|.*13686035.*')==True),'Cat'] = 'AlhambraPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('FT|Funds Transfer',case=False)==True)&(dfB.Memo.str.match('.*Alham.*|.*HAG.*',case=False)==True),'Cat'] = 'AlhambraPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('FT|Funds Transfer',case=False)==True)&(dfB.Memo.str.match('.*63796884.*|.*33631583.*',case=False)==True),'Cat'] = 'ServiceCharge'
    
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*LAND ROVER.*')==True),"Cat"] = 'Car'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('DIRECTDEBIT|Direct Debit')==True),'Cat'] = 'RegularPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('NATIONWIDE|KINGSTON UNITY')==True),'Cat'] = 'RegularPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*UNITY MUTUAL.*')==True),"Cat"] = 'RegularPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*Valentino.*car.*')==True),"Cat"] = 'RegularPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('REPEATPMT|Standing Order',case=False)==True)&(dfB.Memo.str.match('.*M TUCKER.*STO.*')==True)&(dfB.Amount>-100),'Cat'] = 'RegularPayment'

    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='FT|Funds Transfer')&(dfB.Memo.str.match('Sofia|Rocco', case=False)==True),'Cat'] = 'PersonalExpense'

    dfB.loc[(dfB.Cat.isnull())&(dfB.Amount>0),'Cat'] = 'OtherIncome'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Amount<0),'Cat'] = 'OtherExpense'

    return dfB

def categorise_new_data(dfB):
  
    if 'Description' not in df.columns:
        df['Description']=''
    
    # Identify Mortgages
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('JASPER|TOPAZ|SIBERITE|SKIPTON|MORTGAGE EXPRESS|NRAM|PLATFORM|AMBER|BHAM|CAPITAL|CHL|MORTGAGE TRUST|PARAGON|HESSONITE')==True),"Cat"] = 'Mortgage'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*M TUCKER.*STO.*')==True) & (dfB.Amount > -200) & (dfB.Amount <-190),"Cat"] = 'Mortgage' # Kingston Rd mortgage
    
    # Identify Rents
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('^BEALS[ ]?ESTATE[ ]?AGENT.*$')==True),"Cat"] = 'BealsRent'
    dfB.loc[(dfB.Cat.isnull())&((dfB.Description.str.match('.*DEPOSIT.*|.*TDS.*',case=False)==True)|(dfB.Memo.str.match('.*TDS.*',case=False)==True)),"Cat"] = 'Deposit'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('REVENUE|Funds Transfer|Counter Credit|Standing Order|Bill Payment')==True)&(dfB.Memo.str.match('.*DEPOSIT.*|.*HampTerr Dep.*',case=False)==True),"Cat"] = 'Deposit'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('REVENUE')==True)&(dfB.Property!=''),"Cat"] = 'OurRent'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('Funds Transfer|Counter Credit|Standing Order|Bill Payment')==True)&(dfB.Memo.str.match('.*RENT.*|.*KUMAR.*|.*LINDEMERE.*|.*SEQUENCE UK.*|.*SOPHIE.*|.*BETTS.*|.*RAMOS.*',case=False)==True),"Cat"] = 'OurRent'
    #dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('Funds Transfer|Counter Credit|Standing Order|Bill Payment')==True)&(dfB.Memo.str.match('.*RENT.*|.*KUMAR.*|.*LINDEMERE.*|.*SEQUENCE UK.*|.*SOPHIE.*|.*BETTS.*',case=False)==True),"Cat"] = 'OurRent'
    
    # Identify Property Expenses
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('WORKPLACE|Bill Payment|Funds Transfer|Standing Order')==True)&(dfB.Memo.str.match('.*PORTSEA.*|.*BECK.*|.*COURT FEE.*|.*ROGERS.*|.*ICE PROFESSIONAL.*|.*SOUTHERN ELEC.*|.*Just Answer.*|.*SSE.*|.*OVO.*')==True),"Cat"] = 'PropertyExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('ADMIN')==True),"Cat"] = 'PropertyExpense'

    # Identify Service Charges and Transfers between accounts
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('23 HAMPSHIRE.*STO|4-6 ALHAMBRA RD CS|12-14 ALHAMBRA RD|16-18 ALHAMBRA RD|ALHAMBRA ROAD MANA|Alhambra Road M Tucker SC|ALHHAMBRA ROAD M Tucker SC|Lordswood Estates M Tucker SC')==True),"Cat"] = 'ServiceCharge'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*30728691.*')==True),'Cat'] = 'Funds3072'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*40406538 .*')==True),'Cat'] = 'Funds4040'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*60458872.*')==True),'Cat'] = 'Funds6045'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('Standing Order|INTEREST_PAYMENTS')==True)&(dfB.Memo.str.match('.*Mortgages.*',case=False)==True),"Cat"] = 'MortgageRefund'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('DIRECTORS_WAGES|directors wages')==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='Bill Payment')&(dfB.Amount<0)&(dfB.Memo.str.match('RSA CAPITAL')==True),"Cat"] = 'OurRent'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('Funds Transfer')==True)&(dfB.Memo.str.match('.*FRATTON SC.*|.*FRATTON ROAD.*|.*FRATTON RD.*|.*CREST.*')==True),"Cat"] = 'FrattonRoad'

    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('REPAIRS_AND_MAINTENANCE')==True),"Cat"] = 'PropertyExpense'
    
    # Identify Regular Payments
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*HARPUR TRUST.*|.*BEDFORD SCHOOL.*')==True),"Cat"] = 'SchoolFee'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('G[ ]?[&+][ ]?F[ ]?VALENTINO.*')==True),"Cat"] = 'Hilltop'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('Bill Payment')==True)&(dfB.Memo.str.match('.*HMRC*.')==True),"Cat"] = 'HMRC'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('Direct Debit')==True)&(dfB.Memo.str.match('.*HMRC*.')==True),"Cat"] = 'HMRCDD'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('.*LAND ROVER.*')==True),"Cat"] = 'Car'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('Standing Order|Direct Debit')==True)&(dfB.Memo.str.match('NATIONWIDE|KINGSTON UNITY')==True),'Cat'] = 'RegularPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('Spotify')==True),'Cat'] = 'RegularPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='Direct Debit'),'Cat'] = 'RegularPayment'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='Standing Order')&(dfB.Memo.str.match('.*M TUCKER.*STO.*')==True)&(dfB.Amount>-100),'Cat'] = 'RegularPayment' # £50 to 1585, £5 to Natwest
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='Card Purchase')&(dfB.Memo.str.match('.*Amazon Prime*',case=False)==True)&(dfB.Amount==-7.99),'Cat'] = 'RegularPayment'
    
    # Mark rest of card puchases as Personal Expense
    
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory.str.match('.*Card Purchase.*|.*Card Refund.*')==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='Cash Withdrawal'),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='Bill Payment'),'Cat'] = 'PersonalExpense'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Subcategory=='PERSONAL'),'Cat'] = 'PersonalExpense'
    
    dfB.loc[(dfB.Cat.isnull())&(dfB.Amount>0),'Cat'] = 'OtherIncome'
    dfB.loc[(dfB.Cat.isnull())&(dfB.Amount<0),'Cat'] = 'OtherExpense'
    
    return dfB

def categorise_personal_spending(dfB):
    dfB["Subcat"] = np.nan
    
    # BP Garage and other service stations
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('^BP[ ].*|.*NEW COUNTY SERVICE.*|.*THE GARAGE.*|.*SHELL.*|.*MFG CHILDS WAY.*|.*MORRISONS PETRO.*',case=False)==True),'Subcat']='Garage'

    # Food
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*TESCO.*',case=False)==True),'Subcat']='Tesco'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*MARKS&SPENCER.*|.*SIMPLY[ ]?FOOD.*',case=False)==True),'Subcat']='M&S'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*WAITROSE.*',case=False)==True),'Subcat']='Waitrose'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*W[ ]?M[ ]?MORRISON.*',case=False)==True),'Subcat']='Morrisons'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*LIDL.*|.*ALDI.*',case=False)==True),'Subcat']='LIDL'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*CO[-]?OP.*|.*CO[ ]?OP.*',case=False)==True),'Subcat']='COOP'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*BUDGENS.*|.*COSTCUTTER.*',case=False)==True),'Subcat']='Budgens'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*COSTCO.*',case=False)==True),'Subcat']='Costco'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*A1[ ]?Foods.*',case=False)==True),'Subcat']='A1 Foods'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*SAINSBURY.*',case=False)==True),'Subcat']='Sainsburys'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*ASDA.*',case=False)==True),'Subcat']='ASDA'

    # Chemist
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*BOOTS.*|.*SUPERDRUG.*|.*PHARMACY.*|.*THE HIGHLANDS PHAR.*|.*HOLLAND.*|.*Real Health.*|.*SPECSAVERS.*|.*DENTAL.*|.*selectspecs.*|.*VISION DIRECT.*|.*ISHADE OPTI.*',case=False)==True),'Subcat']='Pharmacy/Opticians/Dental'

    # Beauty
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*HARVEY NICHOLS.*|.*LIBERTY.*|.*HARRODS.*|.*Penhaligons.*|.*SELFRIDGES.*|.*LOOKFANTASTIC.*|.*SALLYSALONSERVICES.*|.*RICHY.*|.*C D 4 U.*|.*NAILS.*|.*PHELANS.*',case=False)==True),'Subcat']='Beauty'
    
    # Fast Food and Coffee
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*MCDONALDS.*|.*BURGER.*|.*FIVE GUYS.*|.*BUBBLE.*|.*CHURROS.*|.*NANDOS.*|.*WAGAMAMA.*|.*BBTEA.*|.*tandoori.*|.*PRET.*|.*MILLIES COOKIES.*|.*GREGGS.*|.*PIZZA.*|.*PRETZELS.*|.*KOKORO.*|.*Whippy.*|.*CHICKEN[ ]?GEORGE.*|.*KFC.*|.*MILTON KEYNES FOOD.*|.*SHAKEAWAY.*|.*KINGS ARMS.*|.*FRANKIE.*BENNYS.*|.*FOURTH.*FIFTH.*|.*FROSTS.*|.*GREGGS.*|.*CHIQUITO.*|.*CHIMICHANGA.*|.*CHOPSTIX.*|.*LAKESIDE FISH.*|.*JUST-EAT.*|.*FOXY[ ]?WINGS.*',case=False)==True),'Subcat']='EatingOut'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Costa.*|.*PAVILION.*|.*Air-Serv.*|.*STARBUCKS.*|.*MILTON_KEYNES_PARK.*|.*SUBWAY.*|.*COFFEE.*|.*Espresso.*|.*D.*PARYS.*|.*Express Vend.*|.*MILTON KEYNES PARK.*',case=False)==True),'Subcat']='Coffee'

    # Household
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*NEXT.*|.*SPORTSDIRECT.*|.*T K MAXX.*|.*CLARKS.*|.*FOOT LOCKER.*|.*MATALAN.*|.*AMBROSE.*|.*MOTELROCKS.*|.*ACCESSORI.*|.*BOUX AVENUE.*|.*URBANOUTFITTERS.*|.*HOLLISTER.*|.*KAREN MILLEN.*|.*ETSY.*|.*NO LIMITZ.*|.*wbys.*|.*RIVER[ ]?ISLAND.*|.*HUGO[ ]?BOSS.*|.*FOOT LOCKER.*|.*OUTFIT.*|.*SCHOOLBLAZER.*|.*FASHION.*|.*PRIMARK.*|.*Schuh.*|.*Superdry.*|.*Zara.*|.*Nike.*|.*Cupshe.*|.*Moss.*|.*OH POLLY.*|.*Good Belly.*|.*MINT VELV.*|.*Vestiaire.*|.*ASOS.*|.*Trutex.*|.*SchoolUniform.*',case=False)==True),'Subcat']='Clothing'
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*WILKO.*|.*B&M.*|.*CURRYS.*|.*THE RANGE.*|.*PETS.*|.*Hobbycraft.*|.*HOME BARGAINS.*|.*WH SMITH.*|.*VETERINARY.*|.*WICKES.*|.*HOMESENSE.*|.*ARGOS.*|.*POUNDLAND.*|.*JOHN[ ]?LEWIS.*|.*TIMPSON.*|.*DUNELM.*|.*HOMEBASE.*|.*IKEA.*|.*B[ ]?&[ ]?Q.*',case=False)==True),'Subcat']='Household'

    # Car
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*HALFORDS.*|.*MILLENNIUM.*|.*MERCEDE.*|.*INSURANCE.*|.*TYRE STORE.*|.*VEHICLE.*|.*A AND F MOTORS.*|.*JOHN R FORD.*',case=False)==True),'Subcat']='Car'
    
    # Entertainment
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Amazon[ ]?Prime.*|.*cinema.*|.*vue.*|.*Microsoft.*|.*Spotify.*|.*NOW.*|.*GAME.*',case=False)==True),'Subcat']='Amazon'
    
    # Amazon
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Amazon.*|.*AMZNMktplace.*|.*AMZ.*',case=False)==True),'Subcat']='Amazon'

    # Cash
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Subcategory.str.match('.*CASH.*',case=False)==True),'Subcat']='Cash'
    
    # Other
    dfB.loc[(dfB.Memo.str.match('.*SEQUENCE.*',case=False)==True),"Cat"] = 'PropertyExpense'
    
    dfB.loc[(dfB.Memo.str.match('.*MARKS&SPENCER.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*MARKS&SPENCER.*',case=False)==True),'Subcat']='M&S'
    
    dfB.loc[(dfB.Memo.str.match('.*Chiropractic.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Chiropractic.*',case=False)==True),'Subcat']='Rocco'
    
    dfB.loc[(dfB.Memo.str.match('.*Thameslink.*|.*TSGN.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Thameslink.*|.*TSGN.*',case=False)==True),'Subcat']='Rocco'
    
    dfB.loc[(dfB.Memo.str.match('.*New County.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*New County.*',case=False)==True),'Subcat']='Garage'
    
    dfB.loc[(dfB.Memo.str.match('.*Rocco.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Rocco.*',case=False)==True),'Subcat']='Rocco'
    
    dfB.loc[(dfB.Memo.str.match('.*Alessio.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Alessio.*',case=False)==True),'Subcat']='Alessio'
    
    dfB.loc[(dfB.Memo.str.match('.*Sofia.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Sofia.*',case=False)==True),'Subcat']='Sofia'
    
    dfB.loc[(dfB.Memo.str.match('.*Tesco.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Tesco.*',case=False)==True),'Subcat']='Tesco'
    
    dfB.loc[(dfB.Memo.str.match('.*Shell.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Shell.*',case=False)==True),'Subcat']='Garage'
    
    dfB.loc[(dfB.Memo.str.match('.*Lidl.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Lidl.*',case=False)==True),'Subcat']='LIDL'
    
    dfB.loc[(dfB.Memo.str.match('.*HOME BARGAINS.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*HOME BARGAINS.*',case=False)==True),'Subcat']='Household'
    
    dfB.loc[(dfB.Memo.str.match('.*JD Sports.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*JD Sports.*',case=False)==True),'Subcat']='Clothing'
    
    dfB.loc[(dfB.Memo.str.match('.*Sports Direct.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Sports Direct.*',case=False)==True),'Subcat']='Clothing'
    
    dfB.loc[(dfB.Memo.str.match('.*BOOTS.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*BOOTS.*',case=False)==True),'Subcat']='Pharmacy/Opticians/Dental'
    
    dfB.loc[(dfB.Memo.str.match('.*Sainsbury.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Sainsbury.*',case=False)==True),'Subcat']='Sainsburys'
    
    dfB.loc[(dfB.Memo.str.match('.*INTEREST CHARGED.*',case=False)==True),"Cat"] = 'OtherExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*INTEREST CHARGED.*',case=False)==True),'Subcat']='Bank'
    
    dfB.loc[(dfB.Memo.str.match('.*BEDFORD EXPRESS.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*BEDFORD EXPRESS.*',case=False)==True),'Subcat']='Garage'
    
    dfB.loc[(dfB.Memo.str.match('.*UNITY MUTUAL.*',case=False)==True),"Cat"] = 'RegularPayment'
    
    dfB.loc[(dfB.Memo.str.match('.*ALDI .*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*ALDI .*',case=False)==True),'Subcat']='ALDI'
    
    dfB.loc[(dfB.Memo.str.match('.*THE HUB DENTAL.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*THE HUB DENTAL.*',case=False)==True),'Subcat']='Pharmacy/Opticians/Dental'
    
    dfB.loc[(dfB.Memo.str.match('.*HIGHLANDS PHARMACY.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*HIGHLANDS PHARMACY.*',case=False)==True),'Subcat']='Pharmacy/Opticians/Dental'
    
    dfB.loc[(dfB.Memo.str.match('.*HOMESENSE.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*HOMESENSE.*',case=False)==True),'Subcat']='Household'
    
    dfB.loc[(dfB.Memo.str.match('.*NEXT RETAIL.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*NEXT RETAIL.*',case=False)==True),'Subcat']='Clothing'
    
    dfB.loc[(dfB.Memo.str.match('.*TK Maxx.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*TK Maxx.*',case=False)==True),'Subcat']='Clothing'
    
    dfB.loc[(dfB.Memo.str.match('.*PAY AT PUMP.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*PAY AT PUMP.*',case=False)==True),'Subcat']='Garage'
    
    dfB.loc[(dfB.Memo.str.match('.*OPERATIVE FOOD.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*OPERATIVE FOOD.*',case=False)==True),'Subcat']='COOP'
    
    dfB.loc[(dfB.Memo.str.match('.*BP BP.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*BP BP.*',case=False)==True),'Subcat']='Garage'
    
    dfB.loc[(dfB.Memo.str.match('.*Velvet.*|.*Mango.*|.*Primark.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Velvet.*|.*Mango.*|.*Primark.*',case=False)==True),'Subcat']='Clothing'
    
    dfB.loc[(dfB.Memo.str.match('.*MK COUNCIL.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*MK COUNCIL.*',case=False)==True),'Subcat']='Other'
            
    dfB.loc[(dfB.Memo.str.match('.*SPECSAVERS.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*SPECSAVERS.*',case=False)==True),'Subcat']='Pharmacy/Opticians/Dental'

    dfB.loc[(dfB.Memo.str.match('.*ATM.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*ATM.*',case=False)==True),'Subcat']='Cash'
    
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Ivana VALENTINO.*',case=False)==True),'Subcat']='Ivana'
    
    dfB.loc[(dfB.Memo.str.match('.*Wagamama.*|.*Costa.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Wagamama.*|.*Costa.*',case=False)==True),'Subcat']='EatingOut'
    
    dfB.loc[(dfB.Memo.str.match('.*WAITROSE.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*WAITROSE.*',case=False)==True),'Subcat']='Waitrose'
    
    dfB.loc[(dfB.Memo.str.match('.*BUDGENS.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*BUDGENS.*',case=False)==True),'Subcat']='Budgens'

    dfB.loc[(dfB.Memo.str.match('.*MORRISONS.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*MORRISONS.*',case=False)==True),'Subcat']='Morrisons'
    
    dfB.loc[(dfB.Memo.str.match('.*Ivana Valentino car.*',case=False)==True),"Cat"] = 'RegularPayment'
    dfB.loc[(dfB.Cat=='RegularPayment')&(dfB.Memo.str.match('.*Ivana Valentino car .*',case=False)==True),'Subcat']='IVCar'
        
    dfB.loc[(dfB.Memo.str.match('.*Bedford Hospital.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Bedford Hospital .*',case=False)==True),'Subcat']='Other'
    
    dfB.loc[(dfB.Memo.str.match('.*Ivana.*VALENTINO.*food.*|.*Ivana.*VALENTINO.*trf.*',case=False)==True),"Cat"] = 'PersonalExpense'
    dfB.loc[(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('.*Ivana.*VALENTINO.*food.*|.*Ivana.*VALENTINO.*trf.*',case=False)==True),'Subcat']='Ivana'
    
    dfB.loc[(dfB.Memo.str.match('.*Rsa Capital Limite.*',case=False)==True),"Cat"] = 'Interbank'
    dfB.loc[(dfB.Memo.str.match('.*M Tucker.*BGC.*',case=False)==True),"Cat"] = 'MTPayment'

    dfB.loc[(dfB.Memo.str.match('.*TOGETHER COMMERCIA.*',case=False)==True),"Cat"] = 'RegularPayment'
    dfB.loc[(dfB.Cat=='RegularPayment')&(dfB.Memo.str.match('.*TOGETHER COMMERCIA.*',case=False)==True),'Subcat']='SFLoan'
    
    #propertyexpense where property id is null
    
    # Catch all
    dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense'),'Subcat']='Other'
    
    return dfB

### Categorise date for files between start/end dates

- input files are 'MMMYYYY_coded.csv'
- adds category column by applying categories to each transaction
- the output file 'MMMYYYY_codedAndCategorised.csv' should be checked to ensure categories are correct

In [4]:
rsaCapitalDate = datetime.date(2022,8,1) # Start date of RSA Capital Ltd

dfAll= pd.DataFrame()
dates = [dt for dt in rrule(MONTHLY, dtstart=start, until=end)]
for date in dates:
    dateStr= date.strftime("%b").upper() + date.strftime("%Y")  
    input_file=generated_path + dateStr + '_coded.csv'
    print('Processing: ' + input_file)
    if not os.path.isfile(input_file):
        print('Warning missing file: ' + input_file)
        continue
    df=pd.read_csv(input_file, index_col=0, parse_dates=True, dayfirst=True)
    if 'Cat' not in df.columns:
        df['Cat']=np.nan
    # Use new categorisation method if working with new data
    if(date.date()>=rsaCapitalDate):
        df=categorise_new_data(df)
    else:
        df=categorise_old_data(df)
    df=categorise_personal_spending(df)
    file=backup_file(generated_path + dateStr + '_codedAndCategorised.csv')
    df.to_csv(file)
    dfAll=pd.concat([dfAll,df])
    
# Use this to check categorisation
dfAll.to_csv('CatCheck.csv')    

Processing: J://My Drive//NAS//My Documents//Business//Property//Statements//working//python//data//property//generated//DEC2025_coded.csv


  dfB.loc[(dfB.Cat.isnull())&(dfB.Memo.str.match('JASPER|TOPAZ|SIBERITE|SKIPTON|MORTGAGE EXPRESS|NRAM|PLATFORM|AMBER|BHAM|CAPITAL|CHL|MORTGAGE TRUST|PARAGON|HESSONITE')==True),"Cat"] = 'Mortgage'
  dfB.loc[(dfB.Subcat.isnull())&(dfB.Cat=='PersonalExpense')&(dfB.Memo.str.match('^BP[ ].*|.*NEW COUNTY SERVICE.*|.*THE GARAGE.*|.*SHELL.*|.*MFG CHILDS WAY.*|.*MORRISONS PETRO.*',case=False)==True),'Subcat']='Garage'


### Now manually check categorisation

- open each file and save xls into /checked folder
- go through each Cat
- sum of Funds6045, Funds3072 should equal 0
- check Mortgages, BealsRents and OurRents all allocated to correct properties
- check PropertyExpenses are allocated correctly
- check Amazon payments
- mark Drawings and MortgageRefunds categories in all accounts (i.e. use Drawings for money taken out of RsaCapital *and* money paid into 4040 - need to filter on Drawings + Account if want to know money taken from RSA)