In [2]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

## Categories
1. Housing

- Rent/Mortgage
- Property Taxes
- Home Insurance
- Maintenance/Repairs
- Utilities (Electricity, Water, Gas)

2. Transportation
- Car Payments/Leasing
- Fuel
- Insurance
- Public Transportation
- Maintenance and Repairs
- Parking Fees/Tolls

3. Food & Groceries
- Groceries
- Dining Out
- Coffee/Snacks
- Meal Delivery Services

4. Health & Wellness

- Health Insurance
- Medical Bills (Doctor, Dentist, etc.)
- Medications
- Fitness (Gym Memberships, Classes)
- Personal Care (Haircuts, Skincare)

5. Entertainment & Leisure

- Streaming Subscriptions
- Movies, Concerts, Events
- Books, Magazines
- Hobbies (Music, Crafts, etc.)
- Travel/Vacations

6. Personal & Family Care

- Childcare/Schooling
- Pet Care (Food, Vet)
- Clothing & Accessories
- Gifts & Special Occasions

7. Insurance

- Life Insurance
- Disability Insurance
- Home/Renters Insurance
- Vehicle Insurance

8. Debt Repayment

- Credit Card Payments
- Student Loans
- Personal Loans
- Mortgage

9. Savings & Investments

- Emergency Fund
- Retirement Contributions
- Stocks/Mutual Funds
- Savings Accounts

10. Miscellaneous

- Donations/Charity
- Professional Services (Legal, Accounting)
- Miscellaneous Fees/Bank Fees
- Unexpected/One-time Expenses

In [3]:
# Folder holding the CSV export(s), relative to the notebook's working directory.
DATA_DIR = Path('data')

# Path.glob gives no ordering guarantee, so sort the matches to make the
# selected file reproducible when more than one CSV is present.
csv_files = sorted(DATA_DIR.glob('*.csv'))
if not csv_files:
  # Fail with an explicit message instead of a bare IndexError on `[0]`.
  raise FileNotFoundError(f'No *.csv file found in {DATA_DIR.resolve()}')
FNAME = csv_files[0]

In [4]:
# Column labels applied to the export; the original header row is skipped and
# the eighth (unnamed) column becomes the index.
header = ['Date', 'Exe Date', 'Title', 'Vendor', 'Account', 'Amount', 'Balance', '']


def _decimal_comma_to_float(column):
  """Convert a text column that uses ',' as the decimal mark into floats."""
  return column.str.replace(',', '.').astype(float)


df = pd.read_csv(
  FNAME,
  skiprows=1,
  names=header,
  usecols=range(8),
  index_col=7,
)
# Missing vendors become the literal string 'nan' after this cast.
df['Vendor'] = df['Vendor'].astype(str)
# The sign of Amount is flipped on load; Balance keeps its sign.
df['Amount'] = _decimal_comma_to_float(df['Amount']) * -1
df['Balance'] = _decimal_comma_to_float(df['Balance'])
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Rounded snapshot that the later cells copy from.
DATA = df.round(2)

In [5]:
# Show up to 200 rows and never truncate cell text when a frame is displayed.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_colwidth', None)

In [69]:
# Keyword table used by the categorisation cell to bucket transactions.
#   key   -> category name (upper-cased to become a column of the summary table)
#   value -> substrings looked up in the lower-cased transaction text, which is
#            why every entry is written in lower case.
# Most categories are searched in the Title column; 'income', 'utilities',
# 'child_care', 'garden' and 'insurance' are searched in Vendor, and
# 'child_care_a' (an account number) is searched in Account.
# Dict order matters: rows claimed by an earlier category are removed before the
# later categories are tested, so each transaction is counted only once.
CATEGORIES = {
  'income'        : ['citibank', 'mishchenko'],
  'lokaty'        : ['lokaty'],
  'blik'          : ['blik'],
  'constr'        : ['leroy-merlin', 'market obi', 'elektry', '302/24/2'],
  'groc'          : ['biedronka', 'lidl', 'aldi', 'zabka', 'spolem', 'kaufland', 'netto', 'delikatesy', 'kiosk', 'sklepik', 'lewiatan', 'food', 'transgourmet', 'kartofelek',
                     'za królika', 'el-mi', 'dom wina'],
  'drug'          : ['rossmann', 'dm-drog', 'apteka', 'dom lek'],
  'bakery'        : ['piekarnia', 'familijna', 'cukiernia', 'rozana', 'coffee', 'rogalik', 'cafe', 'puri', 'tandyr', 'paczkarnia', 'precel', 'helen', 'kawiarnia', 'bombonierka',
                     'glodny i zly', 'sie upieklo'],
  'ice_cream'     : ['lodziarni', 'ice sweet', 'roma', 'agawa', 'lody', 'galeria wroclav'],
  'restaurants'   : ['pedet', 'padet', 'ekushe', 'mcdonalds', 'chinski', 'gordito', 'pod papu', 'aston', 'restauracja', 'kebab', 'kocur', 'pizza', 'ramen', 'pasibus', 'sevi',
                     'pierogarnia', 'gastro', 'burger', 'pinta', 'taormina', 'kedzierzyn-ko', 'wino spod', 'lissa', 'kinga danielczak', 'p turawa', 'polana', 'kalimasu', 
                     'shauman', 'viet-thai', 'czerwone sombrero', 'oliwa i ogien', 'ogrodek letni'],
  'clothing'      : ['deichmann', 'sinsay'],
  'car'           : ['rentalcars', 'uber', 'bolt', 'orlen', 'bp-lena', 'paliw', 'parking', 'circle k', 'carsharing', 'mol s'],
  'train'         : ['koleje', 'pkp'],
  'pt'            : ['urbancard', 'statek dla'],
  'consumer_goods': ['amazon', 'allegro', 'olx', 'free look', 'duka', 'a386', 'pepco', 'ikea', 'tiger', 'empik'],
  'hotel'         : ['hotel', 'resort', 'altus', 'reso'],
  'culture'       : ['teatr', 'forum muzyki', 'zoo', 'museum', 'muzeum'],
  'entertainment' : ['przystan zwierzyniecka'],
  'atm'           : ['bankomat'],
  'igi'           : ['lego', 'smyk', 'park wodn', 'igor', 'kolejkowo', 'pirat automaty', 'funpark', 'krakowiany'],
  'child_care'    : ['fantazja'],
  'child_care_a'  : ['64 1600 1475 3300 3009 6491 0000'],
  'personal_care' : ['piekna', 'fryzjer', 'hair'],
  'utilities'     : ['wspólnota miesz', 'marpis'],
  'utilities_t'   : ['opłaty zaliczka', 'wodomierza'],
  'housing'       : ['wpo-dnf.3120', 'podatek od nieruchomości'],
  'insurance'     : ['compensa'],
  'fees'          : ['opłata za wznowienie', 'przewalutowanie'],
  'garden'        : ['benex', 'rod odra', 'świat kwiatów'],
}
# Title fragments: any transaction whose Title contains one of these
# (compared case-insensitively) is dropped before categorisation.
EXCLUDE = [
  'Present Kamil',
  'present konrad',
  'present natalia',
  'Natalias farewell',
  'Reichmann 26012',
  'have fun',
  'test',
  '2405090834092305',
  '1912184',
  'P24-J7G-G4V-C1F',
]
# Account numbers: transactions whose Account contains one of these are dropped too.
EXCLUDE_A = [
  '87 1140 2004 0000 3402 3476 9350',
  '50 1020 5558 1111 1044 6800 0087',
]
# Flat list of every category keyword, prefixed with an extra 'blik' entry.
ALL = ['blik'] + [keyword for keywords in CATEGORIES.values() for keyword in keywords]


In [70]:
# Scratch check: lower-case a name as it appears in a transaction title
# (presumably to obtain the 'ogrodek letni' keyword listed in CATEGORIES).
'OGRODEK LETNI'.lower()

'ogrodek letni'

In [71]:
# Build the per-month summary table `a` (one column per category) and leave the
# transactions that matched no category in `df` for inspection in the next cell.


def _contains_any(text, keywords):
  """Flag the entries of `text` that contain at least one of `keywords`.

  Matching is case-insensitive and literal: each keyword is lower-cased and
  regex-escaped before the keywords are OR-ed into one pattern, so characters
  such as '.' in a keyword only match themselves.

  Parameters
  ----------
  text : pd.Series of str
  keywords : iterable of str

  Returns
  -------
  pd.Series of bool aligned with `text`; all False when no usable keyword is
  given (an empty pattern would otherwise match every row).
  """
  pattern = '|'.join(re.escape(kw.lower()) for kw in keywords if kw)
  if not pattern:
    return pd.Series(False, index=text.index)
  return text.str.lower().str.contains(pattern)


# Categories that are not searched in Title, mapped to the column used instead.
MATCH_COLUMN = {
  'income': 'Vendor',
  'utilities': 'Vendor',
  'child_care': 'Vendor',
  'garden': 'Vendor',
  'insurance': 'Vendor',
  'child_care_a': 'Account',
}

# Work on a copy so DATA stays untouched; blank out NaNs so `.str` matching
# never produces missing values in the masks.
df = DATA.copy()
df = df.fillna('')

# Drop transactions that must not be counted at all.
df = df[~_contains_any(df.Title, EXCLUDE)]
df = df[~_contains_any(df.Account, EXCLUDE_A)]

# One row per (year, month) plus a trailing SUM row.
month_keys = [(y, m) for y in [2023, 2024] for m in range(1, 13)]
a = pd.DataFrame(index=pd.MultiIndex.from_tuples(month_keys + [('SUM', '')], names=['Year', 'Month']))

for cat, vals in CATEGORIES.items():
  col = MATCH_COLUMN.get(cat, 'Title')
  cut = _contains_any(df[col], vals)
  df_ = df[cut]
  a[cat.upper()] = df_.groupby([df_['Date'].dt.year, df_['Date'].dt.month]).Amount.sum()
  df = df[~cut]  # remove to avoid double counting

# Whatever is left matched no category.
a['OTHER'] = df.groupby([df['Date'].dt.year, df['Date'].dt.month]).Amount.sum()
a.loc[('SUM', '')] = a.sum()
# Drop the first three index rows (January-March 2023).
# NOTE(review): SUM is computed before this slice, so it still includes any
# amounts from those three months - confirm that is intended.
a = a.fillna(0).iloc[3:]

In [72]:
# Ad-hoc drill-down into `df` as left behind by the previous cell.
# c0..c4 start as all-ones (no filtering); swap any of them for one of the
# commented filters below. The five masks are AND-ed together into `c`.
df = df.copy()
c0, c1, c2, c3, c4 = (np.ones(len(df)) for _ in range(5))

# --- optional filters -------------------------------------------------------
# c0 = df.Title.str.lower().str.contains('kolej')
# c0 = df.Title.str.lower().str.contains('|'.join(CATEGORIES['groc']))
# c0 = ~df.Title.str.lower().str.contains('|'.join(ALL))
# c1 = df.Vendor == 'nan'
# c1 = df.Date.dt.month == 1
# c2 = df.Date.dt.year == 2024
# c3 = df.Amount > 0

c = np.logical_and.reduce([c0, c1, c2, c3, c4])
dfc = df[c]

# Monthly totals of the selected rows (note: this rebinds `a`).
year_of, month_of = dfc['Date'].dt.year, dfc['Date'].dt.month
a = dfc.groupby([year_of, month_of]).Amount.sum()

# --- alternative views ------------------------------------------------------
# df[c][['Date', 'Amount', 'Vendor']]
# if len(df[c]):
# dfc.groupby(dfc['Date'].dt.month).Amount.sum()
# dfc.Amount.sum()
# cols = ['Date', 'Title', 'Vendor', 'Amount']
# a

print(len(dfc))
dfc.head(200)

84


Unnamed: 0,Date,Exe Date,Title,Vendor,Account,Amount,Balance
,,,,,,,
209.0,2024-08-05,04-08-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 6.00 PLN DBFLORIST WROCLAW,,,6.0,4006.5
212.0,2024-08-04,03-08-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 13.00 PLN BISTROCYKL WROCLAW,,,13.0,2182.5
244.0,2024-07-22,21-07-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 9.00 PLN FoRest Wroclaw,,,9.0,2348.18
304.0,2024-06-30,29-06-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 12.00 PLN Wroclawskie Przedsiebi Wroclaw,,,12.0,1781.64
433.0,2024-05-02,01-05-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 59.00 PLN PTT INVEST Sp. z o.o. Krakow,,,59.0,369.54
434.0,2024-05-02,01-05-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 16.00 PLN PTT INVEST Sp. z o.o. Krakow,,,16.0,428.54
435.0,2024-05-02,01-05-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 80.00 PLN MTP Poznan,,,80.0,485.34
438.0,2024-05-01,30-04-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 56.00 PLN TRAKT POZNAN,,,56.0,709.34
441.0,2024-04-30,29-04-2024,DOP. MC 557519******1718 PŁATNOŚĆ KARTĄ 30.00 PLN MPK POZNAN STACJA MALT POZNAN,,,30.0,1242.31
