Skip to content

Commit

Permalink
Support pasting transactions in terminal (#22)
Browse files Browse the repository at this point in the history
* Support TSVs

* Log duplicate transactions instead of raising error

* Fix tests

* Reduce logging level for na rows

* Support input from stdin

* Add input format BMO_CC_WEB

* Add BMO_CC_ADOBE input format

* Check input more comprehensively
  • Loading branch information
patrick-5546 authored Jan 5, 2024
1 parent 3c52dac commit 1ff33c4
Show file tree
Hide file tree
Showing 5 changed files with 235 additions and 37 deletions.
64 changes: 48 additions & 16 deletions src/xlbudget/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
from abc import ABC, abstractmethod
from argparse import ArgumentParser, Namespace, _SubParsersAction
from logging import getLogger
from typing import List, Type
from typing import List, Optional, Type

from openpyxl import Workbook, load_workbook

from xlbudget.inputformat import GetInputFormats, parse_input
from xlbudget.inputformat import GetInputFormats, InputFormat, parse_input
from xlbudget.rwxlb import update_xlbudget

logger = getLogger(__name__)
Expand Down Expand Up @@ -124,8 +124,10 @@ class Update(Command):
aliases (List[str]): The command's CLI aliases.
Attributes:
input (str): The path to the input file.
input (Optional[str]): The path to the input file, otherwise paste in terminal.
format (inputformat.InputFormat): The input file format.
year (Optional[str]): The year all transactions were made, only relevant if
the input format is 'BMO_CC_ADOBE'.
"""

name: str = "update"
Expand All @@ -146,44 +148,74 @@ def configure_args(cls, subparsers: _SubParsersAction) -> None:
cmd_cls=Update,
)

parser.add_argument("input", help="path to the input file")
# required arguments
parser.add_argument(
"format",
action=GetInputFormats,
choices=GetInputFormats.input_formats.keys(),
help="select an input file format",
help="select an input format",
)

# optional arguments
parser.add_argument("-i", "--input", help="path to the input file")
parser.add_argument(
"-y",
"--year",
help="year that all transactions were made, only relevant if input format "
"is 'BMO_CC_ADOBE'",
)

def __init__(self, args: Namespace) -> None:
    """Validate the parsed CLI arguments and store them on the command.

    Args:
        args (Namespace): The parsed arguments; `input`, `format` and `year`
            are read from it.

    Raises:
        ValueError: If `_check_input` rejects the arguments.
    """
    super().__init__(args)

    # validate before storing so an invalid combination never reaches `run`
    self._check_input(args.input, args.format, args.year)
    self.input = args.input
    self.format = args.format
    self.year = args.year

    logger.debug(f"instance variables: {vars(self)}")

@staticmethod
def _check_input(
    input: Optional[str], input_format: Optional[InputFormat], year: Optional[str]
) -> None:
    """Check that `input` and `year` are valid for `input_format`.

    Args:
        input (Optional[str]): The input path, or None when the transactions
            are pasted in the terminal.
        input_format (Optional[InputFormat]): The input format.
        year (Optional[str]): The year of all transactions.

    Raises:
        ValueError: If `input` is not None and has the wrong file extension
            or does not exist.
        ValueError: If `input_format` is not a registered input format.
        ValueError: If `year` is None when `input_format` is 'BMO_CC_ADOBE'.
        ValueError: If `input` is not a TSV file for a tab-separated format.
    """
    if input is not None:
        in_ext = (".csv", ".tsv", ".txt")
        if not input.endswith(in_ext):
            raise ValueError(f"Input '{input}' does not end with one of '{in_ext}'")

        if not os.path.isfile(input):
            raise ValueError(f"Input '{input}' is not an existing file")

    if input_format is None:
        return

    # reverse lookup: the parser stores the InputFormat value, not its name
    for name, registered in GetInputFormats.input_formats.items():
        if registered == input_format:
            format_name = name
            break
    else:
        raise ValueError(f"{input_format!r} is not a registered input format")

    # validate year; this must also run when the transactions are pasted
    # (`input` is None), otherwise every date would be built without a year
    if format_name == "BMO_CC_ADOBE" and year is None:
        raise ValueError(
            f"Must specify 'year' argument when format={format_name!r}"
        )

    # validate input file type in more detail (only meaningful for files)
    if (
        input is not None
        and input_format.seperator == "\t"
        and not input.endswith(".tsv")
    ):
        raise ValueError(f"Input file should be TSV for format={format_name!r}")

def run(self) -> None:
logger.info(f"Parsing input file {self.input}")
df = parse_input(self.input, self.format)
logger.info(f"Parsing input {self.input}")
df = parse_input(self.input, self.format, self.year)
logger.debug(f"input file: {df.shape=}, df.dtypes=\n{df.dtypes}")
logger.debug(f"df.head()=\n{df.head()}")

Expand Down
200 changes: 183 additions & 17 deletions src/xlbudget/inputformat.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,137 @@
"""Input file format definitions."""

import io
import sys
from argparse import Action
from typing import Dict, List, NamedTuple
from logging import getLogger
from typing import Callable, Dict, List, NamedTuple, Optional

import numpy as np
import pandas as pd

from xlbudget.rwxlb import COLUMNS, df_drop_ignores, df_drop_na

logger = getLogger(__name__)


class InputFormat(NamedTuple):
    """Specifies the format of the input file.

    Attributes:
        header (int): The 0-indexed row of the header in the input file.
        names (List[str]): The column names.
        usecols (List[int]): The first len(`COLUMNS`) elements are indices of
            columns that map to `COLUMNS`; there may be indices after them for
            columns required for post-processing.
        ignores (List[str]): Regex patterns identifying transactions to ignore.
        pre_processing (Callable): The function to call before `pd.read_csv()`.
            It is called with the input and the year; the default returns the
            input unchanged.
        post_processing (Callable): The function to call after `pd.read_csv()`.
            It is called with the dataframe; the default returns it unchanged.
        seperator (str): The separator. The misspelt attribute name is kept
            because it is the name callers use.
    """

    header: int
    names: List[str]
    usecols: List[int]
    ignores: List[str]
    pre_processing: Callable = lambda input, _: input
    post_processing: Callable = lambda df: df
    seperator: str = ","

    def get_usecols_names(self) -> List[str]:
        """Return the names of the first three columns selected by `usecols`.

        Any further `usecols` entries are skipped, as they only exist for
        post-processing.
        """
        # NOTE(review): 3 presumably equals len(COLUMNS) -- confirm against rwxlb
        return [self.names[i] for i in self.usecols[:3]]


# define pre-processing functions below


def _adobe_amount(text: str) -> Optional[str]:
    """Return `text` as a plain numeric string, or None if it is not an amount."""
    # same "$" and "," clean-up the web formats apply in post-processing
    cleaned = text.replace("$", "").replace(",", "")
    try:
        float(cleaned)
    except ValueError:
        return None
    return cleaned


def bmo_cc_adobe_pre_processing(_input: Optional[str], year: str) -> io.StringIO:
    """Create CSV from input with each element on a new line.

    Every 4 consecutive lines (transaction date, posting date, description,
    amount) become one CSV row. An amount followed by a "CR" line is a credit
    and stays positive; any other numeric amount is negated.

    Args:
        _input (Optional[str]): The file to process, if `None` then read from stdin.
        year (str): The year of all transactions, appended to both dates.

    Raises:
        ValueError: If `year` is None.

    Returns:
        A[n] `io.StringIO` CSV, positioned at its start.
    """
    import csv  # local import: only this pre-processor writes CSV

    if year is None:
        # otherwise every date would end in " None" and fail to parse later
        raise ValueError("Must specify 'year' to pre-process 'BMO_CC_ADOBE' input")

    # get lines from stdin or file
    if _input is None:
        raw_lines = list(sys.stdin)
    else:
        with open(_input) as f:
            raw_lines = f.read().splitlines()

    # strip both sources alike, then trim blank lines at either end so a
    # trailing newline from a paste does not produce a bogus partial row
    lines = [line.strip() for line in raw_lines]
    while lines and not lines[-1]:
        lines.pop()
    first = 0
    while first < len(lines) and not lines[first]:
        first += 1
    lines = lines[first:]

    buffer = io.StringIO()
    # csv.writer quotes fields that contain commas (e.g. descriptions)
    writer = csv.writer(buffer, lineterminator="\n")

    i = 0
    while i < len(lines):
        elements = lines[i : i + 4]
        # a credited amount is followed by a line containing only "CR"
        credited = i + 4 < len(lines) and lines[i + 4] == "CR"

        # add year to dates (a partial trailing group may have fewer than 2)
        for j in range(min(2, len(elements))):
            elements[j] += f" {year}"

        if len(elements) == 4:
            # the header row is not numeric and is left untouched
            amount = _adobe_amount(elements[3])
            if amount is not None:
                elements[3] = amount if credited else "-" + amount

        writer.writerow(elements)
        i += 5 if credited else 4

    buffer.seek(0)
    return buffer


# define post-processing functions below


def bmo_acct_web_post_processing(df: pd.DataFrame) -> pd.DataFrame:
    """Creates the "Amount" column.

    "$" and "," are removed from the "Amount" and "Money in" columns before
    they are converted to floats. Each row takes its "Money in" value when one
    is present and its "Amount" value otherwise.

    Args:
        df (pd.DataFrame): The dataframe to process; it is not modified.

    Returns:
        A[n] `pd.DataFrame` that combines "Amount" and "Money in" to create
        "Amount", without the "Money in" column.
    """
    money_out = df["Amount"].replace("[$,]", "", regex=True).astype(float)
    money_in = df["Money in"].replace("[$,]", "", regex=True).astype(float)

    # `assign` works on a copy, so the caller's dataframe is left untouched
    combined = df.assign(Amount=money_in.fillna(money_out))
    return combined.drop(columns="Money in")


def bmo_cc_web_post_processing(df: pd.DataFrame) -> pd.DataFrame:
    """Formats the "Money in/out" column.

    "$" and "," are removed from the values before they are converted to
    floats.

    Args:
        df (pd.DataFrame): The dataframe to process; it is not modified.

    Returns:
        A[n] `pd.DataFrame` that converts "Money in/out" to a float.
    """
    amounts = df["Money in/out"].replace("[$,]", "", regex=True).astype(float)

    # the column name is not a valid identifier, hence the dict unpacking;
    # `assign` works on a copy, so the caller's dataframe is left untouched
    return df.assign(**{"Money in/out": amounts})


# define input formats below


BMO_ACCT = InputFormat(
header=3,
names=[
Expand All @@ -42,6 +145,21 @@ def get_usecols_names(self):
ignores=[r"^\[CW\] TF.*(?:285|593|625)$"],
)

# Tab-separated BMO account transactions whose header is the first row.
# NOTE(review): presumably the table copied from the online banking page -- confirm.
BMO_ACCT_WEB = InputFormat(
    header=0,
    names=[
        "Date",
        "Description",
        "Amount",  # actually named "Money out", but matches after post-processing
        "Money in",
        "Balance",
    ],
    # index 3 ("Money in") is only read by the post-processing function, which
    # folds it into "Amount" and drops it; "Balance" is not read at all
    usecols=[0, 1, 2, 3],
    ignores=[r"^TF.*(?:285|593|625)$"],
    post_processing=bmo_acct_web_post_processing,
    seperator="\t",
)

BMO_CC = InputFormat(
header=2,
names=[
Expand All @@ -56,6 +174,32 @@ def get_usecols_names(self):
ignores=[r"^TRSF FROM.*285"],
)

# Tab-separated BMO credit card transactions whose header is the first row.
# NOTE(review): presumably the table copied from the online banking page -- confirm.
BMO_CC_WEB = InputFormat(
    header=0,
    names=[
        "Transaction date",
        "Description",
        "Money in/out",
    ],
    # all three columns are used; no extra column is needed for post-processing
    usecols=[0, 1, 2],
    ignores=[r"^TRSF FROM.*285"],
    # strips "$" and "," from "Money in/out" and converts it to a float
    post_processing=bmo_cc_web_post_processing,
    seperator="\t",
)

# BMO credit card transactions supplied with one element per line; the
# pre-processing function turns them into comma-separated rows, which is why
# the default "," separator is kept.
# NOTE(review): presumably text copied from a PDF statement -- confirm.
BMO_CC_ADOBE = InputFormat(
    header=0,
    names=[
        "Transaction Date",
        "Posting Date",
        "Description",
        "Amount",
    ],
    # "Posting Date" (index 1) is skipped
    usecols=[0, 2, 3],
    ignores=[r"^TRSF FROM.*285"],
    # needs the transaction year, which it appends to both dates
    pre_processing=bmo_cc_adobe_pre_processing,
)


# define input formats above

Expand All @@ -76,34 +220,48 @@ def __call__(self, parser, namespace, values, option_string=None):
setattr(namespace, self.dest, self.input_formats[values])


def parse_input(path: str, format: InputFormat) -> pd.DataFrame:
"""Parses an input file.
def parse_input(
input: Optional[str], format: InputFormat, year: Optional[str]
) -> pd.DataFrame:
"""Parses an input.
Args:
path (str): The path to the input file.
format (InputFormat): The input file format.
input (Optional[str]): The path to the input file, if None parse from stdin.
format (InputFormat): The input format.
year (Optional[str]): The year of all transactions.
Raises:
ValueError: If input file contains duplicate transactions.
ValueError: If input contains duplicate transactions.
Returns:
A[n] `pd.DataFrame` where the columns match the xlbudget file's column names.
"""
input_initially_none = input is None
if input_initially_none:
print("Paste your transactions here (CTRL+D twice on a blank line to end):")

input = format.pre_processing(input, year)

df = pd.read_csv(
path,
header=format.header,
input if input is not None else sys.stdin,
sep=format.seperator,
index_col=False,
names=format.names,
header=format.header if input is not None else None,
usecols=format.usecols,
parse_dates=[0],
skip_blank_lines=False,
)

df = df_drop_na(df)
if input_initially_none:
print("---End of transactions---")

# TODO: write issues to make ignoring duplicate transactions interactive
# they might not be an error
# TODO: investigate autocompletions
if df.duplicated().any():
raise ValueError("Input file contains duplicate transactions")
df = format.post_processing(df)

# convert first column to datetime and replace any invalid values with NaT
df[df.columns[0]] = pd.to_datetime(df[df.columns[0]], errors="coerce")

df = df_drop_na(df)

df.columns = df.columns.str.strip()

Expand All @@ -114,12 +272,20 @@ def parse_input(path: str, format: InputFormat) -> pd.DataFrame:
df = df.set_axis([c.name for c in COLUMNS], axis="columns")

# sort rows by date
df = df.sort_values(by="Date")
df = df.sort_values(by=list(df.columns), ascending=True)

# strip whitespace from descriptions
df["Description"] = df["Description"].str.strip()

# drop ignored transactions
df = df_drop_ignores(df, "|".join(format.ignores))

# TODO: write issues to make ignoring identical transactions interactive
# TODO: investigate autocompletions
if df.duplicated().any():
logger.warning(
"The following transactions are identical:\n"
f"{df[df.duplicated(keep=False)]}"
)

return df
Loading

0 comments on commit 1ff33c4

Please sign in to comment.