Support pasting transactions in terminal (#22)

* Support TSVs
* Log duplicate transactions instead of raising error
* Fix tests
* Reduce logging level for na rows
* Support input from stdin
* Add input format BMO_CC_WEB
* Add BMO_CC_ADOBE input format
* Check input more comprehensively
patrick-5546 · Jan 5, 2024 · 1ff33c4 · 1ff33c4
1 parent 3c52dac
commit 1ff33c4
Show file tree

Hide file tree

Showing 5 changed files with 235 additions and 37 deletions.
diff --git a/src/xlbudget/commands.py b/src/xlbudget/commands.py
@@ -5,11 +5,11 @@
 from abc import ABC, abstractmethod
 from argparse import ArgumentParser, Namespace, _SubParsersAction
 from logging import getLogger
-from typing import List, Type
+from typing import List, Optional, Type
 
 from openpyxl import Workbook, load_workbook
 
-from xlbudget.inputformat import GetInputFormats, parse_input
+from xlbudget.inputformat import GetInputFormats, InputFormat, parse_input
 from xlbudget.rwxlb import update_xlbudget
 
 logger = getLogger(__name__)
@@ -124,8 +124,10 @@ class Update(Command):
         aliases (List[str]): The command's CLI aliases.
 
     Attributes:
-        input (str): The path to the input file.
+        input (Optional[str]): The path to the input file, otherwise paste in terminal.
         format (inputformat.InputFormat): The input file format.
+        year (Optional[str]): The year all transactions were made, only relevant if
+            the input format is 'BMO_CC_ADOBE'.
     """
 
     name: str = "update"
@@ -146,44 +148,74 @@ def configure_args(cls, subparsers: _SubParsersAction) -> None:
             cmd_cls=Update,
         )
 
-        parser.add_argument("input", help="path to the input file")
+        # required arguments
         parser.add_argument(
             "format",
             action=GetInputFormats,
             choices=GetInputFormats.input_formats.keys(),
-            help="select an input file format",
+            help="select an input format",
+        )
+
+        # optional arguments
+        parser.add_argument("-i", "--input", help="path to the input file")
+        parser.add_argument(
+            "-y",
+            "--year",
+            help="year that all transactions were made, only relevant if input format "
+            "is 'BMO_CC_ADOBE'",
         )
 
     def __init__(self, args: Namespace) -> None:
         super().__init__(args)
 
-        self._check_input(args.input)
+        self._check_input(args.input, args.format, args.year)
         self.input = args.input
         self.format = args.format
+        self.year = args.year
 
         logger.debug(f"instance variables: {vars(self)}")
 
     @staticmethod
-    def _check_input(input: str) -> None:
-        """Check that `input` is a valid path to an input file.
+    def _check_input(
+        input: Optional[str], input_format: Optional[InputFormat], year: Optional[str]
+    ) -> None:
+        """Check that `input` and `year` are valid.
 
         Args:
-            input (str): The input path.
+            input (Optional[str]): The input path.
+            input_format (Optional[InputFormat]): The input format.
+            year (Optional[str]): The year of all transactions.
 
         Raises:
-            ValueError: If `input` is not a CSV file.
-            ValueError: If `input` is not an existing file.
+            ValueError: If `input` is not None and the wrong file extension or DNE.
+            ValueError: If `year` is None when `input_format` is 'BMO_CC_ADOBE'.
         """
-        csv_ext = ".csv"
-        if not input.endswith(csv_ext):
-            raise ValueError(f"Input '{input}' does not end with '{csv_ext}'")
+        if input is None:
+            return
+
+        in_ext = (".csv", ".tsv", ".txt")
+        if not input.endswith(in_ext):
+            raise ValueError(f"Input '{input}' does not end with one of '{in_ext}'")
 
         if not os.path.isfile(input):
             raise ValueError(f"Input '{input}' is not an existing file")
 
+        # get key from value: https://stackoverflow.com/a/13149770
+        if input_format is not None:
+            # validate year
+            format = list(GetInputFormats.input_formats.keys())[
+                list(GetInputFormats.input_formats.values()).index(input_format)
+            ]
+            if format == "BMO_CC_ADOBE" and year is None:
+                raise ValueError(f"Must specify 'year' argument when {format=}")
+
+            # validate input file type in more detail
+            if input_format.seperator == "\t" and not input.endswith(".tsv"):
+                raise ValueError(f"Input file should be TSV for {format=}")
+
     def run(self) -> None:
-        logger.info(f"Parsing input file {self.input}")
-        df = parse_input(self.input, self.format)
+        logger.info(f"Parsing input {self.input}")
+        df = parse_input(self.input, self.format, self.year)
         logger.debug(f"input file: {df.shape=}, df.dtypes=\n{df.dtypes}")
         logger.debug(f"df.head()=\n{df.head()}")
 

diff --git a/src/xlbudget/inputformat.py b/src/xlbudget/inputformat.py
@@ -1,34 +1,137 @@
 """Input file format definitions."""
 
+import io
+import sys
 from argparse import Action
-from typing import Dict, List, NamedTuple
+from logging import getLogger
+from typing import Callable, Dict, List, NamedTuple, Optional
 
+import numpy as np
 import pandas as pd
 
 from xlbudget.rwxlb import COLUMNS, df_drop_ignores, df_drop_na
 
+logger = getLogger(__name__)
+
 
 class InputFormat(NamedTuple):
     """Specifies the format of the input file.
 
     Attributes:
         header (int): The 0-indexed row of the header in the input file.
         names (List[str]): The column names.
-        usecols (List[int]): The indices of columns that map to `COLUMNS`.
+        usecols (List[int]): The first len(`COLUMNS`) elements are indices of columns
+            that map to `COLUMNS`; there may be indices after them for columns
+            required for post-processing.
         ignores (List[str]): Ignore transactions that contain with these regex patterns.
+        pre_processing (Callable): The function to call before `pd.read_csv()`.
+        post_processing (Callable): The function to call after `pd.read_csv()`.
+        seperator (str): The column separator passed to `pd.read_csv()`.
     """
 
     header: int
     names: List[str]
     usecols: List[int]
     ignores: List[str]
+    pre_processing: Callable = lambda input, _: input
+    post_processing: Callable = lambda df: df
+    seperator: str = ","
 
     def get_usecols_names(self):
-        return [self.names[i] for i in self.usecols]
+        return [self.names[i] for i in self.usecols[:3]]
+
+
+# define pre-processing functions below
+
+
+def bmo_cc_adobe_pre_processing(_input: Optional[str], year: str) -> io.StringIO:
+    """Create CSV from input with each element on a new line.
+
+    Args:
+        _input (Optional[str]): The file to process, if `None` then read from stdin.
+        year (str): The year of all transactions.
+
+    Returns:
+        A[n] `io.StringIO` CSV.
+    """
+    # get lines from stdin or file
+    if _input is None:
+        lines = []
+        for line in sys.stdin:
+            lines.append(line.strip())
+    else:
+        with open(_input) as f:
+            lines = f.read().splitlines()
+
+    rows = []
+    i = 0
+    while i < len(lines):
+        elements = lines[i : i + 4]
+
+        # add year to dates
+        elements[0] += f" {year}"
+        elements[1] += f" {year}"
+
+        # add negative sign to amounts that are not credited (CR on next line)
+        if i + 4 < len(lines) and lines[i + 4] == "CR":
+            i += 5
+        else:
+            # check if amount is a float (header will not be float)
+            is_float = True
+            try:
+                float(elements[-1])
+            except ValueError:
+                is_float = False
+
+            if is_float:
+                elements[-1] = "-" + elements[-1]
+
+            i += 4
+
+        row = ",".join(elements) + "\n"
+        rows.append(row)
+
+    new_input = "".join(rows)
+    return io.StringIO(new_input)
+
+
+# define post-processing functions below
+
+
+def bmo_acct_web_post_processing(df: pd.DataFrame) -> pd.DataFrame:
+    """Creates the "Amount" column.
+
+    Args:
+        df (pd.DataFrame): The dataframe to process.
+
+    Returns:
+        A[n] `pd.DataFrame` that combines "Amount" and "Money in" to create "Amount".
+    """
+    df["Amount"] = df["Amount"].replace("[$,]", "", regex=True).astype(float)
+    df["Money in"] = df["Money in"].replace("[$,]", "", regex=True).astype(float)
+    df["Amount"] = np.where(df["Money in"].isna(), df["Amount"], df["Money in"])
+    df = df.drop("Money in", axis=1)
+    return df
+
+
+def bmo_cc_web_post_processing(df: pd.DataFrame) -> pd.DataFrame:
+    """Formats the "Money in/out" column.
+
+    Args:
+        df (pd.DataFrame): The dataframe to process.
+
+    Returns:
+        A[n] `pd.DataFrame` that converts "Money in/out" to a float.
+    """
+    df["Money in/out"] = (
+        df["Money in/out"].replace("[$,]", "", regex=True).astype(float)
+    )
+    return df
 
 
 # define input formats below
 
+
 BMO_ACCT = InputFormat(
     header=3,
     names=[
@@ -42,6 +145,21 @@ def get_usecols_names(self):
     ignores=[r"^\[CW\] TF.*(?:285|593|625)$"],
 )
 
+BMO_ACCT_WEB = InputFormat(
+    header=0,
+    names=[
+        "Date",
+        "Description",
+        "Amount",  # actually named "Money out", but matches after post-processing
+        "Money in",
+        "Balance",
+    ],
+    usecols=[0, 1, 2, 3],
+    ignores=[r"^TF.*(?:285|593|625)$"],
+    post_processing=bmo_acct_web_post_processing,
+    seperator="\t",
+)
+
 BMO_CC = InputFormat(
     header=2,
     names=[
@@ -56,6 +174,32 @@ def get_usecols_names(self):
     ignores=[r"^TRSF FROM.*285"],
 )
 
+BMO_CC_WEB = InputFormat(
+    header=0,
+    names=[
+        "Transaction date",
+        "Description",
+        "Money in/out",
+    ],
+    usecols=[0, 1, 2],
+    ignores=[r"^TRSF FROM.*285"],
+    post_processing=bmo_cc_web_post_processing,
+    seperator="\t",
+)
+
+BMO_CC_ADOBE = InputFormat(
+    header=0,
+    names=[
+        "Transaction Date",
+        "Posting Date",
+        "Description",
+        "Amount",
+    ],
+    usecols=[0, 2, 3],
+    ignores=[r"^TRSF FROM.*285"],
+    pre_processing=bmo_cc_adobe_pre_processing,
+)
+
 
 # define input formats above
 
@@ -76,34 +220,48 @@ def __call__(self, parser, namespace, values, option_string=None):
         setattr(namespace, self.dest, self.input_formats[values])
 
 
-def parse_input(path: str, format: InputFormat) -> pd.DataFrame:
-    """Parses an input file.
+def parse_input(
+    input: Optional[str], format: InputFormat, year: Optional[str]
+) -> pd.DataFrame:
+    """Parses an input.
 
     Args:
-        path (str): The path to the input file.
-        format (InputFormat): The input file format.
+        input (Optional[str]): The path to the input file, if None parse from stdin.
+        format (InputFormat): The input format.
+        year (Optional[str]): The year of all transactions.
 
     Raises:
-        ValueError: If input file contains duplicate transactions.
+        ValueError: If input contains duplicate transactions.
 
     Returns:
         A[n] `pd.DataFrame` where the columns match the xlbudget file's column names.
     """
+    input_initially_none = input is None
+    if input_initially_none:
+        print("Paste your transactions here (CTRL+D twice on a blank line to end):")
+
+    input = format.pre_processing(input, year)
+
     df = pd.read_csv(
-        path,
-        header=format.header,
+        input if input is not None else sys.stdin,
+        sep=format.seperator,
+        index_col=False,
+        names=format.names,
+        header=format.header if input is not None else None,
         usecols=format.usecols,
         parse_dates=[0],
         skip_blank_lines=False,
     )
 
-    df = df_drop_na(df)
+    if input_initially_none:
+        print("---End of transactions---")
 
-    # TODO: write issues to make ignoring duplicate transactions interactive
-    # they might not be an error
-    # TODO: investigate autocompletions
-    if df.duplicated().any():
-        raise ValueError("Input file contains duplicate transactions")
+    df = format.post_processing(df)
+
+    # convert first column to datetime and replace any invalid values with NaT
+    df[df.columns[0]] = pd.to_datetime(df[df.columns[0]], errors="coerce")
+
+    df = df_drop_na(df)
 
     df.columns = df.columns.str.strip()
 
@@ -114,12 +272,20 @@ def parse_input(path: str, format: InputFormat) -> pd.DataFrame:
     df = df.set_axis([c.name for c in COLUMNS], axis="columns")
 
     # sort rows by date
-    df = df.sort_values(by="Date")
+    df = df.sort_values(by=list(df.columns), ascending=True)
 
     # strip whitespace from descriptions
     df["Description"] = df["Description"].str.strip()
 
     # drop ignored transactions
     df = df_drop_ignores(df, "|".join(format.ignores))
 
+    # TODO: write issues to make ignoring identical transactions interactive
+    # TODO: investigate autocompletions
+    if df.duplicated().any():
+        logger.warning(
+            "The following transactions are identical:\n"
+            f"{df[df.duplicated(keep=False)]}"
+        )
+
     return df