In [7]:
import pandas as pd
import re
import calendar
from pathlib import Path
from typing import List, Optional, Union  # ← this line is essential!


In [14]:
# Windows folder locations. Every backslash is written as "\\" so that no
# segment (e.g. "\python") is read as an escape sequence; the strings keep
# their trailing separator so a file name can be appended directly.
path = 'E:\\dtuklaptop\\e\\Users\\Mat\\python\\data\\property\\bank-download\\'
generated_path = 'E:\\dtuklaptop\\e\\Users\\Mat\\python\\data\\property\\generated\\'

In [17]:
def split_transactions_by_month(
    input_csv: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    account_prefix_override: Optional[str] = None,
    encoding: str = "utf-8",
    verbose: bool = True,
) -> List[Path]:
    """
    Split a transactions CSV into one CSV per account prefix and calendar month.

    The input must provide the columns
      Number, Date, Account, Amount, Subcategory, Memo
    (header names are stripped and title-cased before the check), with Date in
    UK format (dd/MM/YYYY). Each output file holds one month of rows in
    ascending date order and is named:
      BC_XXXX_MONYYYY.csv
    where XXXX is the account prefix, MON the upper-case month abbreviation
    and YYYY the four-digit year.

    Args:
        input_csv: Path of the CSV to split.
        output_dir: Folder for the monthly files; created if needed. Defaults
            to the folder containing ``input_csv``.
        account_prefix_override: If given (and non-empty), used as XXXX for
            every row instead of deriving it from the Account column.
        encoding: Text encoding used to read ``input_csv``.
        verbose: When true, print how many files were written.

    Returns:
        The written paths, ordered by account prefix, then year, then month.

    Raises:
        ValueError: If a required column is missing, a Date cannot be parsed,
            or an Account value has fewer than 4 digits in its last digit run.
    """
    columns = ["Number", "Date", "Account", "Amount", "Subcategory", "Memo"]

    input_csv = Path(input_csv)
    if output_dir is None:
        output_dir = input_csv.parent
    else:
        output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Read CSV; keep Account as string so leading zeros are preserved
    df = pd.read_csv(input_csv, dtype={"Account": str}, encoding=encoding)

    # Normalize column names
    df.columns = [c.strip().title() for c in df.columns]
    missing = [c for c in columns if c not in df.columns]
    if missing:
        raise ValueError("Missing required columns: {}".format(missing))

    # Parse UK-style dates. The raw column is left untouched until parsing has
    # succeeded so that the error report can show the offending text rather
    # than a column of NaT.
    parsed = pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")
    unparsed = parsed.isna()
    if unparsed.any():
        bad = df.loc[unparsed, ["Number", "Date", "Account"]]
        raise ValueError(
            "Some Date values could not be parsed with dd/MM/YYYY. "
            "Offending rows:\n{}".format(bad.head().to_string(index=False))
        )
    df["Date"] = parsed

    # A stable sort keeps same-day rows in their input order.
    df = df.sort_values("Date", ascending=True, kind="mergesort").reset_index(drop=True)

    def get_prefix(acc):
        # The account number is taken to be the last run of digits, so that a
        # leading sort code ("11-22-33 44445555") does not end up in the name.
        runs = re.findall(r"\d+", str(acc))
        account_digits = runs[-1] if runs else ""
        if len(account_digits) < 4:
            raise ValueError("Account '{}' doesn’t contain 4 digits.".format(acc))
        return account_digits[:4]

    if account_prefix_override:
        df["_AcctPrefix"] = account_prefix_override
    else:
        df["_AcctPrefix"] = df["Account"].apply(get_prefix)

    df["_Year"] = df["Date"].dt.year
    df["_MonthNum"] = df["Date"].dt.month

    written_paths = []
    # Group on the month *number*: grouping on the abbreviation would order the
    # files alphabetically (APR, AUG, ...) instead of chronologically.
    grouped = df.groupby(["_AcctPrefix", "_Year", "_MonthNum"], sort=True)
    for (acct4, year, month_num), g in grouped:
        mon_abbr = calendar.month_abbr[int(month_num)].upper()
        out_path = output_dir / "BC_{}_{}{}.csv".format(acct4, mon_abbr, year)
        # Rows inside each group keep the date order established above.
        g[columns].to_csv(out_path, index=False, date_format="%d/%m/%Y")
        written_paths.append(out_path)

    if verbose:
        print("Wrote {} file(s) to '{}'".format(len(written_paths), output_dir))

    return written_paths

In [23]:
import pandas as pd
import re
import calendar
import csv
from pathlib import Path
from typing import List, Optional, Union

# Column names, in order, that the readers below validate against and that
# every split file is written with.
EXPECTED_COLS = ["Number", "Date", "Account", "Amount", "Subcategory", "Memo"]

def _robust_read_csv(path, encoding="utf-8"):
    """
    Read a 'mostly CSV' file where some rows may have extra commas.

    Every kept row is normalized to ``len(EXPECTED_COLS)`` fields: short rows
    are padded with empty strings, and any fields beyond the last expected
    column are joined back into Memo with commas. Blank rows are skipped
    wherever they occur, including at the top of the file. The first non-blank
    row is dropped if, once stripped and title-cased, it equals EXPECTED_COLS
    (i.e. it is the header); otherwise it is kept as data.

    Args:
        path: File to read.
        encoding: Text encoding used to open the file.

    Returns:
        A DataFrame of strings with EXPECTED_COLS as its columns (empty if the
        file has no data rows).
    """
    n_cols = len(EXPECTED_COLS)

    def _fit(row):
        # Pad a short row, or fold overflow fields back into the last column.
        if len(row) < n_cols:
            return row + [""] * (n_cols - len(row))
        if len(row) > n_cols:
            return row[: n_cols - 1] + [",".join(row[n_cols - 1:])]
        return row

    rows = []
    header_checked = False
    with open(path, "r", encoding=encoding, newline="") as f:
        for row in csv.reader(f):  # handles quotes if they exist
            if not header_checked and row:
                # A UTF-8 byte-order mark would otherwise stick to the first
                # cell and make a genuine header look like data.
                row[0] = row[0].lstrip("\ufeff")
            if all(cell.strip() == "" for cell in row):
                continue
            if not header_checked:
                header_checked = True
                if [cell.strip().title() for cell in row] == EXPECTED_COLS:
                    continue
            rows.append(_fit(row))

    return pd.DataFrame(rows, columns=EXPECTED_COLS)

def split_transactions_by_month(
    input_csv: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    account_prefix_override: Optional[str] = None,
    encoding: str = "utf-8",
    verbose: bool = True,
) -> List[Path]:
    """
    Split a transactions CSV into one CSV per account prefix and calendar month.

    The input must provide the columns
      Number, Date, Account, Amount, Subcategory, Memo
    where Date is UK format (dd/MM/YYYY). Rows are sorted by date ascending and
    written to files named: BC_XXXX_MONYYYY.csv

    Args:
        input_csv: Path of the CSV to split.
        output_dir: Folder for the monthly files; created if needed. Defaults
            to the folder containing ``input_csv``.
        account_prefix_override: If given, must be exactly 4 digits and is
            used as XXXX for every row instead of the Account column.
        encoding: Text encoding used to read ``input_csv``.
        verbose: When true, print how many files were written.

    Returns:
        The written paths, ordered by account prefix, then year, then month.

    Raises:
        ValueError: If required columns are missing after parsing, a Date
            cannot be parsed, the override is not 4 digits, or an Account
            value has no usable digit run.
    """
    input_csv = Path(input_csv)
    output_dir = Path(output_dir) if output_dir else input_csv.parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # First attempt: normal pandas read (fast)
    try:
        df = pd.read_csv(
            input_csv,
            dtype={"Account": str},
            encoding=encoding,
            engine="c",
        )
        df.columns = [c.strip().title() for c in df.columns]
        # Validate expected columns quickly; otherwise fall back
        if any(col not in df.columns for col in EXPECTED_COLS):
            raise ValueError("Missing required columns in header; trying robust read…")
    except Exception:
        # Deliberately broad: any failure of the fast path is retried with the
        # tolerant parser, which stitches extra commas back into Memo. Errors
        # that also affect the fallback (e.g. a missing file) still surface.
        df = _robust_read_csv(str(input_csv), encoding=encoding)

    # Final column check
    missing = [c for c in EXPECTED_COLS if c not in df.columns]
    if missing:
        raise ValueError("Missing required columns after parsing: {}".format(missing))

    # Parse UK dates. The raw column is replaced only after parsing succeeds,
    # so the error report shows the offending text rather than NaT.
    parsed = pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")
    unparsed = parsed.isna()
    if unparsed.any():
        bad = df.loc[unparsed, ["Number", "Date", "Account"]].head()
        raise ValueError(
            "Some Date values could not be parsed as dd/MM/YYYY. "
            "Example bad rows:\n{}".format(bad.to_string(index=False))
        )
    df["Date"] = parsed
    # A stable sort keeps same-day rows in their input order.
    df = df.sort_values("Date", ascending=True, kind="mergesort").reset_index(drop=True)

    # Extract 4-digit account prefix
    def get_prefix(acc):
        """
        Extract first four digits of the *account number* from strings like:
          "11-22-33 44445555"  -> "4444"
          "11 22 33   01234567"-> "0123"
          "Sort: 12-34-56 Acc: 98765432" -> "9876"
        Strategy: find the last contiguous run of digits in the string; require ≥4 digits.
        """
        s = "" if acc is None else str(acc)
        # Find all digit runs; pick the last one (expected to be the account number)
        m = list(re.finditer(r"\d+", s))
        if not m:
            raise ValueError("Account '{}' contains no digits.".format(acc))
        last_run = m[-1].group(0)
        if len(last_run) < 4:
            raise ValueError("Account '{}' does not have ≥4 digits in the account number.".format(acc))
        return last_run[:4]

    # Build the grouping keys. Without these columns the groupby below raises
    # KeyError, and the override / get_prefix would never take effect.
    if account_prefix_override:
        if not re.fullmatch(r"\d{4}", str(account_prefix_override)):
            raise ValueError("account_prefix_override must be exactly 4 digits.")
        df["_AcctPrefix"] = str(account_prefix_override)
    else:
        df["_AcctPrefix"] = df["Account"].apply(get_prefix)
    df["_Year"] = df["Date"].dt.year
    df["_MonthNum"] = df["Date"].dt.month

    written_paths = []
    # Group by account prefix + year + month number so that the files come out
    # in chronological rather than alphabetical month order.
    grouped = df.groupby(["_AcctPrefix", "_Year", "_MonthNum"], sort=True)
    for (acct4, year, month_num), g in grouped:
        mon_abbr = calendar.month_abbr[int(month_num)].upper()
        out_name = "BC_{}_{}{}.csv".format(acct4, mon_abbr, year)
        out_path = output_dir / out_name
        # Rows inside each group keep the date order established above.
        g[EXPECTED_COLS].to_csv(out_path, index=False, date_format="%d/%m/%Y")
        written_paths.append(out_path)

    if verbose:
        print("Wrote {} file(s) to '{}'".format(len(written_paths), output_dir))
    return written_paths


In [25]:
import pandas as pd
import re
import calendar
import csv
from pathlib import Path
from typing import List, Optional, Union

# Expected columns: used to validate the parsed input and, in this order,
# as the column layout of every output file.
EXPECTED_COLS = ["Number", "Date", "Account", "Amount", "Subcategory", "Memo"]

def _robust_read_csv(path, encoding="utf-8"):
    """
    Read a 'mostly CSV' file where some rows may contain extra commas (typically in Memo).

    Every kept row is normalized to ``len(EXPECTED_COLS)`` fields: short rows
    are padded with empty strings, and any fields beyond the last expected
    column are joined back into Memo with commas. Blank rows are skipped
    wherever they occur, including at the top of the file. The first non-blank
    row is dropped if, once stripped and title-cased, it equals EXPECTED_COLS
    (i.e. it is the header); otherwise it is kept as data.

    Args:
        path: File to read.
        encoding: Text encoding used to open the file.

    Returns:
        A DataFrame of strings with EXPECTED_COLS as its columns (empty if the
        file has no data rows).
    """
    width = len(EXPECTED_COLS)

    def _normalize(fields):
        # Pad a short row, or fold overflow fields back into the last column.
        if len(fields) < width:
            return fields + [""] * (width - len(fields))
        if len(fields) > width:
            return fields[: width - 1] + [",".join(fields[width - 1:])]
        return fields

    rows = []
    seen_first_row = False
    with open(path, "r", encoding=encoding, newline="") as f:
        for row in csv.reader(f):  # handles quotes properly if present
            if not seen_first_row and row:
                # A UTF-8 byte-order mark would otherwise stick to the first
                # cell and make a genuine header look like data.
                row[0] = row[0].lstrip("\ufeff")
            # Skip completely empty rows
            if all(cell.strip() == "" for cell in row):
                continue
            if not seen_first_row:
                seen_first_row = True
                if [cell.strip().title() for cell in row] == EXPECTED_COLS:
                    continue
            rows.append(_normalize(row))

    return pd.DataFrame(rows, columns=EXPECTED_COLS)

def get_prefix(acc):
    """
    Return the leading four digits of the account number held in ``acc``.

    The account number is taken to be the final unbroken run of digits in the
    text form of ``acc`` (``None`` is treated as an empty string), e.g.:
      "11-22-33 44445555"            -> "4444"
      "11 22 33   01234567"          -> "0123"
      "Sort: 12-34-56 Acc: 98765432" -> "9876"

    Raises ValueError when there are no digits at all, or when that final run
    is shorter than four digits.
    """
    text = str(acc) if acc is not None else ""
    digit_runs = re.findall(r"\d+", text)
    if len(digit_runs) == 0:
        raise ValueError("Account '{}' contains no digits.".format(acc))

    # Only the trailing run matters; earlier runs (e.g. a sort code) are ignored.
    *_, account_number = digit_runs
    if len(account_number) >= 4:
        return account_number[:4]
    raise ValueError("Account '{}' does not have ≥4 digits in the account number.".format(acc))

def split_transactions_by_month(
    input_csv: Union[str, Path],
    output_dir: Optional[Union[str, Path]] = None,
    account_prefix_override: Optional[str] = None,
    encoding: str = "utf-8",
    verbose: bool = True,
) -> List[Path]:
    """
    Split a transactions CSV into one CSV per account prefix and calendar month.

    The input must provide the columns
      Number, Date, Account, Amount, Subcategory, Memo
    where Date is in UK format (dd/MM/YYYY). Rows are sorted by ascending date
    and written to files named:
      BC_XXXX_MONYYYY.csv

    - XXXX: first four digits of the *account number* (last digit-run in Account field)
    - MON : three-letter month abbreviation in upper-case
    - YYYY: four-digit year

    Args:
        input_csv: Path of the CSV to split.
        output_dir: Folder for the monthly files; created if needed. Defaults
            to the folder containing ``input_csv``.
        account_prefix_override: If given, must be exactly 4 digits and is
            used as XXXX for every row instead of the Account column.
        encoding: Text encoding used to read ``input_csv``.
        verbose: When true, print how many files were written.

    Returns:
        The written paths, ordered by account prefix, then year, then month.

    Raises:
        ValueError: If required columns are missing after parsing, a Date
            cannot be parsed, the override is not 4 digits, or an Account
            value has no usable digit run.
    """
    input_csv = Path(input_csv)
    out_dir = Path(output_dir) if output_dir else input_csv.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1) Fast path with pandas; 2) Fallback to robust reader on error
    try:
        df = pd.read_csv(input_csv, dtype={"Account": str}, encoding=encoding, engine="c")
        df.columns = [c.strip().title() for c in df.columns]
        if any(col not in df.columns for col in EXPECTED_COLS):
            raise ValueError("Missing required columns in header; trying robust read…")
    except Exception:
        # Deliberately broad: any failure of the fast path is retried with the
        # tolerant parser. Errors that also affect the fallback (e.g. a
        # missing file) still surface from there.
        df = _robust_read_csv(str(input_csv), encoding=encoding)

    # Final column check
    missing = [c for c in EXPECTED_COLS if c not in df.columns]
    if missing:
        raise ValueError("Missing required columns after parsing: {}".format(missing))

    # Parse UK dates strictly. The raw column is replaced only after parsing
    # succeeds, so the error report shows the offending text rather than NaT.
    parsed = pd.to_datetime(df["Date"], format="%d/%m/%Y", errors="coerce")
    unparsed = parsed.isna()
    if unparsed.any():
        bad = df.loc[unparsed, ["Number", "Date", "Account"]].head()
        raise ValueError(
            "Some Date values could not be parsed as dd/MM/YYYY. "
            "Example bad rows:\n{}".format(bad.to_string(index=False))
        )
    df["Date"] = parsed
    # A stable sort keeps same-day rows in their input order.
    df = df.sort_values("Date", ascending=True, kind="mergesort").reset_index(drop=True)

    # Determine 4-digit prefix
    if account_prefix_override:
        if not re.fullmatch(r"\d{4}", str(account_prefix_override)):
            raise ValueError("account_prefix_override must be exactly 4 digits.")
        df["_AcctPrefix"] = str(account_prefix_override)
    else:
        df["_AcctPrefix"] = df["Account"].apply(get_prefix)

    # Year/Month for grouping and filenames
    df["_Year"] = df["Date"].dt.year
    df["_MonthNum"] = df["Date"].dt.month

    written_paths = []
    # Group by account prefix + year + month number to guarantee chronological order
    grouped = df.groupby(["_AcctPrefix", "_Year", "_MonthNum"], sort=True)
    for (acct4, year, month_num), g in grouped:
        mon_abbr = calendar.month_abbr[int(month_num)].upper()
        out_name = "BC_{}_{}{}.csv".format(acct4, mon_abbr, year)
        out_path = out_dir / out_name

        # Rows inside each group keep the date order established above, so no
        # second sort is needed here.
        g[EXPECTED_COLS].to_csv(out_path, index=False, date_format="%d/%m/%Y")
        written_paths.append(out_path)

    if verbose:
        print("Wrote {} file(s) to '{}'".format(len(written_paths), out_dir))
    return written_paths


In [28]:
# Split the 2025 download (6045_2025.csv) into monthly files, written to the
# same bank-download folder that holds the input.
input_path = "J://My Drive//NAS//My Documents//Business//Property//Statements//working//python//data//property//bank-download//6045_2025.csv"
monthly_output_dir = "J://My Drive//NAS//My Documents//Business//Property//Statements//working//python//data//property//bank-download"
split_transactions_by_month(input_path, output_dir=monthly_output_dir)

Wrote 10 file(s) to 'J:\My Drive\NAS\My Documents\Business\Property\Statements\working\python\data\property\bank-download'


[WindowsPath('J:/My Drive/NAS/My Documents/Business/Property/Statements/working/python/data/property/bank-download/BC_6045_JAN2025.csv'),
 WindowsPath('J:/My Drive/NAS/My Documents/Business/Property/Statements/working/python/data/property/bank-download/BC_6045_FEB2025.csv'),
 WindowsPath('J:/My Drive/NAS/My Documents/Business/Property/Statements/working/python/data/property/bank-download/BC_6045_MAR2025.csv'),
 WindowsPath('J:/My Drive/NAS/My Documents/Business/Property/Statements/working/python/data/property/bank-download/BC_6045_APR2025.csv'),
 WindowsPath('J:/My Drive/NAS/My Documents/Business/Property/Statements/working/python/data/property/bank-download/BC_6045_MAY2025.csv'),
 WindowsPath('J:/My Drive/NAS/My Documents/Business/Property/Statements/working/python/data/property/bank-download/BC_6045_JUN2025.csv'),
 WindowsPath('J:/My Drive/NAS/My Documents/Business/Property/Statements/working/python/data/property/bank-download/BC_6045_JUL2025.csv'),
 WindowsPath('J:/My Drive/NAS/My D

### Split big Barclays file by month

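- Input: the transactions CSV downloaded from Barclays (columns: Number, Date, Account, Amount, Subcategory, Memo).
- Output: one file per month named `BC_XXXX_MMMYYYY.csv`, where `XXXX` is the first four digits of the account number, `MMM` is the upper-case month abbreviation and `YYYY` is the year (e.g. `BC_6045_JAN2025.csv`).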