In [41]:
import sys
from pathlib import Path

# Path.cwd().parent depends on where you run this code from.
# Path(__file__).resolve().parents[1] depends on where this file is located on disk.
#
# project_root = .../1099_reconciliation_pipeline
# Running from the notebooks/ folder: cwd is the current working directory and
# .parent is the folder above it.
project_root = Path.cwd().parent

# sys.path is the list of folders where Python looks for modules. Add the
# project root only when it is not already listed, so re-running this cell in
# the same kernel does not pile up duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(project_root)

/Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline


'--------------------------------  Test Cleaned DataFrame  --------------------------------'

'-----------------------  Relius Test  -----------------------'

In [42]:
from src import load_data
from src.clean_relius import clean_relius
from src.config import RAW_DATA_DIR

# Raw Relius export inside the project's raw-data folder
relius_path = RAW_DATA_DIR / "real_relius_2025.xlsx"

# Load the workbook from that explicit path, then pass the raw result through
# the Relius cleaning function
relius_raw = load_data.load_relius_excel(
    path=relius_path,
    use_sample_if_none=False,
)
relius_clean = clean_relius(relius_raw)

# DataFrame.shape is a (number of rows, number of columns) tuple, e.g. (1000, 15).
# Showing raw and cleaned shapes together makes the effect of cleaning visible.
(relius_raw.shape, relius_clean.shape)

((253, 374), (189, 12))

Notes:
- Before vs. after the cleaning process:
    - Rows: 253 before, 189 after; rows without key information and duplicate rows were removed.
    - Columns: 374 before, 12 after; only the core columns we need for our matching and correction file are kept.
- After the cleaning process, all columns were renamed and values normalized to keep the same format between Relius and Matrix, see below:

In [43]:
# Preview the cleaned and normalized Relius data: first 5 rows, which is also
# what .head() returns when no count is given
relius_clean.head(n=5)

Unnamed: 0,plan_id,ssn,first_name,last_name,state,gross_amt,exported_date,tax_year,dist_code_1,dist_name,dist_category_relius,full_name
0,300004MBD,194420270,David,Symons,PA,3347.62,2025-06-11,0,4,RMD ACH,rmd,David Symons
1,300004MBD,197323440,James,McNamee,PA,12623.05,2025-06-11,0,4,RMD ACH,rmd,James McNamee
2,300004MBD,194420270,David,Symons,PA,2652.38,2025-06-13,0,4,Partial Liquidation Gross ACH,partial_cash,David Symons
4,300004MBD,197526965,David,Kich,PA,3000.0,2025-06-17,0,4,Partial Liquidation Gross ACH,partial_cash,David Kich
6,300004MBD,197526965,David,Kich,PA,3000.0,2025-07-03,0,4,Recurring ACH,partial_cash,David Kich


In [44]:
# value_counts() gives a Series with the number of rows per distinct
# "dist_name" value, most frequent first; head(20) limits the display to at
# most 20 of them
(
    relius_clean["dist_name"]
    .value_counts()
    .head(20)
)

dist_name
Recurring ACH                     90
Partial Liquidation Net ACH       27
RMD ACH                           19
Recurring Check Net               15
Partial Liquidation Gross ACH     13
RMD Check Net                      8
RMD Check Gross                    4
Partial Liquidation  Net Check     4
Rollover                           3
Partial Liquidate  Gross Check     3
Full Liquidation                   2
Partial Rollover - Net             1
Name: count, dtype: int64

In [45]:
# Number of cleaned Relius rows in each "dist_category_relius" value
(
    relius_clean["dist_category_relius"]
    .value_counts()
)

dist_category_relius
partial_cash        152
rmd                  31
rollover              3
final_cash            2
partial_rollover      1
Name: count, dtype: int64

Notes:
- The values in "dist_name" can be grouped into a few categories, so that a different process flow can be established for each category type.
    - The counts above show those categories, which are stored in the new "dist_category_relius" Series (column).

'-----------------------  Matrix Test  -----------------------'

In [46]:
from src.clean_matrix import clean_matrix

# Raw Matrix export inside the project's raw-data folder
matrix_path = RAW_DATA_DIR / "real_matrix_2025.xlsx"

# Load the workbook from that explicit path, then pass the raw result through
# the Matrix cleaning function
matrix_raw = load_data.load_matrix_excel(
    path=matrix_path,
    use_sample_if_none=False,
)
matrix_clean = clean_matrix(matrix_raw)

# DataFrame.shape is a (number of rows, number of columns) tuple, e.g. (1000, 15).
# Showing raw and cleaned shapes together makes the effect of cleaning visible.
(matrix_raw.shape, matrix_clean.shape)

  return pd.to_datetime(series, errors="coerce").dt.date


((60972, 56), (6980, 14))

Notes:
- Before vs. after the cleaning process:
    - Rows: 60,972 before, 6,980 after; mostly removing rows that we don't need for our matching process (other Matrix account types, transaction types, etc.).
    - Columns: 56 before, 14 after; only the core columns we need for our matching and correction file are kept.
- After the cleaning process, all columns were renamed and values normalized to keep the same format between Relius and Matrix, see below:

In [47]:
# Preview the cleaned and normalized Matrix data: first 10 rows
# (.head() without an argument would show only 5)
matrix_clean.head(n=10)

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,txn_date,txn_method,tax_code_1,tax_code_2,tax_form,dist_type,transaction_id,matrix_account,partipant_name
29932,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,2025-11-04,ACH Distribution,7,,1099-R,,44178108,07P6LM3M,Sandra Marsh
29933,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,2025-11-04,ACH Distribution,7,,1099-R,,44178107,07P6LM3M,Kathryn Clausen
29934,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,2025-10-16,ACH Distribution,7,,1099-R,,43696486,07P6LM3M,Sandra Marsh
29936,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,2025-10-03,ACH Distribution,7,,1099-R,,43237695,07P6LM3M,Kathryn Clausen
29938,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,2025-09-03,ACH Distribution,7,,1099-R,,42348013,07P6LM3M,Sandra Marsh
29939,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,2025-09-03,ACH Distribution,7,,1099-R,,42348012,07P6LM3M,Kathryn Clausen
29941,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,2025-08-05,ACH Distribution,7,,1099-R,,41521055,07P6LM3M,Sandra Marsh
29942,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,2025-08-05,ACH Distribution,7,,1099-R,,41521056,07P6LM3M,Kathryn Clausen
29944,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,2025-07-03,ACH Distribution,7,,1099-R,,40587387,07P6LM3M,Sandra Marsh
29945,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,2025-07-03,ACH Distribution,7,,1099-R,,40587386,07P6LM3M,Kathryn Clausen


Notes:
- After running the DataFrame for the first time: SSN was 10 digits and tax_code_1 was '7NORMALDISTRIBUTION' instead of just '7' or 'G'
- These fields were not normalized correctly by the logic reused from the clean_relius module, so some additional logic was needed:
    - The Matrix SSN value is read from Excel as a float, so SSN 111223344 was stored as 111223344.0 in our raw DataFrame, which gave 1112233440
        - 1) implemented str conversion from float
        - 2) extract the first 9 digits (safe when the stored value has all 9 digits; TODO confirm how an SSN with a leading zero, which loses that zero when stored as a float, is handled)
    - Matrix tax code values look like '7 - Normal Distribution' or 'G - Rollover', so a different, regex-based logic than the one in clean_relius was applied:
        - 1) remove any 'CODE'/'code' prefix before the code character, e.g. 'CODE 7'.
        - 2) grab the alphanumeric characters that follow, starting with the code itself ('7' or 'G').
        - 3) extract the first character (safe since the tax code is always 1 character)
    - Matrix transaction_id is also read from Excel as a float, so we were getting an extra digit that needed to be removed
        - 1) implemented str conversion from float: '44556677.0'
        - 2) remove all non-numeric characters ('.') -> '445566770'
        - 3) capture all digits before the final '0' -> '44556677'

In [48]:
# Experiments with re.search() and .group() to solve transaction_id normalization
import re

# Stand-in for a transaction_id that arrived as a float
value = 12345.0

# str(12345.0) is '12345.0'; replacing every non-digit character with nothing
# leaves '123450'
text = re.sub(r"\D", "", str(value).strip())

# Capture every digit that comes before the final '0' of the string.
# NOTE(review): this relies on the text ending in the '0' left over from '.0';
# a value without that suffix would lose a real trailing zero or not match at all.
m = re.compile(r"(\d+)0$").search(text)

# .group() returns the full match or a parenthesised sub-group:
#   m.group(0) -> '123450'  (whole match)
#   m.group(1) -> '12345'   (first group, the value we want)
#   m.group(2) -> raises IndexError: no such group
m.group(1)


'12345'

Research:
- Needed to strengthen my knowledge of regex with re.search() and .group():
    - Example: extract parts of '12345-ABCD'
        1) m = re.search(r"(\d+)-([A-Z]+)", "12345-ABCD")
        2) m.group(0) -> "12345-ABCD"
        3) m.group(1) -> "12345"
        4) m.group(2) -> "ABCD"
    - Parentheses are what define groups.

In [49]:
# Inspect the distinct "tax_code_1" values and how many rows carry each one,
# to confirm the format of the normalized codes
(
    matrix_clean["tax_code_1"]
    .value_counts()
)

tax_code_1
7    5087
G     833
1     410
2     309
B     140
4      94
H      93
3       3
D       1
Name: count, dtype: int64

Notes:
- After logic was updated tax_code_1 was normalized correctly, so it shows the exact codes we need.

In [50]:
# Check that unwanted accounts are gone: row counts per "matrix_account",
# limited to the 20 most frequent values
(
    matrix_clean["matrix_account"]
    .value_counts()
    .head(20)
)

matrix_account
07P6LM97    455
07P6LM4K    386
07P6LM4C    126
07P6LM3T    124
07P6LM6D    104
07P6LM49    102
07P6LM5G     95
07P6LM3X     87
07P6LMC7     78
07P6LM9R     75
07P6LNYW     69
07P6LM45     69
07P6LM54     65
07P6LM65     55
07P6LM46     42
07P6LM6P     42
07P6LM9V     41
07P6LM9M     39
07P6LM5D     38
07P6LM3Z     38
Name: count, dtype: int64

Notes:
- Verified: the cleaned DataFrame no longer shows the matrix_account values we don't need in our matching and correction process.

In [51]:
# Check that unwanted transaction types are gone: row counts per "txn_method",
# limited to at most 20 values
(
    matrix_clean["txn_method"]
    .value_counts()
    .head(20)
)

txn_method
ACH Distribution      5450
Check Distribution    1421
Tax Record Only         56
Check Reissue           36
Wire Distribution       17
Name: count, dtype: int64

Notes:
- Verified: the cleaned DataFrame no longer shows the txn_method values we don't need in our matching and correction process.