In [1]:
import sys
from pathlib import Path

# Locate the project root (the folder that holds the `src/` package) so the
# `from src import ...` statements in later cells can be resolved.
#
# Path.cwd() depends on where this code is run from (whereas
# Path(__file__).resolve().parents[1] would depend on where a file sits on disk),
# so accept the two usual launch locations:
#   * notebooks/    -> the project root is the folder above (cwd.parent)
#   * project root  -> the project root is the cwd itself
# When neither folder contains `src/`, fall back to cwd.parent.
#
# project_root = .../1099_reconciliation_pipeline
cwd = Path.cwd()  # cwd = current working directory
project_root = next(
    (folder for folder in (cwd.parent, cwd) if (folder / "src").is_dir()),
    cwd.parent,
)

# sys.path is the list of folders where Python looks for modules. Add the
# project root only when it is missing, so re-running this cell does not keep
# appending duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

print(project_root)

/Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline


'--------------------------------  RELIUS TEST  --------------------------------'

In [2]:
from src import load_data
from src.clean_relius import clean_relius
from src.config import RAW_DATA_DIR

# Read the Relius workbook from the raw-data folder, then pass the raw frame
# through clean_relius().
relius_path = RAW_DATA_DIR / "real_relius_2025.xlsx"
relius_raw = load_data.load_relius_excel(
    path=relius_path,
    use_sample_if_none=False,
)
relius_clean = clean_relius(relius_raw)

# DataFrame.shape is a (number of rows, number of columns) tuple, e.g. (1000, 15).
# Display the raw and cleaned shapes side by side.
(relius_raw.shape, relius_clean.shape)

((253, 374), (189, 12))

In [3]:
# Peek at the cleaned and normalized Relius data (head() shows the top 5 rows by default).
relius_clean.head(n=5)

Unnamed: 0,plan_id,ssn,first_name,last_name,state,gross_amt,exported_date,tax_year,dist_code_1,dist_name,dist_category_relius,full_name
0,300004MBD,194420270,David,Symons,PA,3347.62,2025-06-11,0,4,RMD ACH,rmd,David Symons
1,300004MBD,197323440,James,McNamee,PA,12623.05,2025-06-11,0,4,RMD ACH,rmd,James McNamee
2,300004MBD,194420270,David,Symons,PA,2652.38,2025-06-13,0,4,Partial Liquidation Gross ACH,partial_cash,David Symons
4,300004MBD,197526965,David,Kich,PA,3000.0,2025-06-17,0,4,Partial Liquidation Gross ACH,partial_cash,David Kich
6,300004MBD,197526965,David,Kich,PA,3000.0,2025-07-03,0,4,Recurring ACH,partial_cash,David Kich


In [4]:
# value_counts() returns a Series with the count of each unique value, sorted
# in descending order; head(20) keeps at most the first 20 of them.
(
    relius_clean["dist_name"]
    .value_counts()
    .head(20)
)

dist_name
Recurring ACH                     90
Partial Liquidation Net ACH       27
RMD ACH                           19
Recurring Check Net               15
Partial Liquidation Gross ACH     13
RMD Check Net                      8
RMD Check Gross                    4
Partial Liquidation  Net Check     4
Rollover                           3
Partial Liquidate  Gross Check     3
Full Liquidation                   2
Partial Rollover - Net             1
Name: count, dtype: int64

In [5]:
# Number of cleaned Relius rows per dist_category_relius value.
(
    relius_clean["dist_category_relius"]
    .value_counts()
)

dist_category_relius
partial_cash        152
rmd                  31
rollover              3
final_cash            2
partial_rollover      1
Name: count, dtype: int64

'--------------------------------  MATRIX TEST  --------------------------------'

In [6]:
from src.clean_matrix import clean_matrix

# Read the Matrix workbook from the raw-data folder, then pass the raw frame
# through clean_matrix().
matrix_path = RAW_DATA_DIR / "real_matrix_2025.xlsx"
matrix_raw = load_data.load_matrix_excel(
    path=matrix_path,
    use_sample_if_none=False,
)
matrix_clean = clean_matrix(matrix_raw)

# DataFrame.shape is a (number of rows, number of columns) tuple, e.g. (1000, 15).
# Display the raw and cleaned shapes side by side.
(matrix_raw.shape, matrix_clean.shape)

  return pd.to_datetime(series, errors="coerce").dt.date


((60972, 56), (6980, 14))

In [9]:
# Peek at the cleaned and normalized Matrix data (head() shows the top 5 rows by default).
matrix_clean.head(n=5)

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,txn_date,txn_method,tax_code_1,tax_code_2,tax_form,dist_type,transaction_id,matrix_account,partipant_name
29932,IRA127PLAT,1943620320,Sandra Marsh,PA,153.22,2025-11-04,ACH Distribution,7NORMALDISTRIBUTION,NAN,1099-R,,44178108.0,07P6LM3M,Sandra Marsh
29933,IRA127PLAT,2095015620,Kathryn Clausen,PA,200.0,2025-11-04,ACH Distribution,7NORMALDISTRIBUTION,NAN,1099-R,,44178107.0,07P6LM3M,Kathryn Clausen
29934,IRA127PLAT,1943620320,Sandra Marsh,PA,153.22,2025-10-16,ACH Distribution,7NORMALDISTRIBUTION,NAN,1099-R,,43696486.0,07P6LM3M,Sandra Marsh
29936,IRA127PLAT,2095015620,Kathryn Clausen,PA,200.0,2025-10-03,ACH Distribution,7NORMALDISTRIBUTION,NAN,1099-R,,43237695.0,07P6LM3M,Kathryn Clausen
29938,IRA127PLAT,1943620320,Sandra Marsh,PA,153.22,2025-09-03,ACH Distribution,7NORMALDISTRIBUTION,NAN,1099-R,,42348013.0,07P6LM3M,Sandra Marsh


In [7]:
# Visual check that unwanted accounts are gone: list up to the 50 most
# frequent matrix_account values with their row counts.
(
    matrix_clean["matrix_account"]
    .value_counts()
    .head(50)
)

matrix_account
07P6LM97    455
07P6LM4K    386
07P6LM4C    126
07P6LM3T    124
07P6LM6D    104
07P6LM49    102
07P6LM5G     95
07P6LM3X     87
07P6LMC7     78
07P6LM9R     75
07P6LNYW     69
07P6LM45     69
07P6LM54     65
07P6LM65     55
07P6LM46     42
07P6LM6P     42
07P6LM9V     41
07P6LM9M     39
07P6LM5D     38
07P6LM3Z     38
07P6LM3V     38
07P6LM68     37
07P6LMC9     36
07P6LMM2     34
07P6LQJM     33
07P6LN9D     32
07P6LN24     32
07P6LPCV     31
07P6LQQT     31
07P6LMYL     30
07P6LM4M     29
07P6LM9X     29
07P6LM8N     26
07P6LQVW     26
07P6LNG4     25
07P6LQKQ     25
07P6LMCC     25
07P6LNN8     24
07P6LP9L     24
07P6LQ4L     24
07P6LQ2V     22
07P6LQQZ     22
07P6LPH9     22
07P6LM3W     22
07P6LQXM     21
07P6LPM4     21
07P6LNJV     21
07P6LNMD     21
07P6LNFG     21
07P6LPG4     21
Name: count, dtype: int64

In [8]:
# Visual check that unwanted txn types are gone: list up to the 20 most
# frequent txn_method values with their row counts.
(
    matrix_clean["txn_method"]
    .value_counts()
    .head(20)
)

txn_method
ACH Distribution      5450
Check Distribution    1421
Tax Record Only         56
Check Reissue           36
Wire Distribution       17
Name: count, dtype: int64