In [None]:
# Cell 1 - Tell python where to find the modules to import from

import sys
from pathlib import Path

#Path.cwd().parent depends on where you run this code from
#Path(__file__).resolve().parents[1] depends on where this file is located on disk
#
#project_root = .../1099-reconciliation-pipeline if notebook runs from the repo root or from notebooks/
cwd = Path.cwd()
project_root = cwd if (cwd / "src").exists() else cwd.parent # running from notebooks/  folder (cwd = current working directory) - .parent gets us to the folder above
sys.path.append(str(project_root)) #sys.path is a list of folders where python looks for modules
                                    # we add the project root to that list with .append

print("Project root: ", project_root)

/Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline


'--------------------------------  Test Load Data and Clean DataFrames  --------------------------------'

In [2]:
# Cell 2 — Imports, Load & Clean inputs (real paths)

from src import load_data
from src.clean_matrix import clean_matrix
from src.clean_relius_demo import clean_relius_demo
from src.age_taxcode_analysis import run_age_taxcode_analysis
from src.build_correction_file import build_correction_dataframe, write_correction_file

from src.config import RAW_DATA_DIR

relius_demo_path = RAW_DATA_DIR / "real_demo_relius_2025.xlsx"
matrix_path = RAW_DATA_DIR / "real_all_matrix_2025.xlsx"

# Load  and clean Matrix raw data
matrix_raw = load_data.load_matrix_excel(path=matrix_path, use_sample_if_none=False)
matrix_clean = clean_matrix(matrix_raw)

# Load and clean Relius demo raw data
relius_demo_raw = load_data.load_relius_demo_excel(path=relius_demo_path, use_sample_if_none=False)
relius_demo_clean = clean_relius_demo(relius_demo_raw)


#.shape is an attribute of pandas DataFrames that returns a tuple of
# (number of rows, number of columns) - e.g. (1000, 15)
print("\n")
print(f"Matrix Cleaned DataFrame shape: {matrix_clean.shape}")
print(f"Relius Demo Cleaned DataFrame shape: {relius_demo_clean.shape}")

  - Find the first numeric digits '\d' before the ending 0 and return it
  matrix_clean = clean_matrix(matrix_raw)
  return pd.to_datetime(series, errors=errors, format=format, dayfirst=dayfirst).dt.date




Matrix Cleaned DataFrame shape: (6980, 21)
Relius Demo Cleaned DataFrame shape: (61584, 6)


In [3]:
# Cell 3 — Review Dtype per columns and forst 10 ros of cleaned DataFrame

print(relius_demo_clean.info())
relius_demo_clean.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 61584 entries, 2110 to 58800
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   plan_id     61584 non-null  string
 1   ssn         61584 non-null  string
 2   first_name  61584 non-null  string
 3   last_name   61584 non-null  string
 4   dob         61527 non-null  object
 5   term_date   12358 non-null  object
dtypes: object(2), string(4)
memory usage: 3.3+ MB
None


Unnamed: 0,plan_id,ssn,first_name,last_name,dob,term_date
2110,100MBD,113741850,Donald,Mangan,1988-11-12,NaT
2197,100MBD,116648243,Deborah,Mosloskie,1966-09-26,NaT
3595,100MBD,145609822,Beth,Conley,1968-03-22,NaT
4751,100MBD,159460981,Paul J,Petrosky,1952-04-12,2013-09-01
5327,100MBD,159667416,William,Delaney,1969-06-21,NaT
6153,100MBD,160643429,Diana,Hanobeck,1973-07-25,NaT
6270,100MBD,160667435,Christina,Miller,1985-06-16,NaT
7795,100MBD,162581348,Yvonne,Benedict,1961-11-12,NaT
7958,100MBD,162642206,Melanie,Ellixson,1975-12-08,NaT
8886,100MBD,163607906,Richard,Borcky,1963-06-18,NaT


'--------------------------------  Test Merging DataFrames and Analysis Engine --------------------------------'

In [4]:
# Cell 4 — Run Merging and Analysis Engine

age_matches = run_age_taxcode_analysis(matrix_clean, relius_demo_clean)

age_matches.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6693 entries, 0 to 6976
Data columns (total 43 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   plan_id                         6693 non-null   string 
 1   ssn                             6693 non-null   string 
 2   participant_name                6687 non-null   object 
 3   state                           6654 non-null   string 
 4   gross_amt                       6693 non-null   float64
 5   fed_taxable_amt                 6693 non-null   float64
 6   txn_date                        6693 non-null   object 
 7   txn_method                      6693 non-null   string 
 8   tax_code_1                      6684 non-null   string 
 9   tax_code_2                      17 non-null     string 
 10  tax_form                        6693 non-null   object 
 11  dist_type                       243 non-null    string 
 12  roth_initial_contribution_year  0 non-n

In [5]:
# Cell 5 — Review tax codes in Series (1 or 2 digits)

age_matches[["tax_code_1"]].value_counts()

tax_code_1
7             5084
G              830
2              309
11             223
1              122
4               93
15              10
17               4
16               3
3                2
13               1
18               1
33               1
D                1
Name: count, dtype: int64

In [6]:
# Cell 6 — Review dtype of age at distribution Series and first at merged and analized DataFrame

print("age_at_distribution_year dtype: ", age_matches["age_at_distribution_year"].dtype)
age_matches.head(10)

age_at_distribution_year dtype:  Float64


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_txn_year,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2
0,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,153.22,2025-11-04,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
1,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,200.0,2025-11-04,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
2,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,153.22,2025-10-16,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
3,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,200.0,2025-10-03,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
4,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,153.22,2025-09-03,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
5,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,200.0,2025-09-03,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
6,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,153.22,2025-08-05,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
7,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,200.0,2025-08-05,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
8,IRA127PLAT,194362032,Sandra Marsh,PA,153.22,153.22,2025-07-03,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,
9,IRA127PLAT,209501562,Kathryn Clausen,PA,200.0,200.0,2025-07-03,ACH Distribution,7,,...,True,False,7,,age_59_5_or_over_normal_distribution,,perfect_match,True,7,


In [7]:
# Cell 7 — Test no Roth plans in merged DataFrame

non_roth_test = age_matches[age_matches["plan_id"].str.startswith("300005") | age_matches["plan_id"].str.endswith("R")]
assert non_roth_test.empty, "WE HAVE ROTH DISBURSEMENTS!"

print("No Roth Disbursements in DataFrame!")

non_roth_test.head()

No Roth Disbursements in DataFrame!


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_txn_year,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2


In [8]:
# Cell 8 — Test rows with G tax code are excluded from analysis engine

rollover_test = age_matches[age_matches["tax_code_1"].eq("G")]
rollover_test.head()

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_txn_year,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2
20,IRA29PLAT,193361668,Robert Klemow,PA,218.92,0.0,2025-11-04,Check Distribution,G,,...,True,True,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
40,IRA29PLAT,161389368,Letitia Leitzel,FL,11.27,0.0,2025-10-06,Check Distribution,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
73,IRA29PLAT,182601810,Thomas Richards,PA,200000.0,0.0,2025-09-02,Check Distribution,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
93,IRA29PLAT,161389368,Letitia Leitzel,FL,15749.86,0.0,2025-07-31,Check Distribution,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
155,IRA32PLAT,181466683,Martha Laux,PA,177377.67,0.0,2025-08-13,Check Reissue,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,


In [9]:
# Cell 9 — Review rows that match status is not 'perfect_match'

logic_test = age_matches[age_matches["match_status"] != "perfect_match"]

logic_test.head(15)

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_txn_year,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2
20,IRA29PLAT,193361668,Robert Klemow,PA,218.92,0.0,2025-11-04,Check Distribution,G,,...,True,True,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
40,IRA29PLAT,161389368,Letitia Leitzel,FL,11.27,0.0,2025-10-06,Check Distribution,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
73,IRA29PLAT,182601810,Thomas Richards,PA,200000.0,0.0,2025-09-02,Check Distribution,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
93,IRA29PLAT,161389368,Letitia Leitzel,FL,15749.86,0.0,2025-07-31,Check Distribution,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
94,IRA29PLAT,168669091,Jared Fielder,VA,1013.52,1013.52,2025-07-30,ACH Distribution,4,,...,False,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
95,IRA29PLAT,177649944,Brandon Fiedler,PA,1013.53,1013.53,2025-07-30,ACH Distribution,4,,...,False,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
96,IRA29PLAT,185503634,Debra Garman,PA,2027.05,2027.05,2025-07-30,ACH Distribution,4,,...,False,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
155,IRA32PLAT,181466683,Martha Laux,PA,177377.67,0.0,2025-08-13,Check Reissue,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
160,IRA32PLAT,181466683,Martha Laux,PA,177377.67,0.0,2025-07-28,Check Distribution,G,,...,True,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,
161,IRA32PLAT,202621585,Tonilee Rapp,PA,2117.65,2117.65,2025-07-03,ACH Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,


In [10]:
# Cell 10 — Review values and their counts in 'match_status'

age_matches["match_status"].value_counts()

match_status
perfect_match                                     5032
excluded_from_age_engine_rollover_or_inherited    1298
match_needs_correction                             290
age_rule_insufficient_data                          73
Name: count, dtype: int64

In [11]:
# Cell 11 — Review values and their counts in 'match_status' per unique SSN

filter_df = age_matches.drop_duplicates(subset="ssn")
print(filter_df["match_status"].value_counts())

filter_df[filter_df["match_status"] == "age_rule_insufficient_data"].head(15)

match_status
perfect_match                                     2115
excluded_from_age_engine_rollover_or_inherited    1009
match_needs_correction                              95
age_rule_insufficient_data                          30
Name: count, dtype: int64


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_txn_year,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2
161,IRA32PLAT,202621585,Tonilee Rapp,PA,2117.65,2117.65,2025-07-03,ACH Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
244,IRA109PLAT,131644147,Planned Parenthood Federation,NY,1000.0,1000.0,2025-08-11,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
304,IRA68PLAT,111111111,Food Bank Of Delaware,DE,600.0,600.0,2025-08-26,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
494,IRA147PLAT,251892186,Albert Gallatin Ed Found,PA,5000.0,5000.0,2025-09-04,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
735,300001MBD,232169885,Lycoming Valley Baptist Church,PA,100.0,100.0,2025-11-06,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
736,300001MBD,222856783,The Christian Gospel Fellowshi,PA,100.0,100.0,2025-11-06,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
737,300001MBD,205940449,I-TEC 23,PA,200.0,200.0,2025-11-06,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
738,300001MBD,232090782,Community Baptist Church of Mo,PA,500.0,500.0,2025-11-06,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
739,300001MBD,251458329,Eldred First Church of God,PA,740.0,740.0,2025-11-06,Check Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,
1107,300001MBD,160707297,Gerald Richardson,PA,6666.67,6666.67,2025-06-11,ACH Distribution,7,,...,False,False,,,,,age_rule_insufficient_data,False,,


In [12]:
# Cell 12 — Review rows with 'match_needs_correction' status

age_taxcode = age_matches[age_matches["match_status"].eq("match_needs_correction")]

print(f" CORRECTION rows: {len(age_taxcode)}")
age_taxcode.head(15)

 CORRECTION rows: 290


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_txn_year,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2
240,IRA109PLAT,201407287,Paul Pfanders,PA,19480.52,0.0,2025-08-20,ACH Distribution,,,...,True,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,
743,300001MBD,195621676,Daun Boyle,PA,15000.0,15000.0,2025-11-05,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
885,300001MBD,206401373,Frances Vitovsky,PA,12639.65,0.0,2025-09-18,ACH Distribution,,,...,True,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,
938,300001MBD,195621676,Daun Boyle,PA,5000.0,5000.0,2025-09-03,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1000,300001MBD,167523906,Paul O'Hara III,PA,2750.0,2750.0,2025-08-01,ACH Distribution,3.0,,...,True,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,
1158,300003MBDII,221847676,Maria Ayala-McDonald,DE,653.6,653.6,2025-07-23,Wire Distribution,7.0,,...,False,False,1,,no_term_date_under_55_in_txn_year,UPDATE_1099,match_needs_correction,False,1,
1159,300003MBDII,221847676,Maria Ayala-McDonald,DE,11111.11,11111.11,2025-07-08,ACH Distribution,7.0,,...,False,False,1,,no_term_date_under_55_in_txn_year,UPDATE_1099,match_needs_correction,False,1,
1169,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-11-06,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1180,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-10-08,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1198,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-09-03,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,


In [13]:
# Cell 13 — Review analysis and logic engine for rows were age at distribution >= 54 years old

age_taxcode[age_taxcode["age_at_distribution_year"].ge(54.0)].head(15)

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_txn_year,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2
240,IRA109PLAT,201407287,Paul Pfanders,PA,19480.52,0.0,2025-08-20,ACH Distribution,,,...,True,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,
743,300001MBD,195621676,Daun Boyle,PA,15000.0,15000.0,2025-11-05,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
885,300001MBD,206401373,Frances Vitovsky,PA,12639.65,0.0,2025-09-18,ACH Distribution,,,...,True,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,
938,300001MBD,195621676,Daun Boyle,PA,5000.0,5000.0,2025-09-03,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1000,300001MBD,167523906,Paul O'Hara III,PA,2750.0,2750.0,2025-08-01,ACH Distribution,3.0,,...,True,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,
1169,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-11-06,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1180,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-10-08,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1198,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-09-03,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1201,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-08-08,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,
1209,IRA66PLAT,192644776,Constance Testa,PA,1625.0,1625.0,2025-08-05,ACH Distribution,1.0,,...,True,False,2,,no_term_date_55_plus_in_txn_year,UPDATE_1099,match_needs_correction,False,2,


'--------------------------------  Test Quick Export to Excel File --------------------------------'

In [None]:
# Cell 14 — Use quick report export for manual DataFrame output to Excel for stakeholders

from src.export_utils import write_df_excel

path = write_df_excel(age_taxcode, filename_prefix="export_trad_distribs")

print(f"Export was successful!\nFile path: {path}")

Export was successful!
File path: /Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline/reports/outputs/export_trad_distribs_20251219_155827.xlsx


'--------------------------------  Test Official Build/Export to Excel Correction File --------------------------------'

In [None]:
# Cell 15 — Use build_correction_file module to build the 'official' correction file

#Build Excel correction file (same build as inherited corrections engine)
age_correction_df = build_correction_dataframe(age_matches)

age_correction_df.shape

(326, 12)

Notes:
- Currently the age_correction DataFrame is finding all discrepancy between G and codes 1, 2 or G.
    - Age correction analysis is working, however code G is not based on age but in distribution type (G -> Rollovers).
    - We need to filter out distributions that are G (Traditional rollover) and H (Roth rollover) and exclude plans that are inherited since these are always code 4.
- Some tax codes are two digits (like '11'), I need to change the logi to extract 1 or two tax code digits.

In [None]:
# Cell 16 — Use build_correction_file module to create/write/export the 'official' correction file

# 2) Write it to Excel with an auto-generated timestamped name
output_path = write_correction_file(age_correction_df)

# Run write_correction_file() function
output_path

PosixPath('/Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline/reports/samples/correction_file_20251219_095247.xlsx')