In [1]:
# Cell 1 - Tell Python where to find the project modules to import from

import sys
from pathlib import Path

# Path.cwd() depends on where this notebook is run from, whereas
# Path(__file__).resolve().parents[1] would depend on where the file lives on disk.
# The project root (.../1099-reconciliation-pipeline) is therefore detected from
# the presence of a "src" folder:
#   - run from the repo root  -> cwd already contains src/
#   - run from notebooks/     -> the repo root is one level up (cwd.parent)
cwd = Path.cwd()
project_root = cwd if (cwd / "src").exists() else cwd.parent

# sys.path is the list of folders Python searches for importable modules.
# Append the project root only when it is missing, so that re-running this
# cell does not pile up duplicate entries.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

print("Project root: ", project_root)

Project root:  /Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline


'--------------------------------  Test Load Data and Clean DataFrames  --------------------------------'

In [None]:
# Cell 2 — Imports, Load & Clean inputs (real paths)

from src.core import load_data
from src.cleaning.clean_matrix import clean_matrix
from src.cleaning.clean_relius_demo import clean_relius_demo
from src.engines.age_taxcode_analysis import run_age_taxcode_analysis
from src.outputs.build_correction_file import build_correction_dataframe, write_correction_file

from src.config import RAW_DATA_DIR, USE_SAMPLE_DATA_DEFAULT


# In sample mode both loaders receive path=None; otherwise they are pointed at
# the real workbooks under RAW_DATA_DIR.
use_real_files = not USE_SAMPLE_DATA_DEFAULT
matrix_path = RAW_DATA_DIR / "real_all_matrix_2025.xlsx" if use_real_files else None
relius_demo_path = RAW_DATA_DIR / "real_demo_relius_2025.xlsx" if use_real_files else None

# Matrix: raw load, then clean
matrix_raw = load_data.load_matrix_excel(path=matrix_path)
matrix_clean = clean_matrix(matrix_raw)

# Relius demo: raw load, then clean
relius_demo_raw = load_data.load_relius_demo_excel(path=relius_demo_path)
relius_demo_clean = clean_relius_demo(relius_demo_raw)

# DataFrame.shape is a (row_count, column_count) tuple, e.g. (1000, 15)
print("\n")
print(f"Matrix Cleaned DataFrame shape: {matrix_clean.shape}")
print(f"Relius Demo Cleaned DataFrame shape: {relius_demo_clean.shape}")



Matrix Cleaned DataFrame shape: (86, 21)
Relius Demo Cleaned DataFrame shape: (105, 11)


In [3]:
# Cell 3 — Review dtype per column and the first 10 rows of the cleaned DataFrame

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
relius_demo_clean.info()
relius_demo_clean.head(10)

<class 'pandas.core.frame.DataFrame'>
Index: 105 entries, 17 to 66
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   plan_id            105 non-null    string 
 1   ssn                105 non-null    string 
 2   first_name         105 non-null    string 
 3   last_name          105 non-null    string 
 4   dob                103 non-null    object 
 5   term_date          78 non-null     object 
 6   ssn_valid          105 non-null    boolean
 7   amount_valid       0 non-null      boolean
 8   date_valid         78 non-null     boolean
 9   code_1099r_valid   0 non-null      boolean
 10  validation_issues  105 non-null    object 
dtypes: boolean(4), object(3), string(4)
memory usage: 7.4+ KB
None


Unnamed: 0,plan_id,ssn,first_name,last_name,dob,term_date,ssn_valid,amount_valid,date_valid,code_1099r_valid,validation_issues
17,300004MBD,191518845,Timothy,Cohen,1980-12-09,2023-08-15,True,,True,,[]
29,300004MBD,367249438,Stephanie,Carter,1999-10-20,2014-09-14,True,,True,,[]
65,300004MBD,440573705,Robert,Daniels,1950-06-24,NaT,True,,,,[]
71,300004MBD,444318609,Jean,Jones,1987-01-21,2019-12-28,True,,True,,[]
103,300004MBD,444556666,Noah,Garcia,NaT,2021-05-01,True,,True,,[]
53,300004MBD,483174065,Regina,Pineda,1969-03-28,2021-05-26,True,,True,,[]
40,300004MBD,536524271,Michael,Cooley,1980-09-18,2022-04-16,True,,True,,[]
72,300004MBD,545615549,Kimberly,Barnes,2004-04-04,2015-06-10,True,,True,,[]
85,300004MBD,559713672,Tanya,Schmidt,1983-11-01,2023-09-05,True,,True,,[]
95,300004MBD,573768871,Brittney,Mitchell,1951-09-21,2013-04-02,True,,True,,[]


'--------------------------------  Test Merging DataFrames and Analysis Engine --------------------------------'

In [4]:
# Cell 4 — Run Merging and Analysis Engine

# Feed both cleaned inputs to the age/tax-code engine; the result is kept as
# `age_matches` and is what the review cells below inspect.
age_matches = run_age_taxcode_analysis(matrix_clean, relius_demo_clean)

# Column-by-column summary (non-null counts and dtypes) of the engine output
age_matches.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60 entries, 0 to 85
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   plan_id                         60 non-null     string 
 1   ssn                             60 non-null     string 
 2   participant_name                60 non-null     object 
 3   state                           60 non-null     string 
 4   gross_amt                       60 non-null     float64
 5   fed_taxable_amt                 60 non-null     float64
 6   txn_date                        60 non-null     object 
 7   txn_method                      60 non-null     string 
 8   tax_code_1                      60 non-null     string 
 9   tax_code_2                      27 non-null     string 
 10  tax_form                        60 non-null     object 
 11  dist_type                       60 non-null     string 
 12  roth_initial_contribution_year  0 non-null 

In [5]:
# Cell 5 — Review tax codes in Series (1 or 2 digits)

# Single brackets select the column as a Series, the same way match_status is
# counted in Cell 10; double brackets would instead count unique rows of a
# one-column DataFrame.
age_matches["tax_code_1"].value_counts()

tax_code_1
1             30
7             29
ZZ             1
Name: count, dtype: int64

In [6]:
# Cell 6 — Review dtype of the age-at-distribution Series and the first rows of the merged and analyzed DataFrame

# print() joins its two arguments with a space, hence the double space in the output
print("age_at_distribution_year dtype: ", age_matches["age_at_distribution_year"].dtype)
# Last expression of the cell: rendered as a table of the first 10 rows
age_matches.head(10)

age_at_distribution_year dtype:  Float64


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2,new_tax_code
0,300004MBDII,182525755,Megan Hall,AR,17381.93,17381.93,2024-10-28,ACH,7,G,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
1,300004MBD,673043377,Michelle Douglas,FM,7429.56,7429.56,2024-11-09,ACH,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
3,400001ABC,605511979,Dawn Frank,SC,7655.57,7655.57,2024-08-28,Check,1,,...,False,1.0,,,,match_no_action,True,,,
4,300004PLAT,596839385,Benjamin Snyder,TX,4615.94,4615.94,2024-09-04,Check,1,,...,True,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
7,300004MBDII,159710067,Tamara Swanson,WI,6636.42,6636.42,2024-03-22,Wire,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
10,400001ABC,415943638,Michelle Gregory,AL,14670.15,14670.15,2024-08-15,Check,7,G,...,True,7.0,,,,match_no_action,True,,,
11,300004PLAT,812610447,Lisa Barnes,LA,8952.82,8952.82,2024-04-22,Wire,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
12,300004MBD,191518845,Timothy Cohen,ME,5625.02,5625.02,2024-10-29,Check,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
13,300004MBDII,464357647,Kristen Douglas,VI,4205.62,4205.62,2024-10-14,ACH,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
16,300004MBDII,116340304,Alexa Hartman,NM,5492.41,5492.41,2024-05-16,Wire,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,


In [7]:
# Cell 7 — Check that no Roth plans made it into the merged DataFrame

# A plan is treated as Roth here when its id begins with "300005" or ends in "R".
plan_ids = age_matches["plan_id"]
is_roth_plan = plan_ids.str.startswith("300005") | plan_ids.str.endswith("R")

# Despite its name, non_roth_test holds the Roth rows; it has to come back empty.
non_roth_test = age_matches[is_roth_plan]
assert non_roth_test.empty, "WE HAVE ROTH DISBURSEMENTS!"

print("No Roth Disbursements in DataFrame!")

non_roth_test.head()

No Roth Disbursements in DataFrame!


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2,new_tax_code


In [8]:
# Cell 8 — Check that rows with tax code G are excluded from the analysis engine

# Display only (nothing is asserted): an empty table means no row carries G in tax_code_1.
is_code_g = age_matches["tax_code_1"] == "G"
rollover_test = age_matches.loc[is_code_g]
rollover_test.head()

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2,new_tax_code


In [9]:
# Cell 9 — Review rows whose match status is anything other than 'match_no_action'

status_differs = age_matches["match_status"].ne("match_no_action")
logic_test = age_matches.loc[status_differs]

logic_test.head(15)

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2,new_tax_code
0,300004MBDII,182525755,Megan Hall,AR,17381.93,17381.93,2024-10-28,ACH,7,G,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
1,300004MBD,673043377,Michelle Douglas,FM,7429.56,7429.56,2024-11-09,ACH,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
4,300004PLAT,596839385,Benjamin Snyder,TX,4615.94,4615.94,2024-09-04,Check,1,,...,True,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
7,300004MBDII,159710067,Tamara Swanson,WI,6636.42,6636.42,2024-03-22,Wire,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
11,300004PLAT,812610447,Lisa Barnes,LA,8952.82,8952.82,2024-04-22,Wire,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
12,300004MBD,191518845,Timothy Cohen,ME,5625.02,5625.02,2024-10-29,Check,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
13,300004MBDII,464357647,Kristen Douglas,VI,4205.62,4205.62,2024-10-14,ACH,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
16,300004MBDII,116340304,Alexa Hartman,NM,5492.41,5492.41,2024-05-16,Wire,1,,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
18,300004PLAT,230512338,Lindsay Sanchez,VT,16883.89,16883.89,2024-12-17,Wire,7,G,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,
20,300004MBDII,446816002,Meghan Porter,FL,16363.57,16363.57,2024-04-28,Check,7,G,...,False,,,,,excluded_from_age_engine_rollover_or_inherited,False,,,


In [10]:
# Cell 10 — Review values and their counts in 'match_status'

# Counts are per row of age_matches, so an SSN with several rows is counted once per row
age_matches["match_status"].value_counts()

match_status
excluded_from_age_engine_rollover_or_inherited    41
match_needs_correction                            12
match_no_action                                    7
Name: count, dtype: int64

In [11]:
# Cell 11 — Review 'match_status' counts with one row per unique SSN

# drop_duplicates keeps the first row encountered for each ssn
filter_df = age_matches.drop_duplicates(subset="ssn")
status_by_ssn = filter_df["match_status"]
print(status_by_ssn.value_counts())

# List any de-duplicated rows whose status is 'age_rule_insufficient_data'
filter_df.loc[status_by_ssn == "age_rule_insufficient_data"].head(15)

match_status
excluded_from_age_engine_rollover_or_inherited    41
match_needs_correction                            11
match_no_action                                    6
Name: count, dtype: int64


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2,new_tax_code


In [12]:
# Cell 12 — Review rows with 'match_needs_correction' status

needs_correction = age_matches["match_status"] == "match_needs_correction"
age_taxcode = age_matches.loc[needs_correction]

print(f" CORRECTION rows: {len(age_taxcode)}")
age_taxcode.head(15)

 CORRECTION rows: 12


Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2,new_tax_code
28,400001ABC,589061532,Richard Espinoza,OR,6338.13,6338.13,2024-05-06,Check,1,,...,True,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7
29,400001ABC,888272020,Brandon Curtis,NM,12609.42,12609.42,2024-07-17,ACH,7,G,...,False,1,,terminated_before_55,UPDATE_1099,match_needs_correction,False,1,,1
34,400001ABC,764006125,Mark Allen,FM,16078.88,16078.88,2024-05-30,ACH,7,G,...,False,1,,terminated_before_55,UPDATE_1099,match_needs_correction,False,1,,1
35,400001ABC,460701992,Pamela Campbell,HI,17440.52,17440.52,2024-12-17,ACH,7,G,...,False,1,,no_term_date_under_55_in_txn_year,UPDATE_1099,match_needs_correction,False,1,,1
42,400001ABC,442797053,Ashley Hall,WV,13006.06,13006.06,2024-09-04,Check,7,G,...,False,1,,terminated_before_55,UPDATE_1099,match_needs_correction,False,1,,1
44,400001ABC,454539137,Alison Shah,NM,6413.41,6413.41,2024-08-11,Check,1,,...,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7
48,400001ABC,512311059,Jennifer Davis,VT,12210.4,12210.4,2024-01-12,Check,7,G,...,False,1,,no_term_date_under_55_in_txn_year,UPDATE_1099,match_needs_correction,False,1,,1
52,400001ABC,944463288,Andrew Bell,RI,8788.41,8788.41,2024-05-22,ACH,1,,...,True,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7
72,400001ABC,589074078,William Jackson,NE,12084.58,12084.58,2024-09-28,Wire,7,G,...,False,1,,terminated_before_55,UPDATE_1099,match_needs_correction,False,1,,1
75,400001ABC,328488441,Jose Greene,RI,17927.51,17927.51,2024-05-22,Check,7,G,...,False,1,,terminated_before_55,UPDATE_1099,match_needs_correction,False,1,,1


In [13]:
# Cell 13 — Review the engine's logic for correction rows where age at distribution is >= 54 years old

is_54_or_older = age_taxcode["age_at_distribution_year"] >= 54.0
age_taxcode.loc[is_54_or_older].head(15)

Unnamed: 0,plan_id,ssn,participant_name,state,gross_amt,fed_taxable_amt,txn_date,txn_method,tax_code_1,tax_code_2,...,attained_55_in_term_year,expected_tax_code_1,expected_tax_code_2,correction_reason,action,match_status,code_matches_expected,suggested_tax_code_1,suggested_tax_code_2,new_tax_code
28,400001ABC,589061532,Richard Espinoza,OR,6338.13,6338.13,2024-05-06,Check,1,,...,True,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7
44,400001ABC,454539137,Alison Shah,NM,6413.41,6413.41,2024-08-11,Check,1,,...,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7
52,400001ABC,944463288,Andrew Bell,RI,8788.41,8788.41,2024-05-22,ACH,1,,...,True,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7
80,400001ABC,222334444,Liam Patel,TX,8522.2,8522.2,2024-02-05,Wire,1,,...,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7
85,400001ABC,222334444,Liam Patel,TX,3000.0,9000.0,2024-02-20,Wire,ZZ,,...,False,7,,age_59_5_or_over_normal_distribution,UPDATE_1099,match_needs_correction,False,7,,7


'--------------------------------  Test Quick Export to Excel File --------------------------------'

In [None]:
# Cell 14 — Use quick report export for manual DataFrame output to Excel for stakeholders

# NOTE(review): this import sits mid-notebook; consider moving it up to the Cell 2 import block.
from src.outputs.export_utils import write_df_excel

# Exports the 'match_needs_correction' rows selected in Cell 12; the returned
# value is what gets reported below as the file path.
path = write_df_excel(age_taxcode, filename_prefix="export_trad_distribs")

print(f"Export was successful!\nFile path: {path}")

'--------------------------------  Test Official Build/Export to Excel Correction File --------------------------------'

In [14]:
# Cell 15 — Use build_correction_file module to build the 'official' correction file

# Build the correction DataFrame from the full engine output (same build as the inherited corrections engine)
age_correction_df = build_correction_dataframe(age_matches)

# (rows, columns) of the correction DataFrame
age_correction_df.shape

(12, 12)

Notes:
- Currently the age_correction DataFrame is finding all discrepancies between G and codes 1, 2 or G.
    - Age correction analysis is working; however, code G is not based on age but on distribution type (G -> Rollovers).
    - We need to filter out distributions that are G (Traditional rollover) and H (Roth rollover) and exclude plans that are inherited, since these are always code 4.
- Some tax codes are two digits (like '11'); I need to change the logic to extract one or two tax code digits.

In [15]:
# Cell 16 — Use build_correction_file module to create/write/export the 'official' correction file

# Write the correction DataFrame to Excel with an auto-generated timestamped name
output_path = write_correction_file(age_correction_df, engine="age_taxcode")

# Display the value returned by write_correction_file() (the output path)
output_path

PosixPath('/Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline/reports/samples/age_taxcode/correction_file_20260105_115852.xlsx')