In [1]:
# Cell 1 - Tell python where to find the modules to import from

import sys
from pathlib import Path

# Path.cwd() depends on where this code is run from, whereas
# Path(__file__).resolve().parents[1] would depend on where the file is located on disk.
#
# project_root = .../1099-reconciliation-pipeline whether the notebook runs
# from the repo root or from notebooks/.
cwd = Path.cwd()  # cwd = current working directory

# If "src" sits directly under cwd we are already at the repo root; otherwise
# assume we are inside notebooks/ and step one folder up with .parent.
project_root = cwd if (cwd / "src").exists() else cwd.parent

# sys.path is the list of folders where python looks for modules.
# Only add the project root when it is missing, so re-running this cell does
# not keep appending duplicate entries to the list.
project_root_entry = str(project_root)
if project_root_entry not in sys.path:
    sys.path.append(project_root_entry)

print("Project root: ", project_root)

Project root:  /Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline


'--------------------------------  Test Merging DataFrames  --------------------------------'

In [2]:
# Cell 2 — Imports, Load & Clean inputs (real paths)

from src.cleaning.clean_relius import clean_relius
from src.cleaning.clean_matrix import clean_matrix
from src.engines.match_planid import reconcile_relius_matrix
from src import load_data
from src.config import RAW_DATA_DIR, USE_SAMPLE_DATA_DEFAULT


# Start with no explicit paths and only point at the real workbooks when the
# sample-data switch is off.
# NOTE(review): presumably the loaders fall back to sample files when path is None — confirm.
matrix_path = None
relius_path = None
if not USE_SAMPLE_DATA_DEFAULT:
    matrix_path = RAW_DATA_DIR / "real_all_matrix_2025.xlsx"
    relius_path = RAW_DATA_DIR / "real_inherited_relius_2025.xlsx"


# Raw exports -> DataFrames (Relius first, then Matrix)
relius_raw = load_data.load_relius_excel(path=relius_path)
matrix_raw = load_data.load_matrix_excel(path=matrix_path)

# Raw DataFrames -> cleaned DataFrames
relius_clean = clean_relius(relius_raw)
matrix_clean = clean_matrix(matrix_raw)

# Reconciliation is limited to the inherited plans for now
inherited_plans = ["300004PLAT", "300004MBD", "300004MBDII"]

reconcile_options = {
    "plan_ids": inherited_plans,
    "apply_business_rules": True,
}
matched = reconcile_relius_matrix(relius_clean, matrix_clean, **reconcile_options)

# DataFrame.shape is a (number of rows, number of columns) tuple, e.g. (1000, 15)
print(matched.shape)
matched.head(20)

  - Find the first numeric digits '\d' before the ending 0 and return it


(662, 48)


  matrix_clean = clean_matrix(matrix_raw)
  return pd.to_datetime(series, errors=errors, format=format, dayfirst=dayfirst).dt.date


Unnamed: 0,plan_id,ssn,first_name,last_name,state_relius,gross_amt,exported_date,tax_year,dist_code_1,dist_name,...,match_status,expected_tax_code_1,expected_tax_code_2,code_matches_expected,needs_correction,suggested_tax_code_1,suggested_tax_code_2,correction_reason,action,new_tax_code
0,300004MBD,150406376,,,,151452.18,NaT,,,,...,unmatched_matrix,,,False,False,,,,,
1,300004MBD,150406376,Robert,Wilkins,DE,151522.18,2025-10-15,0.0,G,Rollover,...,unmatched_relius,,,True,False,,,,,
2,300004MBD,159364560,Maria,Cantalupo,PA,10872.69,2025-10-02,0.0,4,RMD ACH,...,match_no_action,4.0,,True,False,,,,,
3,300004MBD,186545616,Maureen,King,PA,25000.0,2025-09-22,0.0,4,Partial Liquidation Net ACH,...,match_no_action,4.0,,True,False,,,,,
4,300004MBD,188368532,,,,1699.02,NaT,,,,...,unmatched_matrix,,,False,False,,,,,
5,300004MBD,188368532,Deborah,Gryshuk,PA,1709.02,2025-10-03,0.0,7,Recurring Check Net,...,unmatched_relius,,,True,False,,,,,
6,300004MBD,194420270,,,,669.52,NaT,,,,...,unmatched_matrix,,,False,False,,,,,
7,300004MBD,194420270,David,Symons,PA,2652.38,2025-06-13,0.0,4,Partial Liquidation Gross ACH,...,match_needs_correction,4.0,,False,True,4.0,,inherited_cash_expected_4,UPDATE_1099,4.0
8,300004MBD,194420270,,,,2678.1,NaT,,,,...,unmatched_matrix,,,False,False,,,,,
9,300004MBD,194420270,David,Symons,PA,3347.62,2025-06-11,0.0,4,RMD ACH,...,unmatched_relius,,,True,False,,,,,


Notes:
- After matching process is executed (and only in inherited plans):
    - A 662 rows and 48 columns Matched DataFrame is generated
    - This new DataFrame has "both matched" rows and "only Relius" and "only Matrix" rows also.
- The data is indexed correctly; however, the match produced some "duplicates" in match_status:
    - Since the matching keys are gross amt, SSN and plan ID, monthly systematic distributions share the same values in all three keys.
    - As a result, a distribution from Relius in month X is matched with all Matrix distributions in months X, Y, Z, etc.
    - One distribution is matched and flagged as needing correction or not; the others are marked "date_out_of_range".
    - Will need to review the transaction_id Series specifically to confirm duplicates.

In [3]:
# Cell 3 — Identify duplication issue

# Number of rows in the merged df that carry each Matrix transaction id
tx_id_counts = matched["transaction_id"].value_counts()

# Preview the ids that are present on more than one row
repeated_tx_id_counts = tx_id_counts[tx_id_counts.gt(1)]
repeated_tx_id_counts.head(10)

transaction_id
41521112    6
44241680    6
40587444    6
41521111    6
42348067    6
43303285    6
44241671    6
40147199    6
40587443    6
41521110    6
Name: count, dtype: int64

Notes:
- transaction_id is a unique transaction number per distribution in Matrix.
- We have duplicates for several transaction_id values in our matched DataFrame (these must be the monthly recurring distributions with the same gross amt, plan ID and SSN).

- Need to implement a quick solution in the notebook to test whether applying lag days between a distribution exiting Relius and arriving in Matrix avoids the dups.

'--------------------------------  Filter Matching Rows  --------------------------------'

In [4]:
# Cell 4 — Implement transaction_date vs export_date tolerance to filter matched transactions

# Row is present in both Relius and Matrix according to the merge indicator
in_both_systems = matched["_merge"] == "both"
# Row is flagged as falling inside the allowed date tolerance
within_tolerance = matched["date_within_tolerance"]

# .copy() gives an independent frame rather than a view of `matched`
matches_in_range = matched[in_both_systems & within_tolerance].copy()

matches_in_range["match_status"].value_counts()

match_status
match_needs_correction    114
match_no_action            40
Name: count, dtype: int64

Notes:
- When we filter the matched DataFrame to rows that are present in both systems and are date_within_tolerance:
    - We get 114 rows (unique transaction ids) that need correction
    - Plus 40 that are a perfect match (no correction needed)
- The filter seems to work: it yields the real status for matched rows and fixes the match_status for the monthly recurring distributions

In [5]:
# Cell 5 — Identify best match

# Order rows so that, within each transaction_id, the smallest lag comes first.
# NOTE(review): assumes date_lag_days is never negative; a negative lag would
# sort ahead of an exact (0 day) match — confirm against the matching engine.
best_match_order = ["transaction_id", "date_lag_days"]
matches_in_range = matches_in_range.sort_values(by=best_match_order)

# With that ordering, keeping the first row per transaction_id keeps the best match
primary_matches = matches_in_range.drop_duplicates(subset="transaction_id", keep="first")

primary_matches["transaction_id"].value_counts().head(10)

transaction_id
40047551    1
43303283    1
43144293    1
43189338    1
43237729    1
43237730    1
43237731    1
43237753    1
43238069    1
43303284    1
Name: count, dtype: int64

Notes:
- When applying the sort and removing all dups (using transaction_id as subset):
    - we get the original transaction_id row matched (Relius vs Matrix)
    - we get the real match_status (whether it needs correction or not)
    - the primary_matches DataFrame keeps the same shape as matches_in_range. No real data lost

In [6]:
# Cell 6 — New shape after date tolerance filter has been applied

# Columns shown for the review, in display order:
# participant identity, amounts/dates, then current vs expected vs suggested tax codes
review_columns = [
    "plan_id",
    "first_name",
    "last_name",
    "ssn",
    "gross_amt",
    "exported_date",
    "txn_date",
    "date_lag_days",
    "dist_name",
    "transaction_id",
    "match_status",
    "tax_code_1",
    "tax_code_2",
    "expected_tax_code_1",
    "expected_tax_code_2",
    "suggested_tax_code_1",
    "suggested_tax_code_2",
    "correction_reason",
]

print(primary_matches.shape)
primary_matches[review_columns].head(20)

(154, 48)


Unnamed: 0,plan_id,first_name,last_name,ssn,gross_amt,exported_date,txn_date,date_lag_days,dist_name,transaction_id,match_status,tax_code_1,tax_code_2,expected_tax_code_1,expected_tax_code_2,suggested_tax_code_1,suggested_tax_code_2,correction_reason
10,300004MBD,James,McNamee,197323440,12623.05,2025-06-11,2025-06-12,1.0,RMD ACH,40047551,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
59,300004MBDII,Gail,Jones,178666831,2352.94,2025-06-11,2025-06-12,1.0,Recurring ACH,40047552,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
102,300004MBDII,Karen,Ely Anderson,191689340,833.33,2025-06-11,2025-06-12,1.0,Recurring ACH,40047553,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
140,300004MBDII,Kenneth,Stoudt,206566368,3333.33,2025-06-11,2025-06-12,1.0,Recurring ACH,40047554,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
203,300004PLAT,Michael,Dalmaso,162542334,1000.0,2025-06-11,2025-06-12,1.0,Recurring ACH,40047555,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
276,300004PLAT,Myra,Brown,164401356,1558.82,2025-06-11,2025-06-12,1.0,Recurring ACH,40047556,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
350,300004PLAT,Timothy,Groth,167686361,1666.67,2025-06-11,2025-06-12,1.0,Recurring ACH,40047557,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
387,300004PLAT,LeeAnn,Ahern,169481068,6957.94,2025-06-11,2025-06-12,1.0,RMD ACH,40047558,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
486,300004PLAT,Joanne,Piombino,188608887,853.25,2025-06-11,2025-06-12,1.0,Recurring ACH,40047559,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4
563,300004PLAT,Brian,Stancavage,200504595,528.89,2025-06-11,2025-06-12,1.0,Recurring ACH,40047560,match_needs_correction,7,,4,,4.0,,inherited_cash_expected_4


Note:
- Verified: each row corresponds to a unique transaction_id value now.
- match_status now presents a 'real' status, since the transactions are date_within_tolerance.

In [7]:
# Cell 7 — Review specific case for recurring distrib

# Participant under review (compared as a stripped string, see ssn_mask below)
target_ssn = "197526965"

# Columns to display for the participant, in display order
cols = [
    "plan_id",
    "first_name",
    "last_name",
    "ssn",
    "gross_amt",
    "exported_date",
    "txn_date",
    "date_lag_days",
    "dist_name",
    "transaction_id",
    "match_status",
    "tax_code_1",
    "tax_code_2",
    "expected_tax_code_1",
    "expected_tax_code_2",
    "suggested_tax_code_1",
    "suggested_tax_code_2",
    "correction_reason",
]

# Normalise ssn to text without surrounding whitespace before comparing
ssn_as_text = primary_matches["ssn"].astype(str).str.strip()
ssn_mask = ssn_as_text == target_ssn

# Rows picked by the mask, columns picked by the list
participant = primary_matches.loc[ssn_mask, cols]

participant.head(10)

Unnamed: 0,plan_id,first_name,last_name,ssn,gross_amt,exported_date,txn_date,date_lag_days,dist_name,transaction_id,match_status,tax_code_1,tax_code_2,expected_tax_code_1,expected_tax_code_2,suggested_tax_code_1,suggested_tax_code_2,correction_reason
17,300004MBD,David,Kich,197526965,3000.0,2025-06-17,2025-06-17,0.0,Partial Liquidation Gross ACH,40191468,match_needs_correction,7,,4,,4,,inherited_cash_expected_4
22,300004MBD,David,Kich,197526965,3000.0,2025-07-03,2025-07-03,0.0,Recurring ACH,40587726,match_needs_correction,7,,4,,4,,inherited_cash_expected_4
27,300004MBD,David,Kich,197526965,3000.0,2025-08-05,2025-08-05,0.0,Recurring ACH,41521394,match_needs_correction,7,,4,,4,,inherited_cash_expected_4
32,300004MBD,David,Kich,197526965,3000.0,2025-09-03,2025-09-03,0.0,Recurring ACH,42348394,match_needs_correction,7,,4,,4,,inherited_cash_expected_4
37,300004MBD,David,Kich,197526965,3000.0,2025-10-03,2025-10-03,0.0,Recurring ACH,43238069,match_needs_correction,7,,4,,4,,inherited_cash_expected_4
42,300004MBD,David,Kich,197526965,3000.0,2025-11-04,2025-11-04,0.0,Recurring ACH,44178501,match_needs_correction,7,,4,,4,,inherited_cash_expected_4


Notes:
- Finding a specific participant with monthly Distributions we confirmed all recurring distributions are matched and with their unique transaction_id.
- No Duplicates in our DataFrame.

- Below we run the same extraction, but using plain boolean indexing (all columns) instead of .loc[mask, cols].

In [8]:
# Cell 8 — Testing boolean extraction

# Plain boolean indexing with the mask from Cell 7: same rows, every column kept
participant_all_columns = primary_matches[ssn_mask]
participant_all_columns.head(10)

Unnamed: 0,plan_id,ssn,first_name,last_name,state_relius,gross_amt,exported_date,tax_year,dist_code_1,dist_name,...,match_status,expected_tax_code_1,expected_tax_code_2,code_matches_expected,needs_correction,suggested_tax_code_1,suggested_tax_code_2,correction_reason,action,new_tax_code
17,300004MBD,197526965,David,Kich,PA,3000.0,2025-06-17,0,4,Partial Liquidation Gross ACH,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
22,300004MBD,197526965,David,Kich,PA,3000.0,2025-07-03,0,4,Recurring ACH,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
27,300004MBD,197526965,David,Kich,PA,3000.0,2025-08-05,0,7,Recurring ACH,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
32,300004MBD,197526965,David,Kich,PA,3000.0,2025-09-03,0,7,Recurring ACH,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
37,300004MBD,197526965,David,Kich,PA,3000.0,2025-10-03,0,7,Recurring ACH,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
42,300004MBD,197526965,David,Kich,PA,3000.0,2025-11-04,0,7,Recurring ACH,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4


Notes:
- dist_code_1 comes from Relius and shows tax code 4; however, "match_needs_correction" is triggered.
- Need to build another DataFrame below to compare the tax codes in Relius (dist_code_1) vs Matrix (tax_code_1).
- The Matrix report takes priority since they sent the 1099 to participants directly.

'--------------------------------  Run Correction File  --------------------------------'

'---------- primary_matches tests ----------'

In [9]:
# Cell 9 — Testing DataFrame ready to build correction file

# Row count per match_status in the de-duplicated matches
match_status_counts = primary_matches["match_status"].value_counts()
match_status_counts

match_status
match_needs_correction    114
match_no_action            40
Name: count, dtype: int64

In [10]:
# Cell 10 — Test 'action' Series is indexed correctly

# dropna=False keeps the missing-action rows in the tally instead of hiding them
action_counts = primary_matches["action"].value_counts(dropna=False)
action_counts

action
UPDATE_1099    114
<NA>            40
Name: count, dtype: int64

Notes:
- Test "action" column has the correct values and it is indexed correctly.

In [11]:
# Cell 11 — Review tax codes between Matrix and Relius transactions

def _move_column_after(columns, name, anchor):
    """Return a new list where `name` sits immediately after `anchor`.

    `name` is taken out of its current position first, so the label is never
    listed twice. Raises ValueError when `anchor` is not in `columns`.
    """
    reordered = [col for col in columns if col != name]
    reordered.insert(reordered.index(anchor) + 1, name)
    return reordered


# Rows flagged for correction that also carry a suggested replacement code
expected_corrections = primary_matches[
    (primary_matches["match_status"] == "match_needs_correction")
    & primary_matches["suggested_tax_code_1"].notna()
]

# Put the Matrix columns next to their Relius counterparts so the differences
# can be read side by side:
#   txn_date   (Matrix) right after exported_date (Relius)
#   tax_code_1 (Matrix) right after dist_code_1   (Relius)
# Both columns already exist in the frame, so they are MOVED rather than
# inserted a second time: a list with repeated labels would select the same
# column twice. Anchoring on column names also avoids relying on hard-coded
# list positions.
custom_cols = list(expected_corrections.columns)
custom_cols = _move_column_after(custom_cols, "tax_code_1", "dist_code_1")
custom_cols = _move_column_after(custom_cols, "txn_date", "exported_date")
custom_cols.remove("state_relius")

print(expected_corrections.shape)
expected_corrections[custom_cols].head(20)

(114, 48)


Unnamed: 0,plan_id,ssn,first_name,last_name,gross_amt,exported_date,txn_date,tax_year,dist_code_1,tax_code_1,...,match_status,expected_tax_code_1,expected_tax_code_2,code_matches_expected,needs_correction,suggested_tax_code_1,suggested_tax_code_2,correction_reason,action,new_tax_code
10,300004MBD,197323440,James,McNamee,12623.05,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
59,300004MBDII,178666831,Gail,Jones,2352.94,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
102,300004MBDII,191689340,Karen,Ely Anderson,833.33,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
140,300004MBDII,206566368,Kenneth,Stoudt,3333.33,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
203,300004PLAT,162542334,Michael,Dalmaso,1000.0,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
276,300004PLAT,164401356,Myra,Brown,1558.82,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
350,300004PLAT,167686361,Timothy,Groth,1666.67,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
387,300004PLAT,169481068,LeeAnn,Ahern,6957.94,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
486,300004PLAT,188608887,Joanne,Piombino,853.25,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4
563,300004PLAT,200504595,Brian,Stancavage,528.89,2025-06-11,2025-06-12,0,4,7,...,match_needs_correction,4,,False,True,4,,inherited_cash_expected_4,UPDATE_1099,4


Notes:
- Verified: filtering primary_matches into the expected_corrections DataFrame keeps only the rows that need correction (based on the Matrix tax code).
- A discrepancy has been identified where the Relius tax code is 4 (correct) but the Matrix tax code is 7 (wrong, so the 1099 needs to be corrected).
- Will review with team to identify the issue!

In [12]:
# Cell 12 — Build correction DataFrame

# Import the module object and reload it, so edits made to
# build_correction_file.py since the kernel first imported it are picked up
# without restarting the kernel. The order matters: the `from ... import`
# below must run AFTER reload() to bind the freshly loaded function.
from importlib import reload
import src.outputs.build_correction_file as bcf
reload(bcf)

from src.outputs.build_correction_file import build_correction_dataframe

# 1) Build the correction dataframe
#
# primary_matches is the filtered matches DataFrame from Cell 5
# (one row per transaction_id, within the date tolerance).
corrections_df = build_correction_dataframe(primary_matches)

# Preview the first rows of the result
corrections_df.head()

Unnamed: 0,Transaction Id,Transaction Date,Participant SSN,Participant Name,Matrix Account,Current Tax Code 1,Current Tax Code 2,New Tax Code,New Taxable Amount,New First Year contrib,Reason,Action
0,44241666,2025-11-05,73467632,Stephen Williams,07P6LM4C,7,,4,,,inherited_cash_expected_4,UPDATE_1099
1,44241667,2025-11-05,160542844,Brian Markle,07P6LM4C,7,,4,,,inherited_cash_expected_4,UPDATE_1099
2,44241668,2025-11-05,160544058,Christopher Markle,07P6LM4C,7,,4,,,inherited_cash_expected_4,UPDATE_1099
3,40047555,2025-06-12,162542334,Michael Dalmaso,07P6LM4C,7,,4,,,inherited_cash_expected_4,UPDATE_1099
4,40587442,2025-07-03,162542334,Michael Dalmaso,07P6LM4C,7,,4,,,inherited_cash_expected_4,UPDATE_1099


Notes:
- Verified: build corrections_df DataFrame using columns from Matrix corrections file template with correct data is successful.

In [13]:
# Cell 13 — Generate Excel correction file and save in Path

from src.outputs.build_correction_file import write_correction_file

# 2) Run write_correction_file(): write corrections_df to Excel with an
#    auto-generated timestamped name. The returned value is kept so the
#    location can be reported below.
# NOTE(review): `engine` presumably selects the output sub-folder — confirm in write_correction_file.
output_path = write_correction_file(corrections_df, engine="match_planid")

# Report where the file was written
print(f"File saved successfully in: {output_path}")

File saved successfully in: /Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline/reports/outputs/match_planid/correction_file_20260103_170203.xlsx


Notes:
- Verified: the file was created in the desired path with the correct data (the transactions that need a tax code correction).