In [17]:
import sys
from pathlib import Path

#Path.cwd().parent depends on where you run this code from
#Path(__file__).resolve().parents[1] depends on where this file is located on disk
#
#project_root = .../1099-reconciliation-pipeline
project_root = Path.cwd().parent  # running from notebooks/  folder (cwd = current working directory) - .parent gets us to the folder above
sys.path.append(str(project_root)) #sys.path is a list of folders where python looks for modules
                                    # we add the project root to that list with .append

print(project_root)

/Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline


In [18]:
import pandas as pd

from src import load_data

In [None]:
from src.config import RAW_DATA_DIR

relius_path = RAW_DATA_DIR / "real_relius_2025.xlsx"
matrix_path = RAW_DATA_DIR / "real_matrix_2025.xlsx"

# Calling the load_data.py functions to load the raw data
# returns uncleaned pandas DataFrames
relius_raw = load_data.load_relius_excel(path=relius_path, use_sample_if_none=False)
matrix_raw = load_data.load_matrix_excel(path=matrix_path, use_sample_if_none=False)

# .shape is an attribute of pandas DataFrames that returns a tuple of
# (number of rows, number of columns) - e.g. (1000, 15)
relius_raw.shape, matrix_raw.shape

((253, 374), (60972, 56))

'--------------------------------  Test Raw Files  --------------------------------'

In [23]:
# .columns is an attribute of pandas DataFrames that returns
# the column names as an Index object - we convert it to a list for easier viewing
relius_raw.columns.tolist()
matrix_raw.columns.tolist()

['Matrix Account',
 'Account Name',
 'Transaction Date',
 'Transaction Type',
 'Net Amount',
 'Payee Name',
 'Status',
 'Participant SSN',
 'Participant Name',
 'Gross Amount',
 'Fed Wages',
 'Fed Withholding',
 'State Withholding',
 'Tax State',
 'Loan Default Amount',
 'Fed Taxable Amount',
 'Employee Roth Contributions',
 'Tax Code',
 'Tax Code 2',
 'Transaction Id',
 'Client Account',
 'TPA',
 'Recurring',
 'Fed Ref Number',
 'ACH Trace Id',
 'Check Number',
 'Check Date',
 'Reissue',
 'Check Clear Date',
 'Special Handling Carrier',
 'Tracking Number',
 'Participant Address 1',
 'Participant Address 2',
 'Participant City',
 'Participant State',
 'Participant Zip',
 'Payee Address 1',
 'Payee Address 2',
 'Payee City',
 'Payee State',
 'Payee Zip',
 'Payee Bank Account Number',
 'Payee ABA',
 'Payee Bank Name',
 'Additional Notes',
 'Tax Form',
 'Roth Initial Contribution Year',
 'Distribution Type',
 'State Taxable Amount',
 'Taxable Amount Not Determined',
 'Federal Taxing Metho

In [None]:

# When [[...]] is used, it returns a DataFrame with only the selected columns
# When [...] is used, it returns a Series (one column, like a vector) with only the selected column
# Here we display the first 5 rows of selected columns from each DataFrame - .head() defaults to 5 rows
matrix_raw[["Client Account", "Participant SSN", "Participant Name", "Participant State", "Gross Amount", "Transaction Date", "Tax Code"]].head()
relius_raw[["PLANID_1", "SSNUM_1", "FIRSTNAM", "LASTNAM", "STATEADDR", "GROSSDISTRAMT", "EXPORTEDDATE", "DISTR1CD"]].head()


Unnamed: 0,PLANID_1,SSNUM_1,FIRSTNAM,LASTNAM,STATEADDR,GROSSDISTRAMT,EXPORTEDDATE,DISTR1CD
0,300004MBD,194420270,David,Symons,PA,3347.62,2025-06-11,4.0
1,300004MBD,197323440,James,McNamee,PA,12623.05,2025-06-11,4.0
2,300004MBD,194420270,David,Symons,PA,2652.38,2025-06-13,4.0
3,300004MBD,194420270,David,Symons,PA,0.0,NaT,
4,300004MBD,197526965,David,Kich,PA,3000.0,2025-06-17,4.0


In [None]:
# .info() gives a quick summary of the whole table (not the data itself), showing: number of rows and columns, columns names,
# each column's dtype(int, float, object str, datetime, etc.), how many non-null values per column, memory usage.
relius_raw.info()
matrix_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253 entries, 0 to 252
Columns: 374 entries, PLANID to MANLTAXPCT
dtypes: datetime64[ns](8), float64(83), int64(101), object(182)
memory usage: 739.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60972 entries, 0 to 60971
Data columns (total 56 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Matrix Account                  60969 non-null  object 
 1   Account Name                    60968 non-null  object 
 2   Transaction Date                60968 non-null  object 
 3   Transaction Type                60968 non-null  object 
 4   Net Amount                      60968 non-null  float64
 5   Payee Name                      7144 non-null   object 
 6   Status                          60968 non-null  object 
 7   Participant SSN                 7065 non-null   float64
 8   Participant Name                7054 non-null   object 
 9   Gross Amount

In [None]:
# .describe() gives summary statis for numeric columns(default)

# df[[..]] stats as a DataFrame(good for multiple columns or further processing)
# df[..] stats as a Series(one column vector)

relius_raw[["GROSSDISTRAMT"]].describe()
matrix_raw[["Gross Amount"]].describe()

Unnamed: 0,Gross Amount
count,60968.0
mean,6850.697
std,41745.78
min,0.0
25%,68.095
50%,350.0
75%,1400.0
max,2031957.0


Notes regarding df[[...]] and [...]:

- Calling .describe() on a DataFrame [[...]] returns another DataFrame of stats.
    - the result as a DataFrame makes easier to combine with other tables or export.

- Calling .describe() on a Series [...] returns a Series of stats.
    - the result as a series (one column) makes easire just to get stats in a notebook.