In [17]:
import sys
from pathlib import Path

# Path.cwd().parent depends on where this code is run from, whereas
# Path(__file__).resolve().parents[1] would depend on where the file is located on disk.
#
# project_root = .../1099_reconciliation_pipeline
# Running from the notebooks/ folder: cwd is the current working directory and
# .parent is the folder directly above it.
project_root = Path.cwd().parent

# sys.path is the list of folders where Python looks for modules. Add the
# project root only when it is missing, so re-running this cell does not keep
# appending duplicate entries.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.append(project_root_str)

print(project_root)

/Users/manuelreyes/Desktop/dev/1099_reconciliation_pipeline


In [18]:
import pandas as pd

from src import load_data

In [19]:
from src.config import RAW_DATA_DIR

# Locations of the two raw Excel exports inside RAW_DATA_DIR
matrix_path = RAW_DATA_DIR / "real_matrix_2025.xlsx"
relius_path = RAW_DATA_DIR / "real_relius_2025.xlsx"

# Load both workbooks through the load_data.py helpers. An explicit path is
# passed with use_sample_if_none=False; the results are the uncleaned
# pandas DataFrames.
relius_raw = load_data.load_relius_excel(
    path=relius_path,
    use_sample_if_none=False,
)
matrix_raw = load_data.load_matrix_excel(
    path=matrix_path,
    use_sample_if_none=False,
)

# Cell output: each DataFrame's .shape, a (number of rows, number of columns)
# tuple - e.g. (1000, 15)
(relius_raw.shape, matrix_raw.shape)

((253, 374), (60972, 56))

In [None]:
# A notebook cell only renders the value of its LAST expression, so every
# result we want to see is passed to display() explicitly (display is
# available without an import inside IPython/Jupyter kernels).

# .columns is an attribute of pandas DataFrames that returns
# the column names as an Index object - we convert it to a list for easier viewing
display(relius_raw.columns.tolist())
display(matrix_raw.columns.tolist())

# When [[...]] is used, it returns a DataFrame with only the selected columns
# When [...] is used, it returns a Series (one column, like a vector) with only the selected column
# Here we display the first 5 rows of selected columns from each DataFrame - .head() defaults to 5 rows
# NOTE(review): these previews include SSN and name columns - clear the cell
# output before sharing or committing the notebook.
matrix_preview_cols = [
    "Client Account",
    "Participant SSN",
    "Participant Name",
    "Participant State",
    "Gross Amount",
    "Transaction Date",
    "Tax Code",
]
relius_preview_cols = [
    "PLANID_1",
    "SSNUM_1",
    "FIRSTNAM",
    "LASTNAM",
    "STATEADDR",
    "GROSSDISTRAMT",
    "EXPORTEDDATE",
    "DISTR1CD",
]
display(matrix_raw[matrix_preview_cols].head())
display(relius_raw[relius_preview_cols].head())


Unnamed: 0,PLANID_1,SSNUM_1,FIRSTNAM,LASTNAM,STATEADDR,GROSSDISTRAMT,EXPORTEDDATE,DISTR1CD
0,300004MBD,194420270,David,Symons,PA,3347.62,2025-06-11,4.0
1,300004MBD,197323440,James,McNamee,PA,12623.05,2025-06-11,4.0
2,300004MBD,194420270,David,Symons,PA,2652.38,2025-06-13,4.0
3,300004MBD,194420270,David,Symons,PA,0.0,NaT,
4,300004MBD,197526965,David,Kich,PA,3000.0,2025-06-17,4.0


In [None]:
# .info() prints its summary (column names, non-null counts, dtypes) directly,
# so both calls produce output on their own.
relius_raw.info()
matrix_raw.info()

# .describe() RETURNS a DataFrame of summary statistics instead of printing it.
# Only the last expression of a cell is rendered, so both tables are passed to
# display() to keep the Relius statistics from being silently discarded.
display(relius_raw[["GROSSDISTRAMT"]].describe())
display(matrix_raw[["Gross Amount"]].describe())