In [47]:
# Install required packages.
# `%pip` (rather than `!pip`) installs into the environment of the running kernel;
# restart the kernel afterwards if pip reports that updated packages need it.
%pip install chunkr_ai plotly pandas beautifulsoup4 lxml pydantic 


Note: you may need to restart the kernel to use updated packages.


In [32]:
from chunkr_ai import Chunkr
from chunkr_ai.models import Configuration, SegmentProcessing, GenerationConfig

# Client object used for the upload call later in this notebook.
# NOTE(review): no credentials are passed here — presumably the client picks up its
# API key from the environment; confirm against the chunkr_ai documentation.
chunkr = Chunkr()

In [42]:
config = Configuration(
    segment_processing=SegmentProcessing(
        Table=GenerationConfig(
            llm="Generate a description of the table"
        ),
    )
)
# task = await chunkr.upload("https://www.ed.gov/sites/ed/files/about/overview/budget/budget24/24action.xlsx")
task = await chunkr.upload("https://data.ed.gov/dataset/3683c899-b7b5-4ad4-9b5d-152d197ab389/resource/f5d00bcf-b58e-4914-ba29-a03440e63a9f/download/20action.xlsx", config=config)


In [None]:
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO

def parse_table_html(html_content):
    """
    Parse the first table found in an HTML string into a pandas DataFrame.

    The number of <tr> rows inside the first <thead> decides how the header is
    read: with more than one header row, every one of them is handed to
    ``pd.read_html`` (yielding multi-level column labels); in all other cases,
    including a missing <thead>, only row 0 is treated as the header.

    Parameters:
        html_content: HTML markup (str) containing at least one <table>.

    Returns:
        The DataFrame for the first table that ``pd.read_html`` finds.
    """
    head_section = BeautifulSoup(html_content, 'html.parser').find('thead')
    header_row_count = len(head_section.find_all('tr')) if head_section else 0

    # A list of row positions requests a multi-row header; a bare 0 requests
    # a single header row.
    if header_row_count > 1:
        header_rows = list(range(header_row_count))
    else:
        header_rows = 0

    parsed_frames = pd.read_html(StringIO(html_content), header=header_rows)
    return parsed_frames[0]

In [96]:
from pydantic import BaseModel, Field, ConfigDict
from chunkr_ai.models import Segment
import uuid

class Table(BaseModel):
    """A table segment paired with the pandas DataFrame parsed from it."""

    # `df` is a pandas DataFrame, which is not a pydantic-native type, so arbitrary
    # types have to be allowed for this model.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    
    # Random UUID4 string, generated per instance when no id is supplied.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # The chunkr_ai Segment this table belongs to.
    segment: Segment
    # Tabular data for the segment as a DataFrame.
    df: pd.DataFrame

# First gather every segment tagged as a table, in document order ...
table_segments = [
    segment
    for chunk in task.output.chunks
    for segment in chunk.segments
    if segment.segment_type == "Table"
]

# ... then parse each segment's HTML content into a DataFrame-backed Table.
tables = [
    Table(segment=table_segment, df=parse_table_html(table_segment.content))
    for table_segment in table_segments
]

# Show the parsed column labels of every table, numbered from 1
for i, table in enumerate(tables):
    column_labels = table.df.columns.tolist()
    # One print call; sep="\n" keeps the title and the label list on separate lines.
    print(f"\nTable {i+1} Headers:", column_labels, sep="\n")



Table 1 Headers:
[('Education for the Disadvantaged', 'Program'), ('Unnamed: 1_level_0', 'Cat Code'), ('Unnamed: 2_level_0', '2019 Appropriation'), ('Unnamed: 3_level_0', "2020 President's Budget"), ('Unnamed: 4_level_0', '2020 House'), ('Unnamed: 5_level_0', '2020 Senate Mark'), ('Unnamed: 6_level_0', '2020 Appropriation'), ('Unnamed: 7_level_0', '2020 Appropriation Compared to 2019 Appropriation Amount'), ('Unnamed: 8_level_0', '2020 Appropriation Compared to 2019 Appropriation Percent'), ('Unnamed: 9_level_0', "2020 Appropriation Compared to 2020 President's Budget Amount"), ('Unnamed: 10_level_0', "2020 Appropriation Compared to 2020 President's Budget Percent")]

Table 2 Headers:
[('Impact Aid (ESEA VII)', 'Program'), ('Unnamed: 1_level_0', 'Cat Code'), ('Unnamed: 2_level_0', '2019 Appropriation'), ('Unnamed: 3_level_0', "2020 President's Budget"), ('Unnamed: 4_level_0', '2020 House'), ('Unnamed: 5_level_0', '2020 Senate Mark'), ('Unnamed: 6_level_0', '2020 Appropriation'), ('Unn

In [103]:
class ProgramTotal(BaseModel):
    """A single table row together with the name of the program it belongs to."""

    # `row` is a pandas Series, which is not a pydantic-native type, so arbitrary
    # types have to be allowed for this model.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    program: str  # program name for the row
    row: pd.Series  # the row itself, indexed by the table's column labels

# First-column values that mark a table's summary row.
TOTAL_ROW_LABELS = ("Total", "Total, Appropriation")

# One ProgramTotal per summary row found across all parsed tables.
program_data: list[ProgramTotal] = []

for table in tables:
    df = table.df
    for _index, row in df.iterrows():
        # Positional access must go through .iloc: each row is indexed by the
        # column labels, and using an integer key with `row[0]` as a position
        # is deprecated in pandas (it is slated to become a label lookup).
        if row.iloc[0] in TOTAL_ROW_LABELS:
            # row.index holds the column labels. With two-level headers the first
            # label is a (program title, "Program") pair, so [0][0] is the title.
            # NOTE(review): for a table with single-level headers this yields only
            # the first character of the column name — confirm every table has
            # two header rows.
            program_data.append(ProgramTotal(program=row.index[0][0], row=row))

# Dump each collected total: program name, the row itself, then a divider line.
for data in program_data:
    print(data.program, data.row, "-" * 50, sep="\n")

Education for the Disadvantaged
Education for the Disadvantaged  Program                                                           Total, Appropriation
Unnamed: 1_level_0               Cat Code                                                                             D
Unnamed: 2_level_0               2019 Appropriation                                                          16543790.0
Unnamed: 3_level_0               2020 President's Budget                                                     16376790.0
Unnamed: 4_level_0               2020 House                                                                  17563802.0
Unnamed: 5_level_0               2020 Senate Mark                                                            16543790.0
Unnamed: 6_level_0               2020 Appropriation                                                          16996790.0
Unnamed: 7_level_0               2020 Appropriation Compared to 2019 Appropriation Amount                        453000
Unnamed: