In [16]:
import pandas as pd
import numpy as np
import os
import unstructured
from unstructured.partition.xlsx import partition_xlsx

In [13]:
help(unstructured.partition.partition_xlsx)

AttributeError: module 'unstructured.partition' has no attribute 'partition_xlsx'

In [17]:
help(partition_xlsx)

Help on function partition_xlsx in module unstructured.partition.xlsx:

partition_xlsx(filename: 'Optional[str]' = None, file: 'Optional[IO[bytes]]' = None, metadata_filename: 'Optional[str]' = None, include_metadata: 'bool' = True, infer_table_structure: 'bool' = True, languages: 'Optional[list[str]]' = ['auto'], detect_language_per_element: 'bool' = False, metadata_last_modified: 'Optional[str]' = None, include_header: 'bool' = False, find_subtable: 'bool' = True, date_from_file_object: 'bool' = False, starting_page_number: 'int' = 1, **kwargs: 'Any') -> 'list[Element]'
    Partitions Microsoft Excel Documents in .xlsx format into its document elements.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    include_metadata
        Determines whether or not metadata is included in the output.
    infer_table_structure
        If True, any Table elements that are 

In [19]:
excel_partition = partition_xlsx('raw_files/CMS SEAS - Clean PB_KH.xlsx')

In [3]:
test_filepath = "raw_files/CMS SEAS - Clean PB_KH.xlsx"

In [3]:
def search_files(path):
    if os.path.isdir(path):
        files = []
        for file in os.listdir(path):
            if file.endswith(".xlsx"):
                files.append(os.path.join(path, file))
        return files
    elif os.path.isfile(path) and path.endswith(".xlsx"):
        return [path]
    else:
        return []

In [8]:
files_list = search_files(test_filepath)

In [9]:
files_list

['raw_files/CMS SEAS - Clean PB_KH.xlsx']

In [11]:
def parse_excel_sheet(file, sheet_name=0, threshold=3):
    '''parses multiple tables from an excel sheet into multiple data frame objects. 
    Returns [dfs, df_mds], where dfs is a list of data frames and df_mds their potential associated metadata'''
    xl = pd.ExcelFile(file)
    entire_sheet = xl.parse(sheet_name=sheet_name)

    # count the number of non-Nan cells in each row and then the change in that number between adjacent rows
    n_values = np.logical_not(entire_sheet.isnull()).sum(axis=1)
    n_values_deltas = n_values[1:] - n_values[:-1].values

    # define the beginnings and ends of tables using delta in n_values
    table_beginnings = n_values_deltas > threshold
    table_beginnings = table_beginnings[table_beginnings].index
    table_endings = n_values_deltas < -threshold
    table_endings = table_endings[table_endings].index
    if len(table_beginnings) < len(table_endings) or len(table_beginnings) > len(table_endings)+1:
        raise BaseException('Could not detect equal number of beginnings and ends')

    # look for metadata before the beginnings of tables
    md_beginnings = []
    for start in table_beginnings:
        md_start = n_values.iloc[:start][n_values==0].index[-1] + 1
        md_beginnings.append(md_start)

    # make data frames
    dfs = []
    df_mds = []
    for ind in range(len(table_beginnings)):
        start = table_beginnings[ind]+1
        if ind < len(table_endings):
            stop = table_endings[ind]
        else:
            stop = entire_sheet.shape[0]
        df = xl.parse(sheet_name=sheet_name, skiprows=start, nrows=stop-start)
        dfs.append(df)

        md = xl.parse(sheet_name=sheet_name, skiprows=md_beginnings[ind], nrows=start-md_beginnings[ind]-1).dropna(axis=1)
        df_mds.append(md)
    return dfs, df_mds

In [13]:
parse_excel_sheet(files_list[0])

IndexError: index -1 is out of bounds for axis 0 with size 0

In [1]:
import pandas as pd
import os

def find_tables(sheet_df):
    tables = []
    table = None
    current_row_length = 0
    
    for index, row in sheet_df.iterrows():
        non_null_count = row.count()
        
        if non_null_count == 0:  # Skip rows with all nulls
            continue
        
        if table is None:
            current_row_length = non_null_count
            table = {'header': row.index[row.notna()].tolist(), 'data': [row.dropna().tolist()]}
        elif non_null_count == current_row_length:
            table['data'].append(row.dropna().tolist())
        else:
            if table is not None:
                tables.append(table)
                table = None
                
            if non_null_count > 0:
                current_row_length = non_null_count
                table = {'header': row.index[row.notna()].tolist(), 'data': [row.dropna().tolist()]}

    if table is not None:
        tables.append(table)
        
    return tables

def save_tables_to_csv(tables, sheet_name, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    
    for i, table in enumerate(tables):
        table_df = pd.DataFrame(table['data'], columns=table['header'])
        table_df.to_csv(f"{output_folder}/table{i+1}.csv", index=False)

def parse_excel_file(file_path):
    excel_data = pd.ExcelFile(file_path)
    
    for sheet_name in excel_data.sheet_names:
        sheet_df = pd.read_excel(file_path, sheet_name=sheet_name)
        tables = find_tables(sheet_df)
        save_tables_to_csv(tables, sheet_name, output_folder=sheet_name)

In [4]:
# Example usage
parse_excel_file(test_filepath)

In [12]:
ls

Excel_Parser.ipynb          [1m[36mSprint Team - Option 2[m[m/
[1m[36mOLM Summary[m[m/                [1m[36mSprint Team - Option 3[m[m/
README.md                   [1m[36mSprint Team - Option 4[m[m/
[1m[36mSprint Team - Base[m[m/         excel_parser.py
[1m[36mSprint Team - FAR 52.217-8[m[m/ [1m[36mfiles[m[m/
[1m[36mSprint Team - Option 1[m[m/     old_main.py


In [61]:
import pandas as pd

def get_row_lengths(sheet_df):
    row_lengths = []
    for index, row in sheet_df.iterrows():
        non_null_count = row.count()
        row_lengths.append((index, non_null_count))
    return row_lengths

def parse_excel_file(file_path):
    excel_data = pd.ExcelFile(file_path)
    sheet_row_lengths = {}
    
    for sheet_name in excel_data.sheet_names:
        sheet_df = pd.read_excel(file_path, sheet_name=sheet_name)
        row_lengths = get_row_lengths(sheet_df)
        sheet_row_lengths[sheet_name] = row_lengths
    
    return sheet_row_lengths

# Example usage
sheet_row_lengths = parse_excel_file(test_filepath)

for sheet_name, row_lengths in sheet_row_lengths.items():
    print(f"Sheet: {sheet_name}")
    for row_number, length in row_lengths:
        print(f"Row {row_number}: {length} non-null values")


Sheet: OLM Summary
Row 0: 1 non-null values
Row 1: 5 non-null values
Row 2: 1 non-null values
Row 3: 1 non-null values
Row 4: 0 non-null values
Row 5: 0 non-null values
Row 6: 0 non-null values
Row 7: 1 non-null values
Row 8: 3 non-null values
Row 9: 0 non-null values
Row 10: 1 non-null values
Row 11: 5 non-null values
Row 12: 1 non-null values
Row 13: 1 non-null values
Row 14: 0 non-null values
Row 15: 0 non-null values
Row 16: 0 non-null values
Row 17: 1 non-null values
Row 18: 3 non-null values
Row 19: 0 non-null values
Row 20: 1 non-null values
Row 21: 5 non-null values
Row 22: 1 non-null values
Row 23: 1 non-null values
Row 24: 0 non-null values
Row 25: 0 non-null values
Row 26: 0 non-null values
Row 27: 1 non-null values
Row 28: 3 non-null values
Row 29: 0 non-null values
Row 30: 1 non-null values
Row 31: 5 non-null values
Row 32: 1 non-null values
Row 33: 1 non-null values
Row 34: 0 non-null values
Row 35: 0 non-null values
Row 36: 0 non-null values
Row 37: 1 non-null values
Row

In [63]:
import pandas as pd
import os

def find_tables(sheet_df):
    tables = []
    table = None
    current_row_length = 0
    previous_row_length = 0
    
    for index, row in sheet_df.iterrows():
        non_null_count = row.count()
        
        if non_null_count == 0:  # Skip rows with all nulls
            continue
        
        if table is None:
            current_row_length = non_null_count
            table = {'start_row': index, 'header': row.index[row.notna()].tolist(), 'data': [row.dropna().tolist()]}
        elif non_null_count == current_row_length:
            table['data'].append(row.dropna().tolist())
        else:
            if table is not None and len(table['data']) > 1 and current_row_length == previous_row_length:
                tables.append(table)
            table = None

            previous_row_length = current_row_length
            current_row_length = non_null_count
            table = {'start_row': index, 'header': row.index[row.notna()].tolist(), 'data': [row.dropna().tolist()]}
    
    if table is not None and len(table['data']) > 1 and current_row_length == previous_row_length:
        tables.append(table)
        
    return tables

def save_tables_to_csv(tables, sheet_name, output_folder):
    os.makedirs(output_folder, exist_ok=True)
    
    for i, table in enumerate(tables):
        table_df = pd.DataFrame(table['data'], columns=table['header'])
        table_df.to_csv(f"{output_folder}/table{i+1}.csv", index=False)

def parse_excel_file(file_path):
    excel_data = pd.ExcelFile(file_path)
    
    for sheet_name in excel_data.sheet_names:
        sheet_df = pd.read_excel(file_path, sheet_name=sheet_name)
        tables = find_tables(sheet_df)
        save_tables_to_csv(tables, sheet_name, output_folder=sheet_name)

# Example usage
parse_excel_file(test_filepath)
