In [1]:
from helpers import *
# Build an extractor object for the CSV file at this hard-coded local path.
# CSVDataExtractor is not defined in this file; presumably it is provided by
# the star import from `helpers` -- TODO confirm.
# NOTE(review): the name `csv` would shadow the standard-library `csv` module
# if that module were ever imported in this namespace.
csv = CSVDataExtractor(r"C:\Users\MAB\Downloads\PWSAutomate\uploads\EFS518368.csv")


In [2]:
import pandas as pd

def read_csv_data(file_path):
    """
    Loads a CSV file into a dataframe without treating any row as a header.

    Parameters:
    file_path (str): Location of the CSV file (forwarded to pandas.read_csv).

    Returns:
    pd.DataFrame: Every parsed row of the file, with integer column labels.
    """
    # header=None keeps the first line as data instead of column names.
    raw_frame = pd.read_csv(file_path, header=None)
    return raw_frame

def find_start_row(data, start_text):
    """
    Locates the first row in which any cell contains the given text.

    Cells are converted to strings and matched case-insensitively as a plain
    substring (not as a regular expression).

    Parameters:
    data (pd.DataFrame): The dataframe to search.
    start_text (str): The text to look for.

    Returns:
    The smallest index label among the matching rows. When no row matches,
    the minimum of an empty index is returned, which callers test with
    pd.isna().
    """
    def row_mentions_text(row):
        cells_as_text = row.astype(str)
        found = cells_as_text.str.contains(start_text, case=False, na=False, regex=False)
        return found.any()

    row_matches = data.apply(row_mentions_text, axis=1)
    matching_rows = data[row_matches]
    return matching_rows.index.min()

def is_header_row(row, header_keywords):
    """
    Checks if a row contains every one of the header keywords.

    A keyword counts as present only when some cell is equal to it; a cell
    that merely contains the keyword as a substring does not match.

    Parameters:
    row (pd.Series): The row to check.
    header_keywords (list): The list of header keywords.

    Returns:
    bool: True if every keyword equals at least one cell of the row (also
    True when header_keywords is empty), False otherwise.
    """
    # Convert the row once; the original rebuilt this list for every keyword.
    cells = row.tolist()
    return all(keyword in cells for keyword in header_keywords)

def find_header_indices(data, header_keywords):
    """
    Collects the index labels of every row that qualifies as a header row.

    Each row is tested with is_header_row against the supplied keywords.

    Parameters:
    data (pd.DataFrame): The dataframe to scan.
    header_keywords (list): Keywords that must all be present in a header row.

    Returns:
    list: Index labels of the header rows, in dataframe order.
    """
    header_mask = data.apply(is_header_row, axis=1, header_keywords=header_keywords)
    header_rows = data[header_mask]
    return header_rows.index.tolist()

def extract_main_details(table):
    """
    Builds the main-details mapping from the first row of the given table.

    Only cells that are not null are kept, keyed by their column label. The
    last surviving entry is then re-keyed as "ANALYSIS NUMBER".

    Parameters:
    table (pd.DataFrame): The table whose first row holds the main details.

    Returns:
    dict: Column label -> value for the non-null cells of the first row, with
    the final entry stored under "ANALYSIS NUMBER".
    """
    first_row = table.iloc[0].to_dict()

    main_details = {}
    for column, value in first_row.items():
        if pd.notna(value):
            main_details[column] = value

    # Whatever non-null field comes last is stored under a fixed key instead.
    trailing_key = list(main_details)[-1]
    main_details["ANALYSIS NUMBER"] = main_details.pop(trailing_key)
    return main_details

def extract_sub_table(table, sub_table_start_text):
    """
    Extracts the sub-table that follows the row containing the given text.

    The marker row is the first row in which any cell, converted to a string,
    contains sub_table_start_text (case-insensitive, plain substring). The
    rows after the marker are sliced out; the first of them becomes the
    column header and the remainder is returned as data.

    Parameters:
    table (pd.DataFrame): The dataframe containing the table data.
    sub_table_start_text (str): The text indicating the start of the sub-table.

    Returns:
    pd.DataFrame: The sub-table, or an empty dataframe when the marker text is
    not found or no rows remain after it.
    """
    def row_contains_marker(row):
        return row.astype(str).str.contains(
            sub_table_start_text, case=False, na=False, regex=False
        ).any()

    sub_table_start_row = table[table.apply(row_contains_marker, axis=1)].index.min()

    if pd.isna(sub_table_start_row):
        return pd.DataFrame()  # Marker text not found

    # NOTE(review): sub_table_start_row is an index *label* but is used below
    # as a *positional* offset. The two agree only when `table` has a default
    # 0..n-1 index; confirm the "+ 2" offset against callers that pass a
    # table whose index has been sliced or had rows dropped.
    sub_table = table.iloc[sub_table_start_row + 2:].reset_index(drop=True)
    # The debug call display(sub_table) was removed here: `display` exists
    # only as an IPython global, so it raised NameError outside a notebook.
    if sub_table.empty:
        return pd.DataFrame()  # Nothing follows the marker

    # Set the first row as header and remove it from the data
    sub_table.columns = sub_table.iloc[0]
    sub_table = sub_table[1:]

    return sub_table

def extract_tables(file_path):
    """
    Extracts and formats the tables from the CSV file.

    The file is searched for the "HS CODE WISE CONSUMPTION" row, then re-read
    from the line after it. Every row containing all header keywords starts a
    new table that runs until the next such row (or the end of the data).

    Parameters:
    file_path (str): The path to the CSV file.

    Returns:
    dict: Maps each table's 'HS CODE' value to a dictionary with two keys:
    "main_details" (dict from extract_main_details) and "sub_table" (list of
    row records from extract_sub_table, or an empty list). If two tables
    share an HS CODE, the later one replaces the earlier one.

    Raises:
    ValueError: If the start text or the header row is not found in the file.
    """
    # Define the header keywords and marker texts
    header_keywords = ["HS CODE", "DESCRIPTION", "PIECES", "NET NET WEIGHT"]
    start_text = "HS CODE WISE CONSUMPTION"
    sub_table_start_text = "CONSUMPTION OF RAW MATERIAL IMPORTED UNDER SRO 957 (I) 2021 DATED. 30-JULY-2021 (EFS-EXPORT FACILITATION SCHEME)"

    # Read the CSV data and find the start row
    df_full = read_csv_data(file_path)
    start_row = find_start_row(df_full, start_text)
    if pd.isna(start_row):
        # ValueError is still caught by existing `except Exception` handlers.
        raise ValueError("Start header 'HS CODE WISE CONSUMPTION' not found in the file")

    # Read the data starting after the "HS CODE WISE CONSUMPTION" row
    data = pd.read_csv(file_path, skiprows=int(start_row) + 1, header=None)

    # Find all the rows where the header appears
    header_indices = find_header_indices(data, header_keywords)
    if not header_indices:
        raise ValueError("Header row not found in the file")

    # Each table ends where the next header begins; the last one ends at EOF.
    section_ends = header_indices[1:] + [len(data)]

    tables = {}
    for start_index, end_index in zip(header_indices, section_ends):
        # Extract the table
        table = data.iloc[start_index:end_index].reset_index(drop=True)

        # Set the first row as header
        table.columns = table.iloc[0]
        table = table[1:]

        # Remove rows where all elements are NaN
        table = table.dropna(how='all')

        # Extract the main details from the first remaining row
        main_details = extract_main_details(table)

        # Extract the sub-table
        sub_table = extract_sub_table(table, sub_table_start_text)
        sub_table_dict = sub_table.to_dict(orient='records') if not sub_table.empty else []

        # Add the formatted table to the tables dictionary, keyed by HS CODE
        tables[main_details['HS CODE']] = {
            "main_details": main_details,
            "sub_table": sub_table_dict
        }

    return tables

# Path to the CSV file (hard-coded local Windows path; raw string so the
# backslashes are taken literally).
csv_file_path = r"C:\Users\MAB\Downloads\PWSAutomate\uploads\EFS518368.csv"

# Extract and format the tables. This runs as soon as the cell/module is
# executed and raises if the expected start text or header row is missing.
formatted_tables = extract_tables(csv_file_path)
