In [23]:
import os
import pandas as pd

def one_hot_to_integer(one_hot_sequence):
    """Return the zero-based position of the first element equal to 1.

    Args:
        one_hot_sequence: Iterable of values, scanned from the start.

    Returns:
        The index of the first element that compares equal to 1, or None
        when no element does (including when the iterable is empty).
    """
    hot_positions = (
        position
        for position, flag in enumerate(one_hot_sequence)
        if flag == 1
    )
    # next() stops at the first match; the default covers "no 1 present"
    return next(hot_positions, None)

def f3_split(df):
    """Spread seven consecutive cells of one column into separately named entries.

    Args:
        df: DataFrame indexed by position; it must have at least 26 rows
            and 4 columns.

    Returns:
        A dict with keys 'F3_a' through 'F3_g' whose values are the cells at
        zero-based rows 19..25 of zero-based column 3, in that order.
    """
    first_row = 19  # zero-based position of the first of the seven cells
    value_col = 3   # zero-based position of the column holding the values
    return {
        f'F3_{suffix}': df.iloc[first_row + offset, value_col]
        for offset, suffix in enumerate('abcdefg')
    }

def extract_id_from_of_sheet(excel_file):
    """Return the ID stored in cell D4 of the workbook's 'OF' sheet.

    Args:
        excel_file: The workbook to read; passed unchanged to ``pd.read_excel``.

    Returns:
        The content of cell D4, converted with ``pd.to_numeric`` when the
        conversion succeeds, otherwise the cell value exactly as read.
    """
    # header=None keeps the first spreadsheet row as data, so positional
    # indices line up with spreadsheet cell addresses.
    df = pd.read_excel(excel_file, sheet_name='OF', header=None)
    # Cell D4 -> zero-based row 3, zero-based column 3
    id_value = df.iat[3, 3]
    try:
        return pd.to_numeric(id_value)
    except (ValueError, TypeError):
        # ValueError: text that does not parse as a number.
        # TypeError: a non-numeric object (for example a date-typed cell).
        # In both cases the ID is not numeric, so leave it as is.
        return id_value

def _cell_to_indices(cell_ref):
    """Translate an A1-style cell reference (e.g. 'D109') into zero-based (row, column)."""
    ref = cell_ref.strip().upper()
    split_at = 0
    while split_at < len(ref) and 'A' <= ref[split_at] <= 'Z':
        split_at += 1
    letters, digits = ref[:split_at], ref[split_at:]
    if not letters or not digits.isdigit():
        raise ValueError(f"Invalid cell reference: {cell_ref!r}")
    # Column letters form a base-26 number without a zero digit: A=1 ... Z=26, AA=27
    col_number = 0
    for letter in letters:
        col_number = col_number * 26 + (ord(letter) - ord('A') + 1)
    return int(digits) - 1, col_number - 1


def _read_cell_ranges(df, data_cells):
    """Return the flattened values of comma-separated ranges such as 'D7:D8,D10:D11'."""
    values = []
    for range_str in data_cells.split(','):
        start_ref, _, end_ref = range_str.partition(':')
        start_row, start_col = _cell_to_indices(start_ref)
        # An item without ':' is a single cell, i.e. a range that ends where it starts
        end_row, end_col = _cell_to_indices(end_ref or start_ref)
        block = df.iloc[start_row:end_row + 1, start_col:end_col + 1]
        values.extend(block.values.flatten())
    return values


def read_and_save_excel_data(input_files, output_csv):
    """Extract selected cells from each workbook and write one CSV row per workbook.

    For every file, the ID is read with ``extract_id_from_of_sheet`` and the
    sheets 'OF' and 'F' are read without a header row. Each extraction rule
    is a tuple ``(name cell, data cells, processing method)``:

    * ``'binary'``   - the values of the data range(s) are concatenated and
      passed to ``one_hot_to_integer``; the result is stored under the text
      found in the name cell.
    * ``'float'``    - the single data cell is stored as-is under the text
      found in the name cell.
    * ``'f3_split'`` - the entries returned by ``f3_split`` are merged into
      the row.

    A rule whose name cell is empty (NaN) is skipped with a printed warning,
    as is a rule with an unknown processing method.

    Args:
        input_files: Iterable of workbooks to process; each item is passed
            to ``pd.read_excel``.
        output_csv: Destination of the CSV file; passed to ``DataFrame.to_csv``.

    Returns:
        None. The result is written to ``output_csv`` with the 'id' column
        first. If ``input_files`` yields nothing, a CSV containing only the
        'id' header is written.

    Raises:
        PermissionError: If ``output_csv`` cannot be opened for writing.
    """
    # sheet name -> list of (name cell, data cells, processing method)
    data_to_extract = {
        'OF': [('A5', 'D6:D9', 'binary'), ('A10', 'D11:D16', 'binary'), ('A109', 'D109', 'float')],
        'F': [('A5', 'D7:D8,D10:D11', 'binary'), ('A19', 'D20:D26', 'f3_split')]
    }

    # One single-row DataFrame per input file
    df_list = []

    for excel_file in input_files:
        extracted_data = {'id': extract_id_from_of_sheet(excel_file)}

        for sheet, cell_specs in data_to_extract.items():
            df = pd.read_excel(excel_file, sheet_name=sheet, header=None)

            for col_name_cell, data_cells, processing_method in cell_specs:
                name_row, name_col = _cell_to_indices(col_name_cell)
                col_name = df.iat[name_row, name_col]

                if pd.isna(col_name):
                    print(f"Warning: Column name at {col_name_cell} is NaN.")
                    continue

                if processing_method == 'binary':
                    extracted_data[col_name] = one_hot_to_integer(
                        _read_cell_ranges(df, data_cells)
                    )
                elif processing_method == 'float':
                    data_row, data_col = _cell_to_indices(data_cells)
                    extracted_data[col_name] = df.iat[data_row, data_col]
                elif processing_method == 'f3_split':
                    extracted_data.update(f3_split(df))
                else:
                    print(f"Warning: Unknown processing method '{processing_method}' for column '{col_name}'. Skipping.")

        df_list.append(pd.DataFrame([extracted_data]))

    if df_list:
        extracted_df = pd.concat(df_list, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; emit a header-only CSV instead
        print("Warning: No input files were provided; writing a CSV with only the 'id' header.")
        extracted_df = pd.DataFrame(columns=['id'])

    # Reorder columns so 'id' column is first
    extracted_df = extracted_df[['id'] + [col for col in extracted_df.columns if col != 'id']]

    extracted_df.to_csv(output_csv, index=False)

# Specify the input folder containing Excel files and output CSV file
input_folder = 'tmp'
output_csv_file = 'output.csv'

# Get a list of all Excel files in the input folder.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting keeps
#   the row order of the CSV reproducible between runs.
# - '~$' prefix: Excel places a lock file named '~$<workbook>.xlsx' next to a
#   workbook that is currently open; it is not a readable workbook, so skip it.
input_files = [
    os.path.join(input_folder, file)
    for file in sorted(os.listdir(input_folder))
    if file.endswith('.xlsx') and not file.startswith('~$')
]

# Read data from Excel files, encode, and save to CSV
read_and_save_excel_data(input_files, output_csv_file)


PermissionError: [Errno 13] Permission denied: 'output.csv'