In [None]:
import pandas as pd
import numpy as np
import time 

In [None]:
import logging
# Append timestamped INFO-level messages to output.log; the named logger
# below is what the rest of the notebook writes through.
LOG_FORMAT = "%(asctime)s - %(message)s"
logging.basicConfig(
    filename='output.log',
    filemode='a',
    format=LOG_FORMAT,
    level=logging.INFO,
)
logger = logging.getLogger('Cell Voltages')

# Loading Data

In [None]:
# Keep the path in a single variable so the log message always names the
# file that was actually read (the message previously named a different file).
cell_voltages_csv = 'log_balancing_cell_voltages copy.csv'
logs_balancing_cell_voltages = pd.read_csv(cell_voltages_csv)
logger.info(f"Loaded {logs_balancing_cell_voltages.shape[0]} rows from {cell_voltages_csv}")
# Preview the first rows as the cell output.
logs_balancing_cell_voltages.head(5)

# Logline Parsing

In [None]:
def parse_logline(df):
    """Split the raw ``logline`` text column into structured columns.

    The parsing steps below imply a logline shaped like
    ``<label>: [<digits>][X] [<digits>][ ] ... SoC: <n>% Current: <n>A
    Limit: <n>A/<n>A Temp: <lo>-<hi>C Voltage: <n>V``
    (NOTE(review): inferred from the string operations, confirm against real logs).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``logline``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``logline``, with the same index, plus:

        * ``Soc``, ``Current``, ``lower_current_limit``, ``upper_current_limit``,
          ``Voltage``, ``lower_temp``, ``upper_temp`` - kept as strings.
        * ``s1`` .. ``sN`` - per-cell readings as floats; N is taken from the
          data rather than fixed at 14.
        * ``min_voltage`` / ``max_voltage`` - row-wise extremes (NaN skipped).
        * ``min_voltage_idx`` / ``max_voltage_idx`` - numpy arrays holding the
          0-based column positions of every cell equal to the row's extreme
          (position 0 corresponds to ``s1``).
        * ``s1_B`` .. ``sN_B`` - booleans, True where the cell is marked ``X``.
          A missing flag is reported as False.
    """
    # Same logger object as the module-level one: loggers are singletons by name.
    log = logging.getLogger('Cell Voltages')

    logline = df['logline']

    # Text between the first ': ' and ' SoC: ' holds both readings and flags.
    cell_section = (
        logline.str.split(' SoC: ', expand=True)[0]
        .str.split(': ', expand=True)[1]
    )

    # --- Cell voltages -------------------------------------------------------
    readings = cell_section.str.findall(r'[0-9]+')
    # Reuse the caller's index; a fresh RangeIndex would misalign rows in the
    # final column-wise concat whenever df is not indexed 0..n-1.
    Cellvoltage = pd.DataFrame(readings.tolist(), index=logline.index).astype(float)
    Cellvoltage.columns = ['s' + str(i) for i in range(1, Cellvoltage.shape[1] + 1)]
    log.info('Extracted Cell Voltages')

    min_voltage = Cellvoltage.min(axis=1)
    max_voltage = Cellvoltage.max(axis=1)

    # Positions are computed against the NaN-skipping pandas extremes so that
    # rows with missing cells still report where their min/max occurs.
    cell_values = Cellvoltage.to_numpy()
    min_voltage_idx = _positions_equal_to(cell_values, min_voltage, Cellvoltage.index)
    max_voltage_idx = _positions_equal_to(cell_values, max_voltage, Cellvoltage.index)

    Cellvoltage['min_voltage'] = min_voltage
    Cellvoltage['max_voltage'] = max_voltage
    # Stored as numpy integer arrays (one array per row), not as strings.
    Cellvoltage['min_voltage_idx'] = min_voltage_idx
    Cellvoltage['max_voltage_idx'] = max_voltage_idx
    log.info('Extracted max, min Voltages and respective indices')

    # --- Balancing flags -----------------------------------------------------
    # Drop the '[digits]' groups, leaving '[X] [ ] ...', then turn that into a
    # comma-separated '1'/'0' string and split into one column per cell.
    Balancing = (
        cell_section
        .str.replace(r'\[[0-9]+\]', '', regex=True)
        .str.slice(1, -1)
        .str.replace('] [', ',', regex=False)
        .str.replace('X', '1', regex=False)
        .str.replace(' ', '0', regex=False)
        .str.split(',', expand=True)
    )
    # DataFrame.replace returns a new frame, so the earlier un-assigned calls
    # had no effect; compare directly to obtain real booleans.
    Balancing = Balancing.eq('1')
    Balancing.columns = ['s' + str(i) + '_B' for i in range(1, Balancing.shape[1] + 1)]
    log.info('Extracted Balancing Info')

    # --- Scalar metrics ------------------------------------------------------
    # Replace each unit/label separator with a comma, then split once.
    metric_split = (
        logline.str.split('SoC: ', expand=True)[1]
        .str.replace('% Current: ', ',', case=False, regex=True)
        .str.replace('A Limit: ', ',', case=False, regex=True)
        .str.replace('A/', ',', case=False, regex=True)
        .str.replace('A Temp: ', ',', case=False, regex=True)
        .str.replace('C Voltage: ', ',', case=False, regex=True)
        .str.replace('V', '', case=False, regex=True)
        .str.split(',', expand=True)
    )
    metric_split.columns = ['Soc', 'Current', 'lower_current_limit', 'upper_current_limit', 'Temp', 'Voltage']
    log.info('Extracted Metrics (e.g. SoC, Current, etc)')

    # 'Temp' holds '<lower>-<upper>'.
    # NOTE(review): a negative temperature would add extra '-' separators and
    # break the two-column assignment below; confirm the log never emits one.
    temperature_split = metric_split['Temp'].str.split('-', expand=True)
    temperature_split.columns = ['lower_temp', 'upper_temp']

    return pd.concat(
        [
            df.drop(columns=['logline']),
            metric_split.drop(columns='Temp'),
            temperature_split,
            Cellvoltage,
            Balancing,
        ],
        axis=1,
    )


def _positions_equal_to(values, targets, index):
    """Return a Series of arrays: per row, the 0-based positions equal to that row's target."""
    # Pre-sized object array so each slot holds one (possibly empty) index array.
    positions = np.empty(len(values), dtype=object)
    for row_number, (row, target) in enumerate(zip(values, targets)):
        positions[row_number] = np.flatnonzero(row == target)
    return pd.Series(positions, index=index)
    

In [None]:
import time 

# NOTE: this rebinds logs_balancing_cell_voltages to the parsed frame, which no
# longer has a 'logline' column; re-run the load cell before re-running this one.
# perf_counter is a monotonic clock meant for measuring durations, and the
# elapsed time is taken once so the log entry and the printed value agree.
start = time.perf_counter()
logs_balancing_cell_voltages = parse_logline(logs_balancing_cell_voltages.copy())
elapsed = time.perf_counter() - start
logger.info(f'Data Successfully Parsed in {elapsed}s')
print(f'Data Successfully Parsed in {elapsed}s')

# Save Processed Data

In [None]:
# Merge the freshly parsed rows with any previously saved table, then
# (re)write the whole result to the parquet file.
try:
    previous_table = pd.read_parquet('processed_voltage_data.parquet')
except FileNotFoundError:
    # No earlier file: the new data alone is written below.
    logger.info('processed_voltage_data.parquet does not exist. Creating.')
else:
    # Earlier file found: stack the new rows underneath the old ones.
    logs_balancing_cell_voltages = pd.concat(
        [previous_table, logs_balancing_cell_voltages],
        axis=0,
    )
    logger.info('processed_voltage_data.parquet already exists. Appending the new data into the file')
logs_balancing_cell_voltages.to_parquet('processed_voltage_data.parquet', index=False)

In [None]:
# Final log entry for this run. The message itself ends with a newline,
# presumably to leave a visual gap between runs in the appended log file.
logger.info('Data Successfully saved as processed_voltage_data.parquet \n')

In [None]:
# Tell the reader where the log messages written through `logger` ended up.
print("Execution Successful! Logs are saved in output.log")

# View Saved Parquet Table

In [None]:
# Read the saved file back; as the last expression of the cell, the resulting
# DataFrame is displayed as the cell output.
pd.read_parquet('processed_voltage_data.parquet')