# Polars

In [42]:
import polars as pl
import numpy as np

In [43]:
# Load the raw log export; parse_logline_polars reads its 'logline' column.
logs_balancing_cell_voltages = pl.read_csv('log_balancing_cell_voltages copy.csv')

In [44]:
def _spread_parts(frame, source, parts, prefix):
    """Turn a per-row list of string parts into one numbered column per part.

    Args:
        frame: DataFrame that already carries an 'id' row-number column and
            the `source` column.
        source: Name of the column the parts are derived from; it is kept as
            a pivot index next to 'id'.
        parts: Polars expression yielding a list of strings for every row.
        prefix: Text put in front of the zero-padded part position to build
            the new column names (prefix + '00', prefix + '01', ...).

    Returns:
        DataFrame with the columns 'id', `source` and one column per part
        position; any other column of `frame` is dropped by the pivot.
    """
    return (
        frame
        .with_columns(parts.alias('split_str'))
        .explode('split_str')
        .with_columns(
            # Number the parts 00, 01, ... separately inside every source row.
            (prefix + pl.arange(0, pl.count()).cast(pl.Utf8).str.zfill(2))
            .over('id')
            .alias('col_nm')
        )
        .pivot(
            index=['id', source],
            values='split_str',
            columns='col_nm',
        )
    )


def _matching_positions(matrix, targets):
    """List, per row of `matrix`, the column positions equal to that row's target.

    Args:
        matrix: 2-D numpy array (rows x columns).
        targets: 1-D numpy array holding one target value per row.

    Returns:
        A list with one entry per row: the list of matching column positions.
        NOTE(review): a row without any match (e.g. only NaN values) keeps the
        placeholder 0 instead of a list, exactly as before this refactor.
    """
    positions = [0] * len(targets)
    for row, col in np.argwhere(matrix == targets.reshape(-1, 1)):
        if positions[row] == 0:
            positions[row] = [col]
        else:
            positions[row].append(col)
    return positions


def parse_logline_polars(df):
    """Expand the 'logline' text column of `df` into structured columns.

    Every logline is split on ' SoC: '. The part after it is cut into SoC,
    Current, lower/upper current limit, a temperature range and Voltage. The
    part before it is split on ': ' and its second piece is parsed twice:
    once for all digit runs (per-cell values) and once for the bracketed
    'X' / ' ' markers (per-cell flags, 'X' -> 1 and ' ' -> 0).

    Args:
        df: polars DataFrame with a string column named 'logline'.

    Returns:
        `df` without 'logline', horizontally concatenated with:
          * SoC, Current, lower_current_limit, upper_current_limit, Voltage
            (left as strings),
          * lower_temp, upper_temp (left as strings),
          * s1..sN (Float32 cell values), min_voltage, max_voltage,
            min_voltage_idx, max_voltage_idx (lists of 0-based positions),
          * s1_B..sN_B (Float32 cell flags),
        where N is the number of cells found in the loglines.
    """
    halves = _spread_parts(
        df.with_row_count('id'),
        'logline',
        pl.col('logline').str.split(' SoC: '),
        'string_',
    ).with_columns(
        pl.col('^string_.*$').fill_null("")
    )
    voltage_text = halves.select(pl.col('string_00').alias('Cell_Voltage'))
    metric_text = halves.select(pl.col('string_01').alias('Metrics'))

    # Replace every unit/label separator by a comma so one split yields all
    # metric fields; str.replace only touches the first occurrence.
    metric_parts = (
        pl.col('Metrics')
        .str.replace('% Current: ', ',')
        .str.replace('A Limit: ', ',')
        .str.replace('A/', ',')
        .str.replace('A Temp: ', ',')
        .str.replace('C Voltage: ', ',')
        .str.replace('V', '')
        .str.split(',')
    )
    metrics = _spread_parts(
        metric_text.with_row_count('id'),
        'Metrics',
        metric_parts,
        'Cell_Voltage_',
    )

    # Field 04 is the temperature range "<lower>-<upper>"; `metrics` already
    # has its 'id' column from the pivot above.
    # NOTE(review): a negative temperature would add an extra '-' and yield
    # more than two parts - confirm this cannot occur in the logs.
    temps = _spread_parts(
        metrics,
        'Cell_Voltage_04',
        pl.col('Cell_Voltage_04').str.split('-'),
        'temp_',
    )

    metrics = metrics.drop(['id', 'Metrics', 'Cell_Voltage_04'])
    temps = temps.drop(['id', 'Cell_Voltage_04'])
    metrics.columns = ['SoC', 'Current', 'lower_current_limit', 'upper_current_limit', 'Voltage']
    temps.columns = ['lower_temp', 'upper_temp']

    # Keep only the text after the first ': ' of the leading half.
    cell_text = _spread_parts(
        voltage_text.with_row_count('id'),
        'Cell_Voltage',
        pl.col('Cell_Voltage').str.split(': '),
        'Cell_Voltage_',
    ).select('Cell_Voltage_01')

    # Per-cell values: every run of digits becomes one Float32 column.
    cell_values = (
        _spread_parts(
            cell_text.with_row_count('id'),
            'Cell_Voltage_01',
            pl.col('Cell_Voltage_01').str.extract_all(r'[0-9]+'),
            'Cell_Voltage_a_',
        )
        .drop(['id', 'Cell_Voltage_01'])
        .select(pl.all().cast(pl.Float32))
    )
    n_cells = cell_values.width

    max_voltage = cell_values.max(axis=1)
    min_voltage = cell_values.min(axis=1)

    cell_matrix = cell_values.to_numpy()
    min_voltage_idx = _matching_positions(cell_matrix, min_voltage.to_numpy())
    max_voltage_idx = _matching_positions(cell_matrix, max_voltage.to_numpy())

    # The order here must match the names assigned just below: the columns
    # are renamed by position, so min has to come before max.
    cell_values = cell_values.with_columns(
        [
            pl.Series('min_voltage', min_voltage),
            pl.Series('max_voltage', max_voltage),
            pl.Series('min_voltage_idx', min_voltage_idx),
            pl.Series('max_voltage_idx', max_voltage_idx),
        ]
    )
    cell_values.columns = (
        ['s' + str(i) for i in range(1, n_cells + 1)]
        + ['min_voltage', 'max_voltage', 'min_voltage_idx', 'max_voltage_idx']
    )

    # Per-cell flags: drop the numeric "[1234]" groups, keep the one-character
    # groups and map 'X' -> 1, ' ' -> 0.
    flag_parts = (
        pl.col('Cell_Voltage_01')
        .str.replace_all(r'\[[0-9]+\]', '')
        .str.replace_all(r'\] \[', ',')
        .str.slice(1)
        .str.replace_all(']', '')
        .str.replace_all('X', '1')
        .str.replace_all(' ', '0')
        .str.split(',')
    )
    cell_flags = (
        _spread_parts(
            cell_text.with_row_count('id'),
            'Cell_Voltage_01',
            flag_parts,
            'Cell_Voltage_a_',
        )
        .drop(['id', 'Cell_Voltage_01'])
        .select(pl.all().cast(pl.Float32))
    )
    cell_flags.columns = ['s' + str(i) + '_B' for i in range(1, cell_flags.width + 1)]

    return pl.concat(
        [df.drop(['logline']), metrics, temps, cell_values, cell_flags],
        how='horizontal',
    )
    

In [45]:
import time

# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
# Rebinds the raw frame to the parsed result (the 'logline' column is dropped).
logs_balancing_cell_voltages = parse_logline_polars(logs_balancing_cell_voltages)
print(f'Data Successfully Parsed in {time.perf_counter() - start}s')

Data Successfully Parsed in 24.01250171661377s
