In [None]:
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.types import StructType, StructField, FloatType, ArrayType, IntegerType
from pyspark.sql.functions import col, split, regexp_replace, substring_index, cast, lit, greatest, least, coalesce, monotonically_increasing_id

In [None]:
import logging

# Every run appends timestamped INFO-level (and above) messages to output.log
LOG_FORMAT = "%(asctime)s - %(message)s"
logging.basicConfig(
    filename='output.log',
    filemode='a',
    format=LOG_FORMAT,
    level=logging.INFO,
)

# Named logger shared by the rest of the notebook
logger = logging.getLogger('Cell Voltages')

# Initializing Spark Session

In [None]:
# Reuse an already-running session if there is one, otherwise start a new one
spark = (
    SparkSession.builder
    .appName('Log Balancing Cell Voltages')
    .getOrCreate()
)

# Loading Data

In [None]:
# Keep the path in one place so the log message always names the file that was
# actually read (the previous message named a different file).
input_csv_path = 'log_balancing_cell_voltages copy.csv'

# The first CSV line is treated as the header; no schema is given, so every
# column is read as a string.
logs_balancing_cell_voltages = spark.read.option('header', 'true').csv(input_csv_path)

# count() is a Spark action: it scans the file once just to report the row count
logger.info(f"Loaded {logs_balancing_cell_voltages.count()} rows from {input_csv_path}")

# Logline Parsing

In [None]:
def _cell_voltage_tokens(cell_section):
    """Return an array column holding one voltage string per cell."""
    # Replace every 'X' with a space so a set flag "[X]" reads "[ ]" like an unset one
    unified_flags = regexp_replace(cell_section, r'X', ' ')
    # Collapse each "] [ ][" separator between two cells into a single space
    joined = regexp_replace(unified_flags, r'\] \[ \]\[', ' ')
    # Drop the "[ ][" that is left in front of the first cell
    without_prefix = regexp_replace(joined, r'\[ \]\[', '')
    # Keep only what precedes the first remaining "]"
    numbers_only = substring_index(without_prefix, ']', 1)
    return split(numbers_only, ' ')


def _balancing_tokens(cell_section):
    """Return an array column holding one '1'/'0' balancing flag string per cell."""
    # Remove the bracketed integer readings, leaving only the flag brackets
    flags_only = regexp_replace(cell_section, r'\[[0-9]+\]', '')
    # "[X]" -> "1", "[ ]" -> "0"
    with_ones = regexp_replace(flags_only, r'\[X\]', '1')
    with_zeros = regexp_replace(with_ones, r'\[ \]', '0')
    return split(with_zeros, ' ')


def _metric_tokens(metrics_section):
    """Return an array column with the metric values of the text after ' SoC: '."""
    normalized = metrics_section
    # Replace each unit/label separator with a comma, in the order they are applied
    for separator in ('% Current: ', 'A Limit: ', 'A/', 'A Temp: ', 'C Voltage: '):
        normalized = regexp_replace(normalized, separator, ',')
    # Strip every remaining 'V' (the trailing voltage unit)
    normalized = regexp_replace(normalized, 'V', '')
    return split(normalized, ',')


def parse_logline(logs_balancing_cell_voltages, num_cells=14):
    """Expand the raw ``logline`` column into typed per-cell and metric columns.

    Everything is computed as column expressions on the input DataFrame in a
    single ``select``. The parsed values therefore always belong to the row
    they were parsed from, and nothing is collected to the driver. (Re-joining
    separately derived DataFrames on ``monotonically_increasing_id()`` is not
    safe, because the generated ids depend on partitioning.)

    NOTE(review): judging by the patterns used below, a logline presumably
    looks like
    ``<prefix>: [ ][v1] [X][v2] ... SoC: <soc>% Current: <i>A Limit: <lo>A/<hi>A Temp: <t1>-<t2>C Voltage: <v>V``
    -- confirm against the real data. A negative temperature would be split
    incorrectly because the temperature range is split on '-'.

    Parameters
    ----------
    logs_balancing_cell_voltages : pyspark.sql.DataFrame
        Input frame; must contain a string column named ``logline``.
    num_cells : int, optional
        Number of cells to extract from each logline (default 14). Must be at
        least 2.

    Returns
    -------
    pyspark.sql.DataFrame
        The input columns without ``logline``, followed by:

        * ``s1`` .. ``s<num_cells>`` (double): voltage of each cell.
        * ``max_voltage_index`` / ``min_voltage_index`` (array<int>): 0-based
          cell positions holding the row's highest / lowest voltage, where
          position ``i`` corresponds to column ``s<i+1>``; all ties are listed.
        * ``max_voltage`` / ``min_voltage`` (float): the row's highest /
          lowest cell voltage.
        * ``s1_B`` .. ``s<num_cells>_B`` (int): 1 where the cell's flag is
          ``[X]``, 0 where it is ``[ ]``.
        * ``Soc``, ``Current``, ``lower_current_limit``,
          ``upper_current_limit``, ``lower_temp``, ``upper_temp``, ``Voltage``
          (double).

        Values that cannot be parsed become null instead of raising.

    Raises
    ------
    ValueError
        If ``num_cells`` is smaller than 2.
    """
    # Imported locally because the notebook's import cell does not provide them
    from pyspark.sql.functions import array, array_remove, when

    if num_cells < 2:
        raise ValueError('num_cells must be at least 2')

    # Text before ' SoC: ' carries the cell flags and voltages, text after it the metrics
    halves = split(col('logline'), ' SoC: ')
    cell_section = split(halves[0], ': ')[1]
    metrics_section = halves[1]

    # --- Cell voltages -----------------------------------------------------
    voltage_tokens = _cell_voltage_tokens(cell_section)
    cell_voltages = [voltage_tokens[i].cast('double') for i in range(num_cells)]
    voltage_columns = [
        voltage.alias(f's{i + 1}') for i, voltage in enumerate(cell_voltages)
    ]
    logger.info('Extracted Cell Voltages')

    # --- Row-wise max / min and the cells where they occur ------------------
    # greatest/least work across the cell columns of one row and skip nulls
    max_voltage = greatest(*cell_voltages)
    min_voltage = least(*cell_voltages)

    def matching_indices(target):
        # Mark non-matching (or null) cells with -1, then strip the markers so
        # only the positions equal to the target remain.
        marked = [
            when(voltage == target, lit(i)).otherwise(lit(-1))
            for i, voltage in enumerate(cell_voltages)
        ]
        return array_remove(array(*marked), -1)

    extreme_columns = [
        matching_indices(max_voltage).alias('max_voltage_index'),
        matching_indices(min_voltage).alias('min_voltage_index'),
        # Cast to float to keep the column type of previously written output
        max_voltage.cast('float').alias('max_voltage'),
        min_voltage.cast('float').alias('min_voltage'),
    ]
    logger.info('Extracted max, min Voltages and respective indices')

    # --- Balancing flags ----------------------------------------------------
    balancing_tokens = _balancing_tokens(cell_section)
    balancing_columns = [
        balancing_tokens[i].cast('int').alias(f's{i + 1}_B') for i in range(num_cells)
    ]
    logger.info('Extracted Balancing Info')

    # --- Metrics ------------------------------------------------------------
    # Token order follows the separator order in the logline:
    # SoC, Current, lower limit, upper limit, temperature range, Voltage
    metric_tokens = _metric_tokens(metrics_section)
    temperature_range = split(metric_tokens[4], '-')
    metric_columns = [
        metric_tokens[0].cast('double').alias('Soc'),
        metric_tokens[1].cast('double').alias('Current'),
        metric_tokens[2].cast('double').alias('lower_current_limit'),
        metric_tokens[3].cast('double').alias('upper_current_limit'),
        temperature_range[0].cast('double').alias('lower_temp'),
        temperature_range[1].cast('double').alias('upper_temp'),
        metric_tokens[5].cast('double').alias('Voltage'),
    ]
    logger.info('Extracted Metrics (e.g. SoC, Current, etc)')

    # Append every parsed column to its own source row, then drop the raw text
    parsed = logs_balancing_cell_voltages.select(
        '*',
        *voltage_columns,
        *extreme_columns,
        *balancing_columns,
        *metric_columns,
    ).drop('logline')
    logger.info('Combined all parsed columns')

    return parsed


In [None]:
import time

# NOTE: this cell rebinds logs_balancing_cell_voltages to the parsed frame, which
# no longer has the 'logline' column; re-run the loading cell before re-running
# this one.
# Spark evaluates lazily, so the figure below only covers the work that
# parse_logline itself triggers.
start = time.perf_counter()  # monotonic clock, unaffected by system clock changes
logs_balancing_cell_voltages = parse_logline(logs_balancing_cell_voltages)

# Measure once so the log file and stdout report the same duration
elapsed_seconds = time.perf_counter() - start
parse_message = f'Data Successfully Parsed in {elapsed_seconds}s'
logger.info(parse_message)
print(parse_message)

# Save Processed Data

In [None]:
delta_table_path = 'processed_voltage_data.parquet'

# Probe for an existing dataset. Only the probe sits inside the try block, so an
# AnalysisException raised by a write is never mistaken for "path does not exist".
try:
    spark.read.parquet(delta_table_path)
except AnalysisException:
    # Nothing readable at the path yet: save the new data to a new parquet dataset
    logger.info(f'{delta_table_path} does not exist. Creating.')
    # NOTE(review): "primaryKeyFields" does not look like an option the parquet
    # writer acts on -- kept as-is, confirm whether it is needed.
    (
        logs_balancing_cell_voltages.write
        .format("parquet")
        .option("primaryKeyFields", 'mainControllerSerialDeviceId')
        .save(delta_table_path)
    )
else:
    # The dataset already exists: append the new rows to it
    logs_balancing_cell_voltages.write.mode('append').parquet(delta_table_path)
    logger.info(f'{delta_table_path} already exists. Appending the new data into the file')

In [None]:
# Final log entry; the trailing newline leaves a blank line between runs in output.log
logger.info(
    "Data Successfully saved as processed_voltage_data.parquet \n"
)

In [None]:
# Tell the notebook user where to find the run's log messages
print(
    'Execution Successful! Logs are saved in output.log'
)