In [None]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from google.colab import drive
import scipy.io
import os
#drive.mount('/content/drive')


In [None]:
# Basic feature extraction:
# Walk the dataset folder, load every battery .mat file and, for each charge
# cycle, summarise the voltage curve of the constant-current (CC) segment and
# the current curve of the constant-voltage (CV) segment into one feature row.
from scipy.stats import kurtosis, skew

base_folder = '/content/drive/MyDrive/Machine Learning Project (ECE 228)/Battery Dataset'
voltage_cycles = []  # one pd.Series (voltage indexed by time) per kept cycle
current_cycles = []  # one pd.Series (current indexed by time) per kept cycle
features = []        # one dict of summary features per kept cycle

# Segment thresholds (the values that used to be hard-coded inside the loop).
CC_START_VOLTAGE = 4.0  # first sample at/above this voltage starts the CC segment
CC_END_VOLTAGE = 4.2    # first sample at/above this voltage ends the CC segment
CV_START_CURRENT = 0.5  # first sample at/below this current (and V >= 4.0) starts the CV segment
CV_END_CURRENT = 0.1    # first sample at/below this current (and V >= 4.0) ends the CV segment

# np.trapezoid only exists in NumPy >= 2.0; older releases expose it as np.trapz.
integrate_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz


def first_index_where(mask):
    """Return the position of the first True entry in ``mask``, or None if there is none."""
    hits = np.flatnonzero(np.asarray(mask, dtype=bool))
    return int(hits[0]) if hits.size else None


def split_cc_cv(cycle_df):
    """Cut one charge cycle into its CC and CV segments.

    Parameters
    ----------
    cycle_df : pd.DataFrame
        Frame with 'time', 'voltage' and 'current' columns for a single cycle.

    Returns
    -------
    (cc_segment, cv_segment, skip_reason)
        Both segments are row slices of ``cycle_df`` (end row included) and
        ``skip_reason`` is None. If a boundary cannot be found the segments
        are None and ``skip_reason`` is the text used in the skip message.
    """
    voltage = cycle_df['voltage'].to_numpy()
    current = cycle_df['current'].to_numpy()

    # CC segment: from the first sample >= 4.0 V to the first sample >= 4.2 V.
    cc_start = first_index_where(voltage >= CC_START_VOLTAGE)
    if cc_start is None:
        return None, None, "missing CC start data"
    cc_end = first_index_where(voltage >= CC_END_VOLTAGE)
    if cc_end is None:
        return None, None, "missing CC end data"
    if cc_start > cc_end:
        return None, None, "invalid CC range"

    # CV segment: from the first sample with current <= 0.5 to the first with
    # current <= 0.1, only counting samples whose voltage is already >= 4.0.
    above_cc_start = voltage >= CC_START_VOLTAGE
    cv_start = first_index_where((current <= CV_START_CURRENT) & above_cc_start)
    if cv_start is None:
        return None, None, "missing CV start data"
    cv_end = first_index_where((current <= CV_END_CURRENT) & above_cc_start)
    if cv_end is None:
        return None, None, "missing CV end data"
    if cv_start > cv_end:
        return None, None, "invalid CV range"

    return (
        cycle_df.iloc[cc_start:cc_end + 1],
        cycle_df.iloc[cv_start:cv_end + 1],
        None,
    )


def end_to_end_slope(values, times):
    """Return (last - first) / elapsed time, or NaN when no time elapses.

    A one-sample segment has zero duration; returning NaN explicitly avoids
    the 0/0 division (and its RuntimeWarning) that would otherwise occur.
    """
    duration = times[-1] - times[0]
    if duration == 0:
        return np.nan
    return (values[-1] - values[0]) / duration


def normalised_entropy(values):
    """Return -sum(p * log(p)) with p = values / sum(values).

    The result is NaN if any p is negative, because log(p) is then undefined.
    """
    p = values / np.sum(values)
    return -np.sum(p * np.log(p))


def build_feature_row(cc_segment, cv_segment, full_cycle, cycle_index):
    """Summarise one charge cycle as a flat dict of features.

    CC features describe the voltage trace of ``cc_segment``; CV features
    describe the current trace of ``cv_segment``. ``capacity`` integrates the
    current of the whole cycle over time and divides by 3600 (seconds per hour).
    Key names and their order are part of the CSV schema and must not change.
    """
    cc_v = cc_segment['voltage'].values
    cc_t = cc_segment['time'].values
    cc_i = cc_segment['current'].values

    cv_i = cv_segment['current'].values
    cv_t = cv_segment['time'].values

    return {
        'charge_CC_mean_V': np.mean(cc_v),
        'charge_CC_std_V': np.std(cc_v),
        'charge_CC_kurtosis_V': kurtosis(cc_v),
        'charge_CC_skew_V': skew(cc_v),
        'charge_CC_time_V': cc_t[-1] - cc_t[0],
        'charge_CC_charge': integrate_trapezoid(cc_i, cc_t),
        'charge_CC_slope_V': end_to_end_slope(cc_v, cc_t),
        'charge_CC_entropy_V': normalised_entropy(cc_v),
        'charge_CV_mean_I': np.mean(cv_i),
        'charge_CV_std_dev_I': np.std(cv_i),
        'charge_CV_kurtosis_I': kurtosis(cv_i),
        'charge_CV_skew_I': skew(cv_i),
        'charge_CV_time_I': cv_t[-1] - cv_t[0],
        'charge_CV_charge_I': integrate_trapezoid(cv_i, cv_t),
        'charge_CV_slope_I': end_to_end_slope(cv_i, cv_t),
        'charge_CV_entropy_I': normalised_entropy(cv_i),
        'cycle_index': cycle_index,
        # the whole charge accrued is current integrated over time divided by seconds in an hour
        'capacity': integrate_trapezoid(full_cycle['current'].values, full_cycle['time'].values) / 3600,
    }


def plot_trace(times, values, title, ylabel):
    """Draw one time series, show it, then close the figure to free its memory."""
    fig, ax = plt.subplots()
    ax.plot(times, values)
    ax.set_title(title)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # this loop creates three figures per cycle; do not let them pile up


for root, dirs, files in os.walk(base_folder):
    for filename in sorted(files):
        if not filename.endswith('.mat'):
            continue

        filepath = os.path.join(root, filename)
        print(f"Processing {filename}...")
        mat_data = scipy.io.loadmat(filepath, squeeze_me=True, struct_as_record=False)

        # The battery struct is looked up under the file's base name.
        var_name = os.path.splitext(filename)[0]
        battery = mat_data[var_name]

        cycles = battery.cycle
        if not isinstance(cycles, (list, tuple, np.ndarray)):
            cycles = [cycles]  # a lone cycle is not wrapped in a sequence

        cycle_index = 0  # counts only the charge cycles of this file that produce a row
        for idx, cycle in enumerate(cycles):
            is_charge = hasattr(cycle, 'type') and cycle.type in ['charge']
            if not (is_charge and hasattr(cycle, 'data')):
                continue

            data = cycle.data
            if not (hasattr(data, 'Time') and hasattr(data, 'Voltage_measured') and hasattr(data, 'Current_measured')):
                continue

            cycle_id = f"{filename}_cycle{idx}"

            # squeeze_me=True turns a one-sample recording into a scalar, which
            # pd.DataFrame rejects; atleast_1d keeps such cycles loadable.
            df = pd.DataFrame({
                'time': np.atleast_1d(data.Time),
                'voltage': np.atleast_1d(data.Voltage_measured),
                'current': np.atleast_1d(data.Current_measured)
            })

            # Earlier-run note: 1588 rows without the skip checks, apparently
            # 1582 with them; some of the remaining rows still look odd.
            CC_df, CV_df, skip_reason = split_cc_cv(df)
            if skip_reason is not None:
                print(f"Skipping {cycle_id} due to {skip_reason}.")
                continue  # Skip to the next cycle

            filtered_df = df[
                (df['voltage'] >= CC_START_VOLTAGE) & (df['voltage'] <= CC_END_VOLTAGE) &
                (df['current'] >= CV_END_CURRENT) & (df['current'] <= CV_START_CURRENT)
            ]
            # TODO re-examine this filtering - apparently the voltage oscillates in this range and we're not capturing the linear portion of the charge cycle
            # TODO apparently this is indeed wrong - they are separate ranges - voltage data during constant current mode and current data during constant voltage mode
            # TODO see https://github.com/wang-fujin/Battery-dataset-preprocessing-code-library/blob/main/HUSTBatteryClass.py for a filtering implementation

            # A cycle is only kept when the combined filter above matches something.
            if filtered_df.empty:
                continue

            plot_trace(filtered_df['time'].values, filtered_df['voltage'].values,
                       f"Voltage vs Time - {cycle_id}", "Voltage (V)")
            plot_trace(CC_df['time'].values, CC_df['voltage'].values,
                       f"Voltage vs Time in Constant Current - {cycle_id}", "Voltage (V)")
            # This figure shows the CV segment (its title used to say "Constant Current").
            plot_trace(CV_df['time'].values, CV_df['current'].values,
                       f"Current vs Time in Constant Voltage - {cycle_id}", "Current (A)")

            # Also save the filtered voltage/current series, indexed by time.
            voltage_series = pd.Series(
                data=filtered_df['voltage'].values,
                index=filtered_df['time'].values,
                name=cycle_id
            )
            voltage_cycles.append(voltage_series)

            current_series = pd.Series(
                data=filtered_df['current'].values,
                index=filtered_df['time'].values,
                name=cycle_id
            )
            current_cycles.append(current_series)

            cycle_index += 1
            features.append(build_feature_row(CC_df, CV_df, df, cycle_index))


In [None]:
# Collect the per-cycle feature dicts into one table and persist it as CSV.
features_df = pd.DataFrame(features)
# index=False keeps the row index out of the file, so the CSV holds only feature columns.
features_df.to_csv("/content/drive/MyDrive/Machine Learning Project (ECE 228)/Battery Dataset/features.csv", index=False) # VERY key
# Only the last expression of a cell is rendered, so the preview has to come last.
features_df

In [None]:
# Extracting both basic and extra features:
# Walk the dataset folder, load every battery .mat file and, for each charge
# cycle, summarise the voltage curve of the constant-current (CC) segment and
# the current curve of the constant-voltage (CV) segment into one feature row.
# NOTE(review): as written this cell computes the same feature set as the
# basic extraction cell - no extra features are added yet; confirm intent.
# The helpers are defined here so the cell can be run on its own.
from scipy.stats import kurtosis, skew

base_folder = '/content/drive/MyDrive/Machine Learning Project (ECE 228)/Battery Dataset'
voltage_cycles = []  # one pd.Series (voltage indexed by time) per kept cycle
current_cycles = []  # one pd.Series (current indexed by time) per kept cycle
features = []        # one dict of summary features per kept cycle

# Segment thresholds (the values that used to be hard-coded inside the loop).
CC_START_VOLTAGE = 4.0  # first sample at/above this voltage starts the CC segment
CC_END_VOLTAGE = 4.2    # first sample at/above this voltage ends the CC segment
CV_START_CURRENT = 0.5  # first sample at/below this current (and V >= 4.0) starts the CV segment
CV_END_CURRENT = 0.1    # first sample at/below this current (and V >= 4.0) ends the CV segment

# np.trapezoid only exists in NumPy >= 2.0; older releases expose it as np.trapz.
integrate_trapezoid = np.trapezoid if hasattr(np, 'trapezoid') else np.trapz


def first_index_where(mask):
    """Return the position of the first True entry in ``mask``, or None if there is none."""
    hits = np.flatnonzero(np.asarray(mask, dtype=bool))
    return int(hits[0]) if hits.size else None


def split_cc_cv(cycle_df):
    """Cut one charge cycle into its CC and CV segments.

    Parameters
    ----------
    cycle_df : pd.DataFrame
        Frame with 'time', 'voltage' and 'current' columns for a single cycle.

    Returns
    -------
    (cc_segment, cv_segment, skip_reason)
        Both segments are row slices of ``cycle_df`` (end row included) and
        ``skip_reason`` is None. If a boundary cannot be found the segments
        are None and ``skip_reason`` is the text used in the skip message.
    """
    voltage = cycle_df['voltage'].to_numpy()
    current = cycle_df['current'].to_numpy()

    # CC segment: from the first sample >= 4.0 V to the first sample >= 4.2 V.
    cc_start = first_index_where(voltage >= CC_START_VOLTAGE)
    if cc_start is None:
        return None, None, "missing CC start data"
    cc_end = first_index_where(voltage >= CC_END_VOLTAGE)
    if cc_end is None:
        return None, None, "missing CC end data"
    if cc_start > cc_end:
        return None, None, "invalid CC range"

    # CV segment: from the first sample with current <= 0.5 to the first with
    # current <= 0.1, only counting samples whose voltage is already >= 4.0.
    above_cc_start = voltage >= CC_START_VOLTAGE
    cv_start = first_index_where((current <= CV_START_CURRENT) & above_cc_start)
    if cv_start is None:
        return None, None, "missing CV start data"
    cv_end = first_index_where((current <= CV_END_CURRENT) & above_cc_start)
    if cv_end is None:
        return None, None, "missing CV end data"
    if cv_start > cv_end:
        return None, None, "invalid CV range"

    return (
        cycle_df.iloc[cc_start:cc_end + 1],
        cycle_df.iloc[cv_start:cv_end + 1],
        None,
    )


def end_to_end_slope(values, times):
    """Return (last - first) / elapsed time, or NaN when no time elapses.

    A one-sample segment has zero duration; returning NaN explicitly avoids
    the 0/0 division (and its RuntimeWarning) that would otherwise occur.
    """
    duration = times[-1] - times[0]
    if duration == 0:
        return np.nan
    return (values[-1] - values[0]) / duration


def normalised_entropy(values):
    """Return -sum(p * log(p)) with p = values / sum(values).

    The result is NaN if any p is negative, because log(p) is then undefined.
    """
    p = values / np.sum(values)
    return -np.sum(p * np.log(p))


def build_feature_row(cc_segment, cv_segment, full_cycle, cycle_index):
    """Summarise one charge cycle as a flat dict of features.

    CC features describe the voltage trace of ``cc_segment``; CV features
    describe the current trace of ``cv_segment``. ``capacity`` integrates the
    current of the whole cycle over time and divides by 3600 (seconds per hour).
    Key names and their order are part of the CSV schema and must not change.
    """
    cc_v = cc_segment['voltage'].values
    cc_t = cc_segment['time'].values
    cc_i = cc_segment['current'].values

    cv_i = cv_segment['current'].values
    cv_t = cv_segment['time'].values

    return {
        'charge_CC_mean_V': np.mean(cc_v),
        'charge_CC_std_V': np.std(cc_v),
        'charge_CC_kurtosis_V': kurtosis(cc_v),
        'charge_CC_skew_V': skew(cc_v),
        'charge_CC_time_V': cc_t[-1] - cc_t[0],
        'charge_CC_charge': integrate_trapezoid(cc_i, cc_t),
        'charge_CC_slope_V': end_to_end_slope(cc_v, cc_t),
        'charge_CC_entropy_V': normalised_entropy(cc_v),
        'charge_CV_mean_I': np.mean(cv_i),
        'charge_CV_std_dev_I': np.std(cv_i),
        'charge_CV_kurtosis_I': kurtosis(cv_i),
        'charge_CV_skew_I': skew(cv_i),
        'charge_CV_time_I': cv_t[-1] - cv_t[0],
        'charge_CV_charge_I': integrate_trapezoid(cv_i, cv_t),
        'charge_CV_slope_I': end_to_end_slope(cv_i, cv_t),
        'charge_CV_entropy_I': normalised_entropy(cv_i),
        'cycle_index': cycle_index,
        # the whole charge accrued is current integrated over time divided by seconds in an hour
        'capacity': integrate_trapezoid(full_cycle['current'].values, full_cycle['time'].values) / 3600,
    }


def plot_trace(times, values, title, ylabel):
    """Draw one time series, show it, then close the figure to free its memory."""
    fig, ax = plt.subplots()
    ax.plot(times, values)
    ax.set_title(title)
    ax.set_xlabel("Time (s)")
    ax.set_ylabel(ylabel)
    ax.grid(True)
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # this loop creates three figures per cycle; do not let them pile up


for root, dirs, files in os.walk(base_folder):
    for filename in sorted(files):
        if not filename.endswith('.mat'):
            continue

        filepath = os.path.join(root, filename)
        print(f"Processing {filename}...")
        mat_data = scipy.io.loadmat(filepath, squeeze_me=True, struct_as_record=False)

        # The battery struct is looked up under the file's base name.
        var_name = os.path.splitext(filename)[0]
        battery = mat_data[var_name]

        cycles = battery.cycle
        if not isinstance(cycles, (list, tuple, np.ndarray)):
            cycles = [cycles]  # a lone cycle is not wrapped in a sequence

        cycle_index = 0  # counts only the charge cycles of this file that produce a row
        for idx, cycle in enumerate(cycles):
            is_charge = hasattr(cycle, 'type') and cycle.type in ['charge']
            if not (is_charge and hasattr(cycle, 'data')):
                continue

            data = cycle.data
            if not (hasattr(data, 'Time') and hasattr(data, 'Voltage_measured') and hasattr(data, 'Current_measured')):
                continue

            cycle_id = f"{filename}_cycle{idx}"

            # squeeze_me=True turns a one-sample recording into a scalar, which
            # pd.DataFrame rejects; atleast_1d keeps such cycles loadable.
            df = pd.DataFrame({
                'time': np.atleast_1d(data.Time),
                'voltage': np.atleast_1d(data.Voltage_measured),
                'current': np.atleast_1d(data.Current_measured)
            })

            # Earlier-run note: 1588 rows without the skip checks, apparently
            # 1582 with them; some of the remaining rows still look odd.
            CC_df, CV_df, skip_reason = split_cc_cv(df)
            if skip_reason is not None:
                print(f"Skipping {cycle_id} due to {skip_reason}.")
                continue  # Skip to the next cycle

            filtered_df = df[
                (df['voltage'] >= CC_START_VOLTAGE) & (df['voltage'] <= CC_END_VOLTAGE) &
                (df['current'] >= CV_END_CURRENT) & (df['current'] <= CV_START_CURRENT)
            ]
            # TODO re-examine this filtering - apparently the voltage oscillates in this range and we're not capturing the linear portion of the charge cycle
            # TODO apparently this is indeed wrong - they are separate ranges - voltage data during constant current mode and current data during constant voltage mode
            # TODO see https://github.com/wang-fujin/Battery-dataset-preprocessing-code-library/blob/main/HUSTBatteryClass.py for a filtering implementation

            # A cycle is only kept when the combined filter above matches something.
            if filtered_df.empty:
                continue

            plot_trace(filtered_df['time'].values, filtered_df['voltage'].values,
                       f"Voltage vs Time - {cycle_id}", "Voltage (V)")
            plot_trace(CC_df['time'].values, CC_df['voltage'].values,
                       f"Voltage vs Time in Constant Current - {cycle_id}", "Voltage (V)")
            # This figure shows the CV segment (its title used to say "Constant Current").
            plot_trace(CV_df['time'].values, CV_df['current'].values,
                       f"Current vs Time in Constant Voltage - {cycle_id}", "Current (A)")

            # Also save the filtered voltage/current series, indexed by time.
            voltage_series = pd.Series(
                data=filtered_df['voltage'].values,
                index=filtered_df['time'].values,
                name=cycle_id
            )
            voltage_cycles.append(voltage_series)

            current_series = pd.Series(
                data=filtered_df['current'].values,
                index=filtered_df['time'].values,
                name=cycle_id
            )
            current_cycles.append(current_series)

            cycle_index += 1
            features.append(build_feature_row(CC_df, CV_df, df, cycle_index))
