In [12]:
# STEP-1: LOAD LIBRARIES
# Standard library
import os
from pathlib import Path

# Third-party libraries
import duckdb
import matplotlib.pyplot as plt

# Custom osbad library for anomaly detection
import osbad.config as bconf
from osbad.database import BenchDB

In [2]:
# Directory that holds the database files, taken from the osbad config
DB_DIR = bconf.DB_DIR

# Location of the Severson training database inside that directory
db_filepath = DB_DIR / "train_dataset_severson.db"

In [3]:
# Open the training database in read-only mode.
# NOTE(review): `con` is never closed in this cell; close it once no later
# cell needs it -- TODO confirm whether it is reused further down.
con = duckdb.connect(db_filepath, read_only=True)

# Pull every row of the training table into a DataFrame
df_duckdb = con.execute("SELECT * FROM df_train_dataset_sv").fetchdf()

# Distinct cell identifiers present in the training table
unique_cell_index_train = df_duckdb["cell_index"].unique()
print(f"Unique cell index: {unique_cell_index_train}")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Unique cell index: ['2017-05-12_5_4C-50per_3C_CH13' '2017-05-12_5_4C-50per_3C_CH14'
 '2017-05-12_5_4C-60per_3C_CH15' '2017-05-12_5_4C-60per_3C_CH16'
 '2017-05-12_5_4C-70per_3C_CH17' '2017-05-12_5_4C-70per_3C_CH18'
 '2017-05-12_6C-40per_3C_CH25' '2017-05-12_6C-40per_3C_CH26'
 '2017-05-12_6C-50per_3C_CH27' '2017-05-12_6C-50per_3C_CH28'
 '2017-05-12_6C-60per_3C_CH29' '2017-05-12_6C-60per_3C_CH30'
 '2017-05-12_7C-40per_3C_CH37' '2017-05-12_7C-40per_3C_CH38'
 '2017-05-12_4C-80per_4C_CH5' '2017-05-12_4C-80per_4C_CH6'
 '2017-05-12_4_4C-80per_4_4C_CH7' '2017-05-12_5_4C-80per_5_4C_CH11'
 '2017-05-12_5_4C-80per_5_4C_CH12' '2017-05-12_3_6C-80per_3_6C_CH1'
 '2017-05-12_3_6C-80per_3_6C_CH2' '2017-05-12_3_6C-80per_3_6C_CH3'
 '2017-05-12_5_4C-40per_3_6C_CH19']


In [5]:
# Cell to analyse in this notebook (hard-coded; it is one of the cell
# indices printed from the training table)
selected_cell_label = "2017-05-12_5_4C-70per_3C_CH17"

# Per-cell output location for figures, obtained from the osbad config
# helper (presumably a subfolder named after the cell -- verify in
# osbad.config)
selected_cell_artifacts_dir = bconf.artifacts_output_dir(selected_cell_label)

In [6]:
# -------------------------------------------------------------------------
# STEP-3: LOAD BENCHMARKING DATASET

# Build a BenchDB handle bound to the database file and the selected cell
benchdb = BenchDB(db_filepath, selected_cell_label)

# Fetch the training split of the benchmarking dataset for that cell
df_selected_cell = benchdb.load_benchmark_dataset(dataset_type="train")

# Everything below runs only when the loader returned a result; if it
# returned None, none of the names assigned in this branch are created.
if df_selected_cell is not None:
    # Columns to keep in the label-free copy
    filter_col = [
        "cell_index", "cycle_index", "discharge_capacity", "voltage"]

    # Label-free view of the data, restricted to the columns above
    df_selected_cell_without_labels = benchdb.drop_labels(
        df_selected_cell, filter_col)

    # Ground-truth outlier cycles recorded in the benchmarking dataset
    true_outlier_cycle_index = benchdb.get_true_outlier_cycle_index(
        df_selected_cell)
    print("True outlier cycle index:")
    print(true_outlier_cycle_index)

Database is found in the given filepath.
Loading benchmarking dataset now...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

****************************************************************************************************
True outlier cycle index:
[  0.  40. 147. 148.]


In [9]:
# Preview the loaded benchmarking frame for the selected cell; the bare
# expression lets the notebook render it as a (truncated) table.
df_selected_cell

Unnamed: 0,test_time,cycle_index,cell_index,voltage,discharge_capacity,current,internal_resistance,temperature,outlier
2479567,1801.8192,0.0,2017-05-12_5_4C-70per_3C_CH17,3.291887,0.000013,-0.429022,0.021567,30.086733,1
2479568,1810.4760,0.0,2017-05-12_5_4C-70per_3C_CH17,3.281853,0.001368,-0.562722,0.021567,30.087885,1
2479569,1820.4780,0.0,2017-05-12_5_4C-70per_3C_CH17,3.277881,0.002930,-0.562731,0.021567,30.103170,1
2479570,1830.4810,0.0,2017-05-12_5_4C-70per_3C_CH17,3.275130,0.004494,-0.562708,0.021567,30.103170,1
2479571,1830.4811,0.0,2017-05-12_5_4C-70per_3C_CH17,3.275130,0.004494,-0.562708,0.021567,30.115175,1
...,...,...,...,...,...,...,...,...,...
2792888,288803.7484,40.0,2017-05-12_5_4C-70per_3C_CH17,2.000171,2.883993,-0.111021,0.019886,31.207449,1
2792889,288804.7476,40.0,2017-05-12_5_4C-70per_3C_CH17,2.000284,2.884024,-0.111053,0.019886,31.207449,1
2792890,288805.7468,40.0,2017-05-12_5_4C-70per_3C_CH17,1.999994,2.884054,-0.111295,0.019886,31.207449,1
2792891,288806.6752,40.0,2017-05-12_5_4C-70per_3C_CH17,2.000312,2.884083,-0.109933,0.019886,31.207449,1


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the frame
df_selected_cell.describe()

Unnamed: 0,test_time,cycle_index,voltage,discharge_capacity,current,internal_resistance,temperature,outlier
count,313326.0,313326.0,313326.0,313326.0,313326.0,313326.0,313326.0,313326.0
mean,179309.202977,331.490464,2.702667,0.697437,-3.262099,0.016591,33.592749,0.046613
std,106539.393974,206.385643,0.51259,0.436628,1.796975,0.00106,2.206655,0.210808
min,0.0,0.0,1.996414,0.0,-4.408104,0.015552,25.144476,0.0
25%,87235.56325,148.0,2.028317,0.347776,-4.400076,0.015931,31.831541,0.0
50%,175271.85955,328.0,2.922445,0.818756,-4.399904,0.016098,33.575077,0.0
75%,274878.008975,512.0,3.117314,0.998362,-1.099984,0.017006,35.282305,0.0
max,390923.6952,691.0,4.623832,2.884083,-0.021661,0.021567,39.733044,1.0


In [15]:

# Total in-memory size of the selected cell's DataFrame, in bytes.
# deep=True makes pandas inspect object-dtype values (e.g. the string
# column) instead of counting only the pointers to them.
size_bytes = (
    df_selected_cell
    .memory_usage(deep=True)
    .sum()
)

# Convert to human-readable format
def human_readable_size(size, decimal_places=2):
    """Format a byte count as a string with a binary (1024-based) unit.

    Parameters
    ----------
    size : int or float
        Number of bytes to format.
    decimal_places : int, optional
        Number of digits shown after the decimal point (default 2).

    Returns
    -------
    str
        The scaled value followed by one of 'B', 'KB', 'MB', 'GB', 'TB',
        e.g. ``"1.00 KB"`` for 1024. Sizes of 1024 TB or more are still
        reported in TB, so the function always returns a string.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    # Walk up through every unit except the largest, dividing as we go.
    for unit in units[:-1]:
        if size < 1024:
            return f"{size:.{decimal_places}f} {unit}"
        size /= 1024
    # Whatever is left is expressed in the largest unit; previously the
    # loop fell through here and implicitly returned None.
    return f"{size:.{decimal_places}f} {units[-1]}"

# Report the DataFrame's memory footprint in a readable unit
print(
    f"Size of DataFrame: {human_readable_size(size_bytes)}"
)

Size of DataFrame: 44.82 MB
