In [None]:
from pathlib import Path

# Quick look at what is sitting directly under /content (non-recursive;
# entries come back in whatever order the filesystem yields them).
print("ðŸ“‚ Listing /content:")
for entry in Path("/content").iterdir():
    print(f"- {entry.name}")


ðŸ“‚ Listing /content:
- .config
- cell_7_cyc7600.csv
- cell_6_cyc0300.csv
- cell_4_cyc2100.csv
- cell_2_cyc1800.csv
- cell_5_cyc1600.csv
- cell_5_cyc1900.csv
- cell_6_cyc4400.csv
- cell_5_cyc0500.csv
- cell_6_cyc7800.csv
- cell_6_cyc7700.csv
- cell_7_cyc2200.csv
- cell_6_cyc2200.csv
- cell_4_cyc0400.csv
- cell_5_cyc2500.csv
- cell_2_cyc5200.csv
- cell_5_cyc1000.csv
- cell_2_cyc7400.csv
- cell_1_cyc5700.csv
- cell_2_cyc4800.csv
- cell_6_cyc3100.csv
- cell_6_cyc7100.csv
- cell_7_cyc3300.csv
- cell_1_cyc2300.csv
- cell_6_cyc1600.csv
- cell_1_cyc2200.csv
- cell_0_cyc7600.csv
- cell_1_cyc2900.csv
- cell_7_cyc2300.csv
- cell_2_cyc6600.csv
- cell_2_cyc2400.csv
- cell_6_cyc5900.csv
- cell_3_cyc0700.csv
- cell_5_cyc0400.csv
- cell_1_cyc0200.csv
- cell_7_cyc0100.csv
- cell_1_cyc2600.csv
- cell_1_cyc7500.csv
- cell_7_cyc1100.csv
- cell_0_cyc5100.csv
- cell_2_cyc5600.csv
- cell_4_cyc4100.csv
- cell_2_cyc1200.csv
- cell_7_cyc4400.csv
- cell_7_cyc4500.csv
- cell_0_cyc5700.csv
- cell_7_cyc3200.csv
-

In [None]:
# Walk /content recursively and collect every CSV, whichever sub-folder it is in.
csv_paths = [*Path("/content").rglob("*.csv")]

print("Total CSV files found anywhere:", len(csv_paths))

# Preview only the first 20 hits so the output stays readable.
for csv_path in csv_paths[:20]:
    print(csv_path)


Total CSV files found anywhere: 523
/content/cell_7_cyc7600.csv
/content/cell_6_cyc0300.csv
/content/cell_4_cyc2100.csv
/content/cell_2_cyc1800.csv
/content/cell_5_cyc1600.csv
/content/cell_5_cyc1900.csv
/content/cell_6_cyc4400.csv
/content/cell_5_cyc0500.csv
/content/cell_6_cyc7800.csv
/content/cell_6_cyc7700.csv
/content/cell_7_cyc2200.csv
/content/cell_6_cyc2200.csv
/content/cell_4_cyc0400.csv
/content/cell_5_cyc2500.csv
/content/cell_2_cyc5200.csv
/content/cell_5_cyc1000.csv
/content/cell_2_cyc7400.csv
/content/cell_1_cyc5700.csv
/content/cell_2_cyc4800.csv
/content/cell_6_cyc3100.csv


In [None]:
from pathlib import Path
import pandas as pd

CSV_DIR = Path("/content")

# Only per-cycle files named cell_<id>_cyc<id>.csv directly under CSV_DIR,
# in lexicographic filename order.
csv_files = sorted(CSV_DIR.glob("cell_*_cyc*.csv"))

print(f"ðŸ“„ Total CSV files: {len(csv_files)}")
print("="*60)

# A handful of filenames to sanity-check the naming pattern.
print("ðŸ§­ Filename samples:")
for sample_path in csv_files[:15]:
    print(" -", sample_path.name)

print("="*60)

# Schema + size survey: fully load the first INSPECT_N files and record
# each one's column tuple and its (rows, cols) dimensions.
INSPECT_N = 20

schemas = {}
rows_info = []

for csv_file in csv_files[:INSPECT_N]:
    sample = pd.read_csv(csv_file)
    n_rows, n_cols = sample.shape
    schemas[csv_file.name] = tuple(sample.columns)
    rows_info.append({"file": csv_file.name, "rows": n_rows, "cols": n_cols})

# Collapse to the distinct column layouts seen across the inspected files.
schema_set = {*schemas.values()}

print(f"ðŸ§ª Unique schemas found: {len(schema_set)}")
for schema in schema_set:
    print("Schema:", schema)

print("="*60)

# Summary statistics of row/column counts over the inspected files.
rows_df = pd.DataFrame(rows_info)
print("ðŸ“Š Row statistics:")
print(rows_df.describe())


ðŸ“„ Total CSV files: 519
ðŸ§­ Filename samples:
 - cell_0_cyc0000.csv
 - cell_0_cyc0100.csv
 - cell_0_cyc0200.csv
 - cell_0_cyc0300.csv
 - cell_0_cyc0400.csv
 - cell_0_cyc0500.csv
 - cell_0_cyc0600.csv
 - cell_0_cyc0700.csv
 - cell_0_cyc0800.csv
 - cell_0_cyc0900.csv
 - cell_0_cyc1000.csv
 - cell_0_cyc1100.csv
 - cell_0_cyc1200.csv
 - cell_0_cyc1300.csv
 - cell_0_cyc1400.csv
ðŸ§ª Unique schemas found: 1
Schema: ('time_s', 'voltage_V', 'capacity_Ah', 'temperature_C', 'cell_id', 'cycle_id')
ðŸ“Š Row statistics:
              rows  cols
count    20.000000  20.0
mean   3415.800000   6.0
std     105.944673   0.0
min    3250.000000   6.0
25%    3341.750000   6.0
50%    3413.500000   6.0
75%    3495.000000   6.0
max    3608.000000   6.0


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

CSV_DIR = Path("/content")
csv_files = sorted(CSV_DIR.glob("cell_*_cyc*.csv"))

# Plausibility windows used by the cleaning step (bounds are inclusive);
# samples outside them are discarded.
VOLTAGE_RANGE_V = (2.5, 4.3)
TEMPERATURE_RANGE_C = (-10, 80)
# A cycle with fewer surviving samples than this is treated as tiny / broken.
MIN_ROWS_PER_CYCLE = 200


def clean_cycle(cycle_raw):
    """Apply the hard-cleaning rules to one raw cycle recording.

    Rows containing any missing value are dropped, rows are ordered by
    ``time_s`` when they are not already non-decreasing, and samples whose
    voltage or temperature fall outside ``VOLTAGE_RANGE_V`` /
    ``TEMPERATURE_RANGE_C`` are removed.

    Parameters
    ----------
    cycle_raw : pandas.DataFrame
        One cycle; must contain ``time_s``, ``voltage_V`` and
        ``temperature_C`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        The cleaned rows, or ``None`` when fewer than ``MIN_ROWS_PER_CYCLE``
        rows survive the cleaning.
    """
    cycle = cycle_raw.dropna()

    # Enforce monotonic time; sorting is skipped when the order is already fine.
    if not cycle["time_s"].is_monotonic_increasing:
        cycle = cycle.sort_values("time_s")

    # Sanity filters.
    cycle = cycle[cycle["voltage_V"].between(*VOLTAGE_RANGE_V)]
    cycle = cycle[cycle["temperature_C"].between(*TEMPERATURE_RANGE_C)]

    if len(cycle) < MIN_ROWS_PER_CYCLE:
        return None
    return cycle


def extract_cycle_features(cycle):
    """Summarise one cleaned cycle as a flat dict of scalar features.

    Parameters
    ----------
    cycle : pandas.DataFrame
        Non-empty cleaned cycle with ``time_s``, ``voltage_V``,
        ``temperature_C``, ``cell_id`` and ``cycle_id`` columns.

    Returns
    -------
    dict
        Identifiers (taken from the first row), voltage statistics,
        voltage-slope statistics, temperature statistics, the time span
        covered by the rows, and the constant ``source`` tag.

    Notes
    -----
    ``capacity_Ah`` is not used by any feature here.

    NOTE(review): the ``duration_s`` values printed for this dataset are
    around 0.04, which suggests ``time_s`` may not actually be expressed in
    seconds. Confirm the unit before interpreting ``duration_s`` or the
    dV/dt features physically.
    """
    time = cycle["time_s"]
    voltage = cycle["voltage_V"]
    temperature = cycle["temperature_C"]

    # Zero time steps (duplicate timestamps) become NaN so the slope is
    # undefined there instead of +/-inf; NaNs are ignored by mean()/max().
    dt = time.diff().replace(0, np.nan)
    dV_dt = voltage.diff() / dt

    v_min, v_max = voltage.min(), voltage.max()
    t_max = temperature.max()

    return {
        "cell_id": int(cycle["cell_id"].iloc[0]),
        "cycle_id": int(cycle["cycle_id"].iloc[0]),

        # voltage level
        "V_mean": voltage.mean(),
        "V_std": voltage.std(),
        "V_min": v_min,
        "V_max": v_max,
        "dV_dt_mean": dV_dt.mean(),
        "dV_dt_max": dV_dt.max(),

        # voltage spread / quantiles
        "V_range": v_max - v_min,
        "V_mid": voltage.quantile(0.5),
        "V_low": voltage.quantile(0.1),
        "V_high": voltage.quantile(0.9),

        # temperature
        "T_mean": temperature.mean(),
        "T_max": t_max,
        "T_delta": t_max - temperature.min(),

        # duration (in the units of time_s)
        "duration_s": time.iloc[-1] - time.iloc[0],

        "source": "oxford"
    }


features = []
skipped_files = []  # names of files rejected by clean_cycle

for f in csv_files:
    cycle = clean_cycle(pd.read_csv(f))
    if cycle is None:
        skipped_files.append(f.name)
        continue  # drop tiny / broken cycles
    features.append(extract_cycle_features(cycle))

# Final feature table: one row per retained cycle.
df_oxford_feat = pd.DataFrame(features)

print("âœ… Oxford cycles processed:", len(df_oxford_feat))
if skipped_files:
    # Surface dropped cycles instead of discarding them silently.
    print(
        f"Skipped {len(skipped_files)} file(s) with fewer than "
        f"{MIN_ROWS_PER_CYCLE} usable rows:",
        ", ".join(skipped_files),
    )
df_oxford_feat.head()


âœ… Oxford cycles processed: 519


Unnamed: 0,cell_id,cycle_id,V_mean,V_std,V_min,V_max,dV_dt_mean,dV_dt_max,V_range,V_mid,V_low,V_high,T_mean,T_max,T_delta,duration_s,source
0,0,0,3.735076,0.245422,2.699819,4.191235,-35.948592,38.111247,1.491416,3.748474,3.434403,4.045442,40.41934,41.174809,1.273163,0.04174,oxford
1,0,1,3.735681,0.244972,2.699859,4.192679,-36.342517,34.65244,1.492819,3.748675,3.435121,4.046091,40.327115,41.124866,1.323108,0.041245,oxford
2,0,2,3.736424,0.244468,2.70002,4.192959,-36.676399,34.6438,1.492939,3.749296,3.436424,4.046404,40.327298,41.1124,1.298134,0.040997,oxford
3,0,3,3.736474,0.243927,2.699964,4.192502,-36.930236,45.049657,1.492538,3.74922,3.437671,4.046015,40.322045,41.124866,1.335575,0.040834,oxford
4,0,4,3.737696,0.242782,2.700164,4.192782,-36.822717,34.646588,1.492618,3.749661,3.439596,4.046612,40.314726,41.124866,1.335575,0.040577,oxford


In [None]:
# Checkpoint the feature table to parquet (row index not stored) so later
# steps can reload it without re-reading the CSVs.
df_oxford_feat.to_parquet("OXFORD_checkpoint_features.parquet", index=False)

print("âœ… Saved OXFORD_checkpoint_features.parquet")


âœ… Saved OXFORD_checkpoint_features.parquet


In [None]:
# Reload the checkpoint to confirm it round-trips.
df = pd.read_parquet("OXFORD_checkpoint_features.parquet")

# Dimensions, then per-column dtypes, one per line.
print(df.shape, df.dtypes, sep="\n")
# Deep memory footprint (includes string contents), in bytes / 1024.
print(df.memory_usage(deep=True).sum() / 1024, "KB")


(519, 17)
cell_id         int64
cycle_id        int64
V_mean        float64
V_std         float64
V_min         float64
V_max         float64
dV_dt_mean    float64
dV_dt_max     float64
V_range       float64
V_mid         float64
V_low         float64
V_high        float64
T_mean        float64
T_max         float64
T_delta       float64
duration_s    float64
source         object
dtype: object
92.8798828125 KB
