In [None]:
from pathlib import Path

BASE_DIR = Path(".")   # change if needed, e.g. Path("D:/rfud")

# rglob yields entries in filesystem order, which differs between machines and
# runs; sorting makes the listing reproducible and matches the later cells
# that build MAT_FILES with sorted(...).
mat_files = sorted(BASE_DIR.rglob("RW*.mat"))

print(f"Found {len(mat_files)} .mat files:\n")

for f in mat_files:
    print(f"Name: {f.name}")
    print(f"Path: {f.resolve()}")
    print("-" * 60)


Found 4 .mat files:

Name: RW4.mat
Path: /content/RW4.mat
------------------------------------------------------------
Name: RW5.mat
Path: /content/RW5.mat
------------------------------------------------------------
Name: RW6.mat
Path: /content/RW6.mat
------------------------------------------------------------
Name: RW3.mat
Path: /content/RW3.mat
------------------------------------------------------------


In [None]:
import scipy.io as sio

# Open one recording and show the names of the variables stored in it.
mat = sio.loadmat("/content/RW3.mat")
mat.keys()


dict_keys(['__header__', '__version__', '__globals__', 'data'])

In [None]:
import scipy.io as sio
import pandas as pd
import numpy as np
from pathlib import Path

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# RW*.mat recordings located directly inside /content, ordered by path.
BASE_DIR = Path("/content")
MAT_FILES = list(BASE_DIR.glob("RW*.mat"))
MAT_FILES.sort()

print("Files to process:")
for f in MAT_FILES:
    print(f.name)


Files to process:
RW3.mat
RW4.mat
RW5.mat
RW6.mat


In [None]:
# Load the first file of MAT_FILES and display the field layout (dtype) of its
# "data" variable.
mat = sio.loadmat(MAT_FILES[0])
data = mat["data"]

data.dtype


dtype([('step', 'O'), ('procedure', 'O'), ('description', 'O')])

In [None]:
import scipy.io as sio

# Reload the RW3 recording (rebinds `mat`).
mat = sio.loadmat("/content/RW3.mat")

# Look at top-level keys
mat.keys()


dict_keys(['__header__', '__version__', '__globals__', 'data'])

In [None]:
# Pull out the "data" variable and display its Python type and array shape.
outer = mat["data"]
type(outer), outer.shape


(numpy.ndarray, (1, 1))

In [None]:
# Field names and field dtypes of the "data" struct.
outer.dtype


dtype([('step', 'O'), ('procedure', 'O'), ('description', 'O')])

In [None]:
import scipy.io as sio

mat = sio.loadmat("/content/RW3.mat")

# One summary line per stored variable: name, Python type, dtype and shape.
# Keys wrapped in double underscores are skipped.
for k, v in mat.items():
    if k.startswith("__"):
        continue
    print(k, type(v), getattr(v, "dtype", None), getattr(v, "shape", None))


data <class 'numpy.ndarray'> [('step', 'O'), ('procedure', 'O'), ('description', 'O')] (1, 1)


In [None]:
import scipy.io as sio

# Reload RW3 and drill into data -> step: the single (0, 0) cell of the "step"
# field holds the per-step struct array.
mat = sio.loadmat("/content/RW3.mat")
outer = mat["data"]

step = outer["step"][0][0]

# Display its type, shape and field layout.
type(step), step.shape, step.dtype


(numpy.ndarray,
 (1, 12826),
 dtype([('comment', 'O'), ('type', 'O'), ('time', 'O'), ('relativeTime', 'O'), ('voltage', 'O'), ('current', 'O'), ('temperature', 'O'), ('date', 'O')]))

In [None]:
import scipy.io as sio
import pandas as pd
import numpy as np
from pathlib import Path

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# RW*.mat recordings located directly inside /content, ordered by path.
BASE_DIR = Path("/content")
MAT_FILES = list(BASE_DIR.glob("RW*.mat"))
MAT_FILES.sort()

print("Found files:")
for f in MAT_FILES:
    print(f.name)


Found files:
RW3.mat
RW4.mat
RW5.mat
RW6.mat


In [None]:
def extract_rfud_steps(mat_path, min_samples=10):
    """Flatten the per-step records of one RW ``.mat`` file into one DataFrame.

    The file must contain a ``data`` struct whose ``step`` field holds a
    ``(1, n_steps)`` struct array. For every step the ``relativeTime``,
    ``voltage``, ``current`` and ``temperature`` arrays are flattened and
    truncated to their common length. Step-relative times are shifted so that
    each kept step starts 0.01 after the largest time of the previous kept
    step (this yields an increasing axis provided each step's own relative
    times are non-negative and increasing).

    Parameters
    ----------
    mat_path : str or pathlib.Path
        Location of the ``.mat`` file. Its stem is stored in ``source_file``.
    min_samples : int, default 10
        Steps with fewer aligned samples than this are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_s``, ``voltage_V``, ``current_A``, ``temperature_C``,
        ``step_id`` (position of the step in the file) and ``source_file``.
        When no step qualifies, an empty frame with these columns is returned.
    """
    mat_path = Path(mat_path)  # accept plain strings as well as Path objects
    mat = sio.loadmat(mat_path)
    outer = mat["data"]
    steps = outer["step"][0][0]

    records = []
    time_offset = 0.0

    for i in range(steps.shape[1]):
        step = steps[0, i]

        try:
            t = step["relativeTime"].flatten()
            v = step["voltage"].flatten()
            c = step["current"].flatten()
            temp = step["temperature"].flatten()
        except Exception:
            # Deliberate best effort: a step that lacks one of the four signals
            # (or stores something that cannot be flattened) is skipped instead
            # of aborting the whole file.
            continue

        # The four signals may differ in length; keep only the aligned prefix.
        n = min(len(t), len(v), len(c), len(temp))
        if n < min_samples:
            continue

        t = t[:n] + time_offset

        records.append(pd.DataFrame({
            "time_s": t,
            "voltage_V": v[:n],
            "current_A": c[:n],
            "temperature_C": temp[:n],
            "step_id": i
        }))

        time_offset = t.max() + 0.01  # small gap to ensure monotonic time

    if records:
        df = pd.concat(records, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; hand back an empty,
        # correctly shaped frame so callers can treat "no usable steps" uniformly.
        df = pd.DataFrame({
            "time_s": pd.Series(dtype="float64"),
            "voltage_V": pd.Series(dtype="float64"),
            "current_A": pd.Series(dtype="float64"),
            "temperature_C": pd.Series(dtype="float64"),
            "step_id": pd.Series(dtype="int64"),
        })
    df["source_file"] = mat_path.stem

    return df


In [None]:
def clean_and_segment(df, min_cycle_len=150,
                      voltage_range=(2.0, 4.5), temperature_range=(-20, 80)):
    """Drop implausible rows, re-zero the time axis and label cycles.

    Steps performed, in order:

    1. rows containing any NaN are removed;
    2. rows whose ``voltage_V`` / ``temperature_C`` lie outside the given open
       intervals are removed;
    3. ``time_s`` is shifted so the earliest remaining sample is at 0 and the
       rows are sorted by it;
    4. ``cycle_id`` is incremented every time ``current_A`` goes from a
       non-positive value to a positive one;
    5. cycles with ``min_cycle_len`` rows or fewer are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``time_s``, ``voltage_V``, ``current_A`` and
        ``temperature_C``. The input frame is not modified.
    min_cycle_len : int, default 150
        A cycle is kept only when it has MORE than this many rows.
    voltage_range : tuple(float, float), default (2.0, 4.5)
        Exclusive lower/upper bounds for ``voltage_V``.
    temperature_range : tuple(float, float), default (-20, 80)
        Exclusive lower/upper bounds for ``temperature_C``.

    Returns
    -------
    pandas.DataFrame
        The surviving rows with a fresh RangeIndex and an added integer
        ``cycle_id`` column.
    """
    v_lo, v_hi = voltage_range
    t_lo, t_hi = temperature_range

    df = df.dropna()

    df = df[
        (df["voltage_V"] > v_lo) & (df["voltage_V"] < v_hi) &
        (df["temperature_C"] > t_lo) & (df["temperature_C"] < t_hi)
    ]

    # Rebuild continuous time
    df["time_s"] = df["time_s"] - df["time_s"].min()
    # Stable sort: samples that share a timestamp keep their incoming order,
    # so the result does not depend on the sorting algorithm's tie handling.
    df = df.sort_values("time_s", kind="mergesort").reset_index(drop=True)

    # Cycle detection: a new cycle begins at every non-positive -> positive
    # current transition.
    # NOTE(review): the original comment called this a "charge start"; whether
    # positive current means charge or discharge depends on the dataset's sign
    # convention — confirm.
    df["cycle_id"] = (
        (df["current_A"].shift(1) <= 0) &
        (df["current_A"] > 0)
    ).cumsum()

    # Remove micro cycles
    sizes = df.groupby("cycle_id").size()
    valid = sizes[sizes > min_cycle_len].index
    df = df[df["cycle_id"].isin(valid)]

    return df.reset_index(drop=True)


In [None]:
# Run extraction + cleaning for every file in MAT_FILES. Each cleaned frame is
# written to "cleaned_<stem>.parquet" in the current working directory and also
# kept in memory in cleaned_dfs for the merge step.
cleaned_dfs = []

for mat_file in MAT_FILES:
    print(f"Processing {mat_file.name}")

    df_raw = extract_rfud_steps(mat_file)
    df_clean = clean_and_segment(df_raw)

    out_name = f"cleaned_{mat_file.stem}.parquet"
    df_clean.to_parquet(out_name)

    # Appended only after the parquet write succeeded.
    cleaned_dfs.append(df_clean)

# Printed unconditionally once the loop ends (also when MAT_FILES is empty).
print("✅ All RFUD RW files cleaned successfully")


Processing RW3.mat
Processing RW4.mat
Processing RW5.mat
Processing RW6.mat
✅ All RFUD RW files cleaned successfully


In [None]:
# Stack the per-file frames into one table and save it.
# clean_and_segment re-zeroes time_s and restarts cycle_id for each file, and
# step_id is the step position within its file, so after concatenation these
# columns start over at every file boundary.
df_rfud_all = pd.concat(cleaned_dfs, ignore_index=True)
df_rfud_all.to_parquet("RFUD_RW3_RW6_ALL.parquet")

df_rfud_all.head()


Unnamed: 0,time_s,voltage_V,current_A,temperature_C,step_id,source_file,cycle_id
0,10.0,4.195,0.04,20.09833,0,RW3,1
1,20.0,4.194,0.04,20.09833,0,RW3,1
2,30.0,4.194,0.04,20.09833,0,RW3,1
3,40.0,4.193,0.04,20.1139,0,RW3,1
4,50.0,4.193,0.04,20.09833,0,RW3,1


In [None]:
# Per-file overview: row count, number of distinct steps and cycles, and the
# observed voltage / temperature extremes.
df_rfud_all.groupby("source_file").agg(**{
    "rows": ("time_s", "count"),
    "steps": ("step_id", "nunique"),
    "cycles": ("cycle_id", "nunique"),
    "v_min": ("voltage_V", "min"),
    "v_max": ("voltage_V", "max"),
    "t_min": ("temperature_C", "min"),
    "t_max": ("temperature_C", "max"),
})


Unnamed: 0_level_0,rows,steps,cycles,v_min,v_max,t_min,t_max
source_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RW3,796904,1948,245,3.2,4.2,19.42298,40.47139
RW4,2743110,6164,958,3.2,4.205,16.45587,40.74296
RW5,2847407,6517,995,3.2,4.205,18.19863,43.14556
RW6,1749062,4048,549,3.2,4.201,18.34167,48.0944


In [None]:
# Is time_s non-decreasing over the whole concatenated table?
df_rfud_all["time_s"].is_monotonic_increasing


False

In [None]:
# Discard the per-file time axis before a new one is built.
# errors="ignore" keeps the cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df_rfud_all = df_rfud_all.drop(columns=["time_s"], errors="ignore")


In [None]:
# Order rows by file, then cycle, then step. Rows that tie on all three keys
# stay in their current relative order thanks to the helper column.
df_rfud_all = (
    df_rfud_all
    .reset_index(drop=True)
    .assign(_row_order=lambda frame: frame.index)   # remember incoming order
    .sort_values(
        by=["source_file", "cycle_id", "step_id", "_row_order"],
        kind="mergesort",                           # stable algorithm
    )
    .drop(columns=["_row_order"])
    .reset_index(drop=True)
)


In [None]:
# Overwrite time_s with a 0..N-1 counter (float64) following the current row order.
# NOTE(review): from here on time_s is a row position, not a recorded
# timestamp — confirm that downstream users expect this.
df_rfud_all["time_s"] = np.arange(len(df_rfud_all), dtype=np.float64)


In [None]:
# Re-check after the rebuild: time_s should now be non-decreasing.
df_rfud_all["time_s"].is_monotonic_increasing


True

In [None]:
# Basic health report for the time column.
print(f"time dtype: {df_rfud_all['time_s'].dtype}")
print(f"Any NaN: {df_rfud_all['time_s'].isna().any()}")
print(f"Any duplicates: {df_rfud_all['time_s'].duplicated().any()}")

# Positions i where sample i+1 is earlier than sample i (empty when ordered).
ts = df_rfud_all["time_s"].values
breaks = np.where(ts[:-1] > ts[1:])[0]
print(f"Break indices (first 10): {breaks[:10]}")


time dtype: float64
Any NaN: False
Any duplicates: False
Break indices (first 10): []


In [None]:
# Same check done directly on the NumPy array: every sample is >= its predecessor.
ts = df_rfud_all["time_s"].to_numpy()
np.all(ts[1:] >= ts[:-1])


np.True_

In [None]:
# Give the frame a fresh 0..N-1 index and rebind the name to an independent copy.
df_rfud_all = df_rfud_all.reset_index(drop=True).copy()


In [None]:
# Confirm the ordering survived the reset/copy.
df_rfud_all["time_s"].is_monotonic_increasing


True

In [None]:
import numpy as np

# One-shot version of the rebuild: discard any existing time_s, put the rows in
# (file, cycle, step) order with ties kept in their incoming order, then number
# the rows 0..N-1 as the new time_s.
# NOTE(review): the resulting time_s is a row counter, not a recorded
# timestamp — confirm that downstream users expect this.
df_rfud_all = (
    df_rfud_all
    .drop(columns=["time_s"], errors="ignore")
    .reset_index(drop=True)
    .assign(_row_order=lambda frame: frame.index)
    .sort_values(
        by=["source_file", "cycle_id", "step_id", "_row_order"],
        kind="mergesort",
    )
    .drop(columns=["_row_order"])
    .reset_index(drop=True)
    .assign(time_s=lambda frame: np.arange(len(frame), dtype=np.float64))
)


In [None]:
# Verify the rebuilt axis: strictly rising, no gaps (NaN), no repeats.
ts = df_rfud_all["time_s"].to_numpy()

print(f"Strictly increasing: {(ts[:-1] < ts[1:]).all()}")
print(f"Any NaN: {df_rfud_all['time_s'].isna().any()}")
print(f"Any duplicates: {df_rfud_all['time_s'].duplicated().any()}")


Strictly increasing: True
Any NaN: False
Any duplicates: False


In [None]:
# Persist the re-ordered table; the DataFrame index is not written to the file.
df_rfud_all.to_parquet(
    "RFUD_RW3_RW6_ALL_FINAL.parquet",
    index=False
)


In [None]:
# `df` is a second name for the same object as df_rfud_all (no copy is made).
df = df_rfud_all  # or load from parquet

# Column dtypes of the final table.
print(df.dtypes)


voltage_V        float64
current_A        float64
temperature_C    float64
step_id            int64
source_file       object
cycle_id           int64
time_s           float64
dtype: object


In [None]:
# Range sanity checks on the final table. Each assertion reports the observed
# extremes so a failure can be diagnosed without re-running anything.
assert df["voltage_V"].between(2.8, 4.3).all(), (
    f"voltage_V outside [2.8, 4.3]: "
    f"min={df['voltage_V'].min()}, max={df['voltage_V'].max()}"
)
assert df["temperature_C"].between(-10, 60).all(), (
    f"temperature_C outside [-10, 60]: "
    f"min={df['temperature_C'].min()}, max={df['temperature_C'].max()}"
)
assert df["current_A"].abs().max() < 500, (  # RFUD scale sanity
    f"|current_A| reaches {df['current_A'].abs().max()}, expected < 500"
)


In [None]:
# Hard check: every time_s value is strictly greater than the one before it.
ts = df["time_s"].to_numpy()
assert (ts[1:] > ts[:-1]).all()


In [None]:
# Check, cycle by cycle, that time_s never runs backwards.
# cycle_id is assigned separately for each file and therefore repeats across
# files; the group key must include source_file, otherwise rows from different
# files that happen to share a cycle number are pooled into one group.
cycle_breaks = (
    df.groupby(["source_file", "cycle_id"])["time_s"]
      .apply(lambda x: x.is_monotonic_increasing)
)

print(cycle_breaks.value_counts())


time_s
True    996
Name: count, dtype: int64


In [None]:
# Write the final table again; this uses the same file name as the earlier
# export cell, so that file is overwritten.
df_rfud_all.to_parquet(
    "RFUD_RW3_RW6_ALL_FINAL.parquet",
    index=False
)
