In [None]:
import scipy.io as sio
import pandas as pd
import numpy as np
from pathlib import Path

# Display every column of a DataFrame instead of truncating wide frames.
pd.set_option("display.max_columns", None)


In [None]:
def extract_rfud_steps(mat_path, min_samples=10, step_gap_s=0.01):
    """Flatten the per-step records of one ``.mat`` file into a single DataFrame.

    The file must contain a ``data`` struct whose ``step`` field is a 1 x N
    struct array.  From every step the ``relativeTime``, ``voltage``,
    ``current`` and ``temperature`` arrays are read, truncated to their common
    length and appended to the output.  Each step's relative time is shifted by
    a running offset (largest timestamp of the previous kept step plus
    ``step_gap_s``) so the steps are laid out one after another.

    Parameters
    ----------
    mat_path : str or pathlib.Path
        Location of the ``.mat`` file.
    min_samples : int, default 10
        Steps whose common sample count is below this value are skipped.
    step_gap_s : float, default 0.01
        Value added to the last finite timestamp of a kept step to obtain the
        time offset of the next one.

    Returns
    -------
    pandas.DataFrame
        Columns ``time_s``, ``voltage_V``, ``current_A``, ``temperature_C``,
        ``step_id`` (position of the step inside the file) and ``source_file``
        (file name without its extension).

    Raises
    ------
    ValueError
        If no step provides at least ``min_samples`` samples.
    """
    mat = sio.loadmat(mat_path)
    steps = mat["data"]["step"][0][0]

    fields = ("relativeTime", "voltage", "current", "temperature")
    frames = []
    time_offset = 0.0

    for step_id in range(steps.shape[1]):
        step = steps[0, step_id]

        # A step that lacks one of the required fields is skipped on purpose;
        # only lookup-style failures are swallowed so unrelated errors surface.
        try:
            t, v, c, temp = (step[name].flatten() for name in fields)
        except (LookupError, ValueError):
            continue

        n = min(len(t), len(v), len(c), len(temp))
        if n < min_samples:
            continue

        t = t[:n] + time_offset

        frames.append(pd.DataFrame({
            "time_s": t,
            "voltage_V": v[:n],
            "current_A": c[:n],
            "temperature_C": temp[:n],
            "step_id": step_id,
        }))

        # Advance the offset from finite timestamps only: a single NaN/inf in
        # this step would otherwise turn the time of every later step into NaN.
        finite_t = t[np.isfinite(t)]
        if finite_t.size:
            time_offset = finite_t.max() + step_gap_s

    if not frames:
        raise ValueError(
            f"no step with at least {min_samples} samples found in {mat_path}"
        )

    df = pd.concat(frames, ignore_index=True)
    # Path() makes plain string paths work as well as Path objects.
    df["source_file"] = Path(mat_path).stem

    return df


In [None]:
def clean_and_segment(df, min_cycle_len=150):
    """Filter implausible samples and label the remaining rows with a cycle id.

    Processing order:

    1. Rows containing any NaN are dropped.
    2. Only rows with ``2.0 < voltage_V < 4.5`` and
       ``-20 < temperature_C < 80`` are kept (bounds exclusive).
    3. A new cycle starts at every row whose ``current_A`` is positive while
       the previous remaining row's current was zero or negative; ``cycle_id``
       is the running count of such starts.
    4. Cycles are kept only when they contain strictly more than
       ``min_cycle_len`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``voltage_V``, ``temperature_C`` and ``current_A``.
    min_cycle_len : int, default 150
        Cycles with this many rows or fewer are discarded.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with an added ``cycle_id``
        column and a fresh 0..n-1 index.
    """
    cleaned = df.dropna()

    in_range = (
        (cleaned["voltage_V"] > 2.0) & (cleaned["voltage_V"] < 4.5) &
        (cleaned["temperature_C"] > -20) & (cleaned["temperature_C"] < 80)
    )
    # Explicit copy: assigning a column onto a boolean-filtered slice triggers
    # pandas' SettingWithCopyWarning and relies on view/copy internals.
    cleaned = cleaned[in_range].copy()

    current = cleaned["current_A"]
    cycle_starts = (current.shift(1) <= 0) & (current > 0)
    cleaned["cycle_id"] = cycle_starts.cumsum()

    sizes = cleaned.groupby("cycle_id").size()
    valid = sizes[sizes > min_cycle_len].index
    cleaned = cleaned[cleaned["cycle_id"].isin(valid)]

    return cleaned.reset_index(drop=True)


In [None]:
# Directory holding the raw .mat files; cleaned parquet files are written here too.
BASE_DIR = Path("/content")

targets = ["RW1.mat", "RW2.mat", "RW7.mat", "RW8.mat"]
cleaned_new = []  # cleaned frames, in the same order as `targets`

for name in targets:
    print(f"Processing {name}")

    df_raw = extract_rfud_steps(BASE_DIR / name)
    df_clean = clean_and_segment(df_raw)

    # Write into BASE_DIR rather than the current working directory: the
    # loading cell further down reads cleaned_*.parquet from BASE_DIR, so the
    # two locations must agree even when the kernel runs somewhere else.
    # Path(name).stem strips only the extension, unlike str.replace which
    # would remove ".mat" anywhere in the name.
    df_clean.to_parquet(BASE_DIR / f"cleaned_{Path(name).stem}.parquet", index=False)
    cleaned_new.append(df_clean)

print("✅ RW1, RW2, RW7, RW8 cleaned")


Processing RW1.mat
Processing RW2.mat
Processing RW7.mat
Processing RW8.mat
✅ RW1, RW2, RW7, RW8 cleaned


In [None]:
# One summary row per source file: row count, number of distinct steps and
# cycles, and the observed voltage / temperature extremes.
(
    pd.concat(cleaned_new)
    .groupby("source_file")
    .agg(
        **{
            "rows": ("time_s", "count"),
            "steps": ("step_id", "nunique"),
            "cycles": ("cycle_id", "nunique"),
            "v_min": ("voltage_V", "min"),
            "v_max": ("voltage_V", "max"),
            "t_min": ("temperature_C", "min"),
            "t_max": ("temperature_C", "max"),
        }
    )
)


Unnamed: 0_level_0,rows,steps,cycles,v_min,v_max,t_min,t_max
source_file,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
RW1,2940371,7962,1321,3.194,4.209,17.06738,44.65291
RW2,260206,1570,238,3.2,4.2,-19.98637,44.15196
RW7,2925025,8050,1350,3.199,4.288,18.85598,50.95663
RW8,2783700,7586,1350,3.199,4.208,18.7332,48.77631


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Directory that is expected to contain the cleaned parquet files.
BASE_DIR = Path("/content")

# Cleaned per-dataset files to merge.
files = [
    "cleaned_RW1.parquet",
    "cleaned_RW2.parquet",
    "cleaned_RW7.parquet",
    "cleaned_RW8.parquet",
]

# Print one line per file: a check mark if it exists under BASE_DIR, a cross otherwise.
for f in files:
    print(f, "❌" if not (BASE_DIR / f).exists() else "✅")


cleaned_RW1.parquet ✅
cleaned_RW2.parquet ✅
cleaned_RW7.parquet ✅
cleaned_RW8.parquet ✅


In [None]:
# Read every cleaned parquet file from BASE_DIR, in the order given by `files`.
dfs = list(map(pd.read_parquet, (BASE_DIR / f for f in files)))

# Stack them into one frame with a fresh 0..n-1 index.
df_rw1278 = pd.concat(dfs, ignore_index=True)

# Displayed as the cell result: (rows, columns) of the merged frame.
df_rw1278.shape


(8909302, 7)

In [None]:
# Remove the time column inherited from the per-file data (if present); it is
# regenerated at the end of this cell.
df_rw1278 = df_rw1278.drop(columns=["time_s"], errors="ignore").reset_index(drop=True)

# Record each row's incoming position so it can act as the final sort key and
# make the ordering below fully deterministic.
df_rw1278["_row"] = df_rw1278.index

df_rw1278 = df_rw1278.sort_values(
    by=["source_file", "cycle_id", "step_id", "_row"],
    kind="mergesort",
)
df_rw1278 = df_rw1278.reset_index(drop=True).drop(columns="_row")

# Fresh time axis: one increasing float per row across the whole merged frame.
# NOTE(review): this makes time_s a row counter (0.0, 1.0, 2.0, ...) rather
# than a measured timestamp — confirm that downstream consumers expect that.
df_rw1278["time_s"] = np.arange(len(df_rw1278), dtype=float)


In [None]:
# Sanity checks on the rebuilt time axis of the merged frame.
ts = df_rw1278["time_s"].to_numpy()

print("Strictly increasing:", (ts[1:] > ts[:-1]).all())
print("NaN:", df_rw1278["time_s"].isna().any())
print("Duplicates:", df_rw1278["time_s"].duplicated().any())

# cycle_id is assigned separately for every source file and therefore repeats
# across files; a cycle is only identified by the (source_file, cycle_id) pair.
# Grouping by cycle_id alone would lump unrelated cycles from different files
# into one group and under-count the number of cycles checked.
df_rw1278.groupby(["source_file", "cycle_id"])["time_s"].apply(
    lambda x: (x.to_numpy()[1:] > x.to_numpy()[:-1]).all()
).value_counts()


Strictly increasing: True
NaN: False
Duplicates: False


Unnamed: 0_level_0,count
time_s,Unnamed: 1_level_1
True,1352


In [None]:
# Persist the merged frame as a single parquet file, without the index.
df_rw1278.to_parquet("RFUD_RW1_2_7_8_FINAL.parquet", index=False)
