In [None]:
# ===== STEP 2 (FIXED): LOAD RW9.mat USING SCIPY =====

from scipy.io import loadmat

FILE_PATH = "/content/RW9.mat"

mat = loadmat(FILE_PATH)

print("Top-level keys in RW9.mat:")
for k in mat.keys():
    print(" -", k)


Top-level keys in RW9.mat:
 - __header__
 - __version__
 - __globals__
 - data


In [None]:
# ===== STEP 3: INSPECT 'data' STRUCTURE =====

data = mat["data"]

print("Type of data:", type(data))
print("Shape of data:", data.shape)
print("Dtype of data:", data.dtype)

print("\nFields inside data struct:")
print(data.dtype.names)



Type of data: <class 'numpy.ndarray'>
Shape of data: (1, 1)
Dtype of data: [('step', 'O'), ('procedure', 'O'), ('description', 'O')]

Fields inside data struct:
('step', 'procedure', 'description')


In [None]:
# ===== STEP 4: INSPECT 'step' STRUCTURE =====

step = data["step"][0, 0]

print("Type of step:", type(step))
print("Shape of step:", step.shape)
print("Dtype of step:", step.dtype)

print("\nFields inside each step:")
print(step.dtype.names)


Type of step: <class 'numpy.ndarray'>
Shape of step: (1, 113578)
Dtype of step: [('comment', 'O'), ('type', 'O'), ('time', 'O'), ('relativeTime', 'O'), ('voltage', 'O'), ('current', 'O'), ('temperature', 'O'), ('date', 'O')]

Fields inside each step:
('comment', 'type', 'time', 'relativeTime', 'voltage', 'current', 'temperature', 'date')


In [None]:
# ===== STEP 5: INSPECT ONE STEP =====

import numpy as np

step0 = step[0, 0]   # first step

print("Step fields:")
for k in step0.dtype.names:
    val = step0[k]
    print(f"{k:15s} | type: {type(val)} | length:",
          len(val) if hasattr(val, "__len__") else "scalar")

print("\nStep type example:", step0["type"])
print("Voltage sample:", step0["voltage"][:5])
print("Current sample:", step0["current"][:5])
print("Temperature sample:", step0["temperature"][:5])
print("Relative time sample:", step0["relativeTime"][:5])


Step fields:
comment         | type: <class 'numpy.ndarray'> | length: 1
type            | type: <class 'numpy.ndarray'> | length: 1
time            | type: <class 'numpy.ndarray'> | length: 1
relativeTime    | type: <class 'numpy.ndarray'> | length: 1
voltage         | type: <class 'numpy.ndarray'> | length: 1
current         | type: <class 'numpy.ndarray'> | length: 1
temperature     | type: <class 'numpy.ndarray'> | length: 1
date            | type: <class 'numpy.ndarray'> | length: 1

Step type example: ['C']
Voltage sample: [[3.838 3.865 3.878 ... 4.2   4.2   4.2  ]]
Current sample: [[-2.007 -2.    -2.    ... -0.013 -0.013 -0.01 ]]
Temperature sample: [[24.3853  24.3853  24.29163 ... 22.12163 22.16847 22.10602]]
Relative time sample: [[4.000000e-02 1.004000e+01 2.004000e+01 ... 1.049004e+04 1.050004e+04
  1.050466e+04]]


In [None]:
# ===== STEP 6: CLASSIFY ALL STEPS =====

step_types = {"C": 0, "D": 0, "R": 0, "OTHER": 0}

for i in range(step.shape[1]):
    s = step[0, i]
    t = s["type"][0]

    if t in step_types:
        step_types[t] += 1
    else:
        step_types["OTHER"] += 1

print("Step type counts:")
for k, v in step_types.items():
    print(f"{k}: {v}")


Step type counts:
C: 28313
D: 28487
R: 56778
OTHER: 0


In [None]:
# ===== STEP 7: EXTRACT DISCHARGE STEPS =====

import pandas as pd
import numpy as np

rows = []

for i in range(step.shape[1]):
    s = step[0, i]
    step_type = s["type"][0]

    if step_type != "D":
        continue

    t = s["relativeTime"].flatten()
    v = s["voltage"].flatten()
    c = s["current"].flatten()
    temp = s["temperature"].flatten()

    # safety check
    n = min(len(t), len(v), len(c), len(temp))

    for j in range(n):
        rows.append({
            "step_id": i,
            "time_s": float(t[j]),
            "voltage_V": float(v[j]),
            "current_A": float(c[j]),
            "temperature_C": float(temp[j])
        })

df_discharge = pd.DataFrame(rows)

print("Discharge dataframe shape:", df_discharge.shape)
print(df_discharge.head())


Discharge dataframe shape: (3908910, 5)
   step_id  time_s  voltage_V  current_A  temperature_C
0        1    0.04      4.113      1.004       22.10602
1        1   10.04      4.099      1.000       22.05919
2        1   20.04      4.091      1.000       22.04357
3        1   30.04      4.085      1.000       22.01235
4        1   40.04      4.079      1.000       21.98113


In [None]:
# ===== STEP 8: ASSIGN CYCLE ID =====

# map each discharge step_id to a cycle number
discharge_steps = df_discharge["step_id"].unique()
step_to_cycle = {step: idx+1 for idx, step in enumerate(discharge_steps)}

df_discharge["cycle_id"] = df_discharge["step_id"].map(step_to_cycle)

print("Total discharge cycles:", df_discharge["cycle_id"].nunique())
print(df_discharge.head())


Total discharge cycles: 28487
   step_id  time_s  voltage_V  current_A  temperature_C  cycle_id
0        1    0.04      4.113      1.004       22.10602         1
1        1   10.04      4.099      1.000       22.05919         1
2        1   20.04      4.091      1.000       22.04357         1
3        1   30.04      4.085      1.000       22.01235         1
4        1   40.04      4.079      1.000       21.98113         1


In [None]:
FILE_PATH = "/content/RW10.mat"   # adjust if path differs

mat = loadmat(FILE_PATH)

print("Top-level keys:")
for k in mat.keys():
    print(" -", k)

Top-level keys:
 - __header__
 - __version__
 - __globals__
 - data


In [None]:
# ===== RW10: STEP 2 â€” INSPECT DATA STRUCT =====

data = mat["data"]

print("Type:", type(data))
print("Shape:", data.shape)
print("Fields:", data.dtype.names)


Type: <class 'numpy.ndarray'>
Shape: (1, 1)
Fields: ('step', 'procedure', 'description')


In [None]:
# ===== RW10: STEP 3 â€” INSPECT STEP =====

step = data["step"][0, 0]

print("Step type:", type(step))
print("Step shape:", step.shape)
print("Step fields:", step.dtype.names)


Step type: <class 'numpy.ndarray'>
Step shape: (1, 110818)
Step fields: ('comment', 'type', 'time', 'relativeTime', 'voltage', 'current', 'temperature', 'date')


In [None]:
# ===== RW10: STEP 4 â€” COUNT STEP TYPES =====

step_types = {"C": 0, "D": 0, "R": 0, "OTHER": 0}

for i in range(step.shape[1]):
    t = step[0, i]["type"][0]
    if t in step_types:
        step_types[t] += 1
    else:
        step_types["OTHER"] += 1

print("Step type counts:")
for k, v in step_types.items():
    print(k, ":", v)


Step type counts:
C : 27602
D : 27819
R : 55397
OTHER : 0


In [None]:
# ===== RW10: STEP 5 â€” EXTRACT DISCHARGE =====

import pandas as pd
import numpy as np

rows = []

for i in range(step.shape[1]):
    s = step[0, i]
    if s["type"][0] != "D":
        continue

    t = s["relativeTime"].flatten()
    v = s["voltage"].flatten()
    c = s["current"].flatten()
    temp = s["temperature"].flatten()

    n = min(len(t), len(v), len(c), len(temp))

    for j in range(n):
        rows.append({
            "step_id": i,
            "time_s": float(t[j]),
            "voltage_V": float(v[j]),
            "current_A": float(c[j]),
            "temperature_C": float(temp[j])
        })

df_rw10 = pd.DataFrame(rows)

print("RW10 discharge shape:", df_rw10.shape)
print(df_rw10.head())


RW10 discharge shape: (3950492, 5)
   step_id  time_s  voltage_V  current_A  temperature_C
0        1    0.04      4.116      1.005       22.72138
1        1   10.04      4.102      1.000       20.09283
2        1   20.04      4.094      1.000       20.10838
3        1   30.04      4.088      1.000       20.12394
4        1   40.04      4.082      1.000       20.13949


In [None]:
# ===== RW10: STEP 6 â€” ASSIGN CYCLE ID =====

discharge_steps = df_rw10["step_id"].unique()
step_to_cycle = {step: idx + 1 for idx, step in enumerate(discharge_steps)}

df_rw10["cycle_id"] = df_rw10["step_id"].map(step_to_cycle)

print("Total discharge cycles:", df_rw10["cycle_id"].nunique())
print(df_rw10.head())


Total discharge cycles: 27819
   step_id  time_s  voltage_V  current_A  temperature_C  cycle_id
0        1    0.04      4.116      1.005       22.72138         1
1        1   10.04      4.102      1.000       20.09283         1
2        1   20.04      4.094      1.000       20.10838         1
3        1   30.04      4.088      1.000       20.12394         1
4        1   40.04      4.082      1.000       20.13949         1


In [None]:
# ===== RW10: STEP 7 â€” BASIC CLEANING =====

df_rw10 = df_rw10[df_rw10["voltage_V"] > 0]

df_rw10 = df_rw10.sort_values(
    by=["cycle_id", "time_s"]
).reset_index(drop=True)

print("After cleaning shape:", df_rw10.shape)


After cleaning shape: (3950492, 6)


In [None]:
# ===== RW10: STEP 8 â€” SAVE =====

df_rw10.to_parquet(
    "/content/RFUD_RW10_discharge.parquet",
    index=False
)

print("âœ” Saved RFUD_RW10_discharge.parquet")


âœ” Saved RFUD_RW10_discharge.parquet


In [None]:
from scipy.io import loadmat
import pandas as pd
import numpy as np

def process_rfud_mat(file_path):
    mat = loadmat(file_path)
    data = mat["data"]
    step = data["step"][0, 0]

    rows = []

    for i in range(step.shape[1]):
        s = step[0, i]
        if s["type"][0] != "D":
            continue

        t = s["relativeTime"].flatten()
        v = s["voltage"].flatten()
        c = s["current"].flatten()
        temp = s["temperature"].flatten()

        n = min(len(t), len(v), len(c), len(temp))

        for j in range(n):
            rows.append({
                "step_id": i,
                "time_s": float(t[j]),
                "voltage_V": float(v[j]),
                "current_A": float(c[j]),
                "temperature_C": float(temp[j]),
            })

    df = pd.DataFrame(rows)

    # assign cycle_id (per file)
    discharge_steps = df["step_id"].unique()
    step_to_cycle = {step: idx + 1 for idx, step in enumerate(discharge_steps)}
    df["cycle_id"] = df["step_id"].map(step_to_cycle)

    # basic cleaning
    df = df[df["voltage_V"] > 0]
    df = df.sort_values(by=["cycle_id", "time_s"]).reset_index(drop=True)

    return df


In [None]:
df_rw9  = process_rfud_mat("/content/RW9.mat")
print("RW9 done:", df_rw9.shape)

df_rw10 = process_rfud_mat("/content/RW10.mat")
print("RW10 done:", df_rw10.shape)

df_rw11 = process_rfud_mat("/content/RW11.mat")
print("RW11 done:", df_rw11.shape)

df_rw12 = process_rfud_mat("/content/RW12.mat")
print("RW12 done:", df_rw12.shape)


RW9 done: (3908910, 6)
RW10 done: (3950492, 6)
RW11 done: (3992046, 6)
RW12 done: (3938840, 6)


In [None]:
df_rfud_uniform = pd.concat(
    [df_rw9, df_rw10, df_rw11, df_rw12],
    ignore_index=True
)

# reassign GLOBAL cycle_id
df_rfud_uniform["cycle_id"] = (
    df_rfud_uniform.groupby("cycle_id").ngroup() + 1
)

print("Merged shape:", df_rfud_uniform.shape)
print("Total discharge cycles:", df_rfud_uniform["cycle_id"].nunique())


Merged shape: (15790288, 6)
Total discharge cycles: 28487


In [None]:
df_rfud_uniform.to_parquet(
    "/content/RFUD_uniform_charge_discharge.parquet",
    index=False
)

print("âœ” Saved RFUD_uniform_charge_discharge.parquet")


âœ” Saved RFUD_uniform_charge_discharge.parquet


In [None]:
df_rfud_uniform = pd.concat(
    [df_rw9, df_rw10, df_rw11, df_rw12],
    ignore_index=True
)


In [None]:
df_rfud_uniform["cycle_id"] = (
    df_rfud_uniform
    .groupby([ "cycle_id"])
    .ngroup() + 1
)

print("Total discharge cycles:",
      df_rfud_uniform["cycle_id"].nunique())


Total discharge cycles: 28487


In [None]:
# List all DataFrames currently in memory
import pandas as pd

dfs = [name for name, obj in globals().items() if isinstance(obj, pd.DataFrame)]
print("Available DataFrames:", dfs)


Available DataFrames: ['df_discharge', 'df_rw10', 'df_rw9', 'df_rw11', 'df_rw12', 'df_rfud_uniform']


In [None]:
df_all = df_rfud_uniform


In [None]:
import pandas as pd

required_cols = [
    "source_file",
    "step_id",
    "time_s",
    "voltage_V",
    "current_A",
    "temperature_C"
]

print("=== COLUMN CHECK ===")
print("Missing columns:", set(required_cols) - set(df_all.columns))
print("Extra columns:", set(df_all.columns) - set(required_cols))
print()

print("=== BASIC STATS ===")
print("Total rows:", len(df_all))
print("Unique source files:", df_all["source_file"].unique())
print()

print("=== CYCLE CHECK (BEFORE FIX) ===")
print("Unique step_id:", df_all["step_id"].nunique())
print()

# ðŸ”‘ IMPORTANT: build global cycle_id
df_all["cycle_id"] = (
    df_all.groupby(["source_file", "step_id"]).ngroup() + 1
)

print("=== CYCLE CHECK (AFTER FIX) ===")
print("Total discharge cycles:", df_all["cycle_id"].nunique())
print()

print("Cycles per file:")
print(df_all.groupby("source_file")["cycle_id"].nunique())


=== COLUMN CHECK ===
Missing columns: {'source_file'}
Extra columns: {'cycle_id'}

=== BASIC STATS ===
Total rows: 15790288


KeyError: 'source_file'

In [None]:
for name, df in {
    "df_rw9": df_rw9,
    "df_rw10": df_rw10,
    "df_rw11": df_rw11,
    "df_rw12": df_rw12,
}.items():
    print(name, "has source_file:", "source_file" in df.columns)


df_rw9 has source_file: False
df_rw10 has source_file: False
df_rw11 has source_file: False
df_rw12 has source_file: False


In [None]:
df_rw9["source_file"]  = "RW9"
df_rw10["source_file"] = "RW10"
df_rw11["source_file"] = "RW11"
df_rw12["source_file"] = "RW12"


In [None]:
for name, df in {
    "df_rw9": df_rw9,
    "df_rw10": df_rw10,
    "df_rw11": df_rw11,
    "df_rw12": df_rw12,
}.items():
    print(name, "has source_file:", "source_file" in df.columns)


df_rw9 has source_file: True
df_rw10 has source_file: True
df_rw11 has source_file: True
df_rw12 has source_file: True


In [None]:
df_all = pd.concat(
    [df_rw9, df_rw10, df_rw11, df_rw12],
    ignore_index=True
)



In [None]:
print(df_all["source_file"].value_counts())


source_file
RW11    3992046
RW10    3950492
RW12    3938840
RW9     3908910
Name: count, dtype: int64


In [None]:
df_all["cycle_id"] = (
    df_all["source_file"].astype(str) + "_" +
    df_all["step_id"].astype(str)
)



In [None]:
print("Total unique cycles:", df_all["cycle_id"].nunique())

print("\nCycles per file:")
print(
    df_all.groupby("source_file")["cycle_id"]
    .nunique()
)


Total unique cycles: 111304

Cycles per file:
source_file
RW10    27819
RW11    27441
RW12    27557
RW9     28487
Name: cycle_id, dtype: int64


In [None]:
print(df_all.columns.tolist())


['step_id', 'time_s', 'voltage_V', 'current_A', 'temperature_C', 'cycle_id', 'source_file']


In [None]:
df_all.to_parquet(
    "/content/RFUD_uniform_charge_discharge.parquet",
    index=False
)
