In [43]:
from pathlib import Path
import pandas as pd
import numpy as np


In [44]:

# ============================================================
# 0) The only thing to adjust: where the data folders live
# ============================================================
# Paths are built from individual components instead of backslash-separated
# strings, so they do not depend on the host's path separator. On Windows they
# resolve to the same locations as the previous ".\data\..." spellings.
DATA_ROOT = Path("data") / "test_3_SKAB" / "data"
OUT_DIR = Path("data") / "test_5" / "SKAB"
# Create the output folder (and any missing parents); a no-op if it already exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ============================================================
# 1) Lenient CSV reader (delimiter is sniffed, several encodings are tried)
# ============================================================
def read_csv_easy(path: Path) -> pd.DataFrame:
    """Read a CSV file, trying a list of candidate encodings in order.

    The delimiter is not fixed: ``sep=None`` with ``engine="python"`` lets
    pandas infer it. Leading/trailing whitespace is stripped from every
    column name (column names are converted to ``str`` first).

    Parameters
    ----------
    path : Path
        Location of the CSV file.

    Returns
    -------
    pd.DataFrame
        The frame produced by the first encoding that reads without error.

    Raises
    ------
    RuntimeError
        If every candidate encoding fails. The error raised by the last
        attempt is attached as ``__cause__`` so the real reason (missing
        file, decode error, parse error, ...) stays visible in the traceback.
    """
    last_error = None
    for enc in ["utf-8-sig", "utf-8", "cp932", "shift_jis"]:
        try:
            # sep=None + engine="python" makes pandas guess the delimiter
            df = pd.read_csv(path, sep=None, engine="python", encoding=enc)
        except Exception as exc:
            # Best-effort: remember why this encoding failed and try the next one.
            last_error = exc
            continue
        # Strip surrounding spaces from column names (a common cause of lookup misses)
        df.columns = [str(c).strip() for c in df.columns]
        return df
    raise RuntimeError(f"読めませんでした: {path}") from last_error

# ============================================================
# 2) Gather the training and test data automatically
#    - train: anomaly-free/anomaly-free.csv
#    - test : every CSV under valve1/, valve2/ and other/, concatenated
#    - feature_cols: numeric columns of train, minus time-like ones
# ============================================================
train_path = DATA_ROOT.joinpath("anomaly-free", "anomaly-free.csv")
df_train = read_csv_easy(train_path)

# Collect the CSV paths folder by folder; within a folder they are sorted.
test_files = []
for sub in ["valve1", "valve2", "other"]:
    test_files.extend(sorted(DATA_ROOT.joinpath(sub).glob("*.csv")))

# One DataFrame per test file, in the same order as test_files.
df_test_list = list(map(read_csv_easy, test_files))
if df_test_list:
    df_test_all = pd.concat(df_test_list, ignore_index=True)
else:
    # No test files found: fall back to an empty frame
    df_test_all = pd.DataFrame()

# ---- Choose the feature columns automatically (train is the reference)
time_like = ("time", "timestamp", "date")
num_cols = list(df_train.select_dtypes(include=[np.number]).columns)
# Keep a numeric column only if its lower-cased name contains none of the time-like keywords.
feature_cols = [c for c in num_cols if all(k not in c.lower() for k in time_like)]

print(f"train rows={len(df_train)}, test files={len(test_files)}, test rows={len(df_test_all)}")
print(f"features ({len(feature_cols)}): {feature_cols}")


train rows=9405, test files=34, test rows=37401
features (8): ['Accelerometer1RMS', 'Accelerometer2RMS', 'Current', 'Pressure', 'Temperature', 'Thermocouple', 'Voltage', 'Volume Flow RateRMS']


In [45]:

# ============================================================
# 3) Drop test files whose columns do not line up with the features
#    (keep-it-simple policy: only fully matching files are used)
# ============================================================
ok_parts = []   # frames that contain every feature column
skipped = []    # (file path as str, list of missing feature columns)
for p, d in zip(test_files, df_test_list):
    missing = [c for c in feature_cols if c not in d.columns]
    if not missing:
        ok_parts.append(d)
    else:
        skipped.append((str(p), missing))

if ok_parts:
    df_test = pd.concat(ok_parts, ignore_index=True)
else:
    # Nothing usable: fall back to an empty frame
    df_test = pd.DataFrame()


In [46]:
# ============================================================
# 2.5) Split the normal data (anomaly-free) into a "training" part and a
#      "normal test" part, with no overlap between the two
# ============================================================

N_total = len(df_train)

# Number of normal rows used for training (here: 60% of all rows, truncated to int)
N_TRAIN_NORMAL = int(N_total * 0.6)

# Number of normal rows that go into the test set: whatever remains, capped at 5000
N_TEST_NORMAL = min(N_total - N_TRAIN_NORMAL, 5000)

# Non-overlapping split in row order: the first N_TRAIN_NORMAL rows are for training,
# the next N_TEST_NORMAL rows are the normal part of the test set.
# Both parts are new frames with the label column set to 0.
df_train_normal = df_train.head(N_TRAIN_NORMAL).assign(__label__=0)
df_test_normal = df_train.iloc[N_TRAIN_NORMAL:].head(N_TEST_NORMAL).assign(__label__=0)

print("df_train_normal:", df_train_normal.shape)
print("df_test_normal :", df_test_normal.shape)


df_train_normal: (5643, 10)
df_test_normal : (3762, 10)


In [47]:

# ============================================================
# 3.5) Build the mixed test set: held-out normal rows (df_test_normal)
#      followed by the rows loaded from the test files (df_test)
# ============================================================

df_test_abn = df_test.copy()
if "anomaly" in df_test_abn.columns:
    # The test files carry a per-row "anomaly" column, so use it as the label
    # instead of marking every row of those files as anomalous.
    # NOTE(review): assumes "anomaly" is a 0/1 ground-truth flag - confirm against
    # the dataset description. Rows where the flag is missing or non-numeric keep
    # the previous behaviour (label 1).
    df_test_abn["__label__"] = (
        pd.to_numeric(df_test_abn["anomaly"], errors="coerce")
        .fillna(1)
        .ne(0)
        .astype(int)
    )
else:
    # No per-row flag available: treat every row as anomalous.
    df_test_abn["__label__"] = 1

df_test_mix = pd.concat([df_test_normal, df_test_abn], ignore_index=True)

print("df_test_abn   :", df_test_abn.shape)
print("df_test_mix   :", df_test_mix.shape)

# Output columns = the automatically selected features plus the label.
# Reusing feature_cols keeps this list in sync with the feature selection
# instead of repeating the column names by hand.
cols = [*feature_cols, "__label__"]

# .copy() gives independent frames rather than selections of the source frames.
X_train_df = df_train_normal[cols].copy()
X_test_df = df_test_mix[cols].copy()

print("X_train_df:", X_train_df.shape)
print("X_test_df :", X_test_df.shape)



df_test_abn   : (37401, 12)
df_test_mix   : (41163, 12)
X_train_df: (5643, 9)
X_test_df : (41163, 9)


In [48]:
# Show the column names of the frame built from the test files.
print(list(df_test_abn.columns))

['datetime', 'Accelerometer1RMS', 'Accelerometer2RMS', 'Current', 'Pressure', 'Temperature', 'Thermocouple', 'Voltage', 'Volume Flow RateRMS', 'anomaly', 'changepoint', '__label__']


In [49]:

# ============================================================
# 4) Write X_train / X_test (DataFrames) out as CSV files
# ============================================================
X_train_out = OUT_DIR.joinpath("X_train.csv")
X_test_out = OUT_DIR.joinpath("X_test.csv")

# Both files are written without the index column.
X_train_df.to_csv(X_train_out, index=False, encoding="utf-8-sig")
X_test_df.to_csv(X_test_out, index=False, encoding="utf-8-sig")

print("saved:", X_train_out)
print("saved:", X_test_out)

# Report test files that were left out because feature columns were missing
# (at most the first 10 entries).
if skipped:
    print("\n[skipped test files (missing columns)] first 10:")
    for f, miss in skipped[:10]:
        print(" -", f)
        print("   missing:", miss)

# From here on, X_train_df and X_test_df are the two frames to work with.
print("X_train_df shape:", X_train_df.shape)
print("X_test_df  shape:", X_test_df.shape)


saved: data\test_5\SKAB\X_train.csv
saved: data\test_5\SKAB\X_test.csv
X_train_df shape: (5643, 9)
X_test_df  shape: (41163, 9)
