#初期設定

In [15]:
# Mount Google Drive so that files under /content/drive are reachable from this notebook.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#ライブラリのインポート

In [16]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import ParameterSampler

import glob
import os
import re

#編集個所 ↓

In [17]:
# (1) Location of the "ML Model" folder. Adjust "/content/drive/MyDrive/ML Model/" as needed.
#     Keep the trailing slash: sub-paths are appended to it by plain string concatenation.
path = "/content/drive/MyDrive/ML Model/"

# (2) Training data: names of the folders inside "data" that are used for training.
train_folder = ["A1", "A2", "A3"]

# (3) Test data: names of the folders inside "data" that are used for testing.
test_folder = ["A4"]

# (4) Raw-data directory for each folder named in (2) and (3): "<path>data/<folder>/".
#     The mapping is derived from the two lists above, so adding e.g. "A5" to one of them is
#     enough; a hand-maintained dict could fall out of sync and fail later with a KeyError.
#     A folder listed in both lists appears once.
directory_paths = {
    folder: path + "data/" + folder + "/"
    for folder in train_folder + test_folder
}

#編集個所↑

In [18]:
# Directory that receives the generated feature tables; created if it does not exist yet.
output_dir = f"{path}df/"
os.makedirs(name=output_dir, exist_ok=True)

#特徴量抽出

In [19]:
def _window_mean(segment, center, half_width=200):
    """Return the mean of "Rh(Ω)" over rows whose "H(Oe)" lies in center ± half_width (inclusive).

    Returns NaN when no row of ``segment`` falls inside the window.
    """
    in_window = segment["H(Oe)"].between(center - half_width, center + half_width)
    if not in_window.any():
        return np.nan
    return segment.loc[in_window, "Rh(Ω)"].mean()


def process_files(directory_path):
    """Extract one feature row per CSV file found directly in ``directory_path``.

    Each file is read as Shift_JIS with the column header on the 8th line (``header=7``)
    and at most 1001 data rows. Rows containing NaN are dropped; the first 500 remaining
    rows are treated as the forward sweep and the rest as the return sweep.

    Columns of the returned frame, in order:
      * ``ElmNo`` - integer parsed from ``ElmNo=<digits>`` in the file name (missing if absent).
      * ``交点_Rh_at_-7500`` / ``交点_Rh_at_7500`` - mean Rh within ±200 Oe of the field value,
        averaged over both sweeps (or taken from the one sweep that has data).
      * ``交点_Rh_before_500_at_{x}`` / ``交点_Rh_after_500_at_{x}`` for x in -4000, 0, 4000 -
        the same window mean, kept separately for the forward and return sweep.
      * ``max-min diff`` - max minus min of Rh on the forward sweep for -1000 < H < 1000.

    Parameters
    ----------
    directory_path : str
        Directory that contains the measurement ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        One row per file, sorted by ``ElmNo`` with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If the directory contains no ``*.csv`` file (previously this surfaced as an
        unrelated ``KeyError: 'ElmNo'`` from the final sort).
    """
    # Sorted so that the processing order does not depend on the file system's listing order.
    csv_files = sorted(glob.glob(os.path.join(directory_path, "*.csv")))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found in {directory_path!r}")

    # target_x: field values (x coordinates) at which the curve is sampled.
    target_x = [-7500, -4000, 0, 4000, 7500]
    result_data = []

    for file_path in csv_files:
        file_name = os.path.basename(file_path)
        df = pd.read_csv(file_path, header=7, encoding='shift_jis', nrows=1001)

        # Some files label the derivative column with a "[*Clip]" suffix; normalise the name.
        # rename() ignores the key when the column is absent.
        df = df.rename(columns={"dRh/dH(mΩ/Oe)[*Clip]": "dRh/dH(mΩ/Oe)"})

        # Split into forward sweep (first 500 complete rows) and return sweep (the rest).
        df_cleaned = df.dropna()
        df_before = df_cleaned.iloc[:500]
        df_after = df_cleaned.iloc[500:]

        intersections = {}
        elm_no_match = re.search(r"ElmNo=(\d+)", file_name)
        intersections["ElmNo"] = int(elm_no_match.group(1)) if elm_no_match else None

        # Sample Rh in a ±200 window around every target field value.
        for x in target_x:
            value_before = _window_mean(df_before, x)
            value_after = _window_mean(df_after, x)

            if x in (-7500, 7500):
                # At the sweep ends both sweeps are merged into a single feature.
                available = [v for v in (value_before, value_after) if pd.notna(v)]
                intersections[f"交点_Rh_at_{x}"] = np.mean(available) if available else np.nan
            else:
                intersections[f"交点_Rh_before_500_at_{x}"] = value_before
                intersections[f"交点_Rh_after_500_at_{x}"] = value_after

        # Spread of Rh around x = 0 on the forward sweep (how much the curve fluctuates there).
        range_around_zero = df_before[(df_before["H(Oe)"] > -1000) & (df_before["H(Oe)"] < 1000)]
        if range_around_zero.empty:
            intersections["max-min diff"] = np.nan
        else:
            intersections["max-min diff"] = (
                range_around_zero["Rh(Ω)"].max() - range_around_zero["Rh(Ω)"].min()
            )

        result_data.append(intersections)

    # mergesort is stable: files with equal or missing ElmNo keep their (sorted) file order.
    result_df = (
        pd.DataFrame(result_data)
        .sort_values(by="ElmNo", kind="mergesort")
        .reset_index(drop=True)
    )
    return result_df

#学習データセット

In [20]:
# Build the training feature table by stacking the per-folder feature frames.
# NOTE(review): every folder after the first loses its first row (the lowest ElmNo, since
# process_files sorts by ElmNo) through .iloc[1:]. This looks like a leftover from skipping
# CSV header rows - confirm it is intended and that train_y.csv is aligned with it.
train_frames = []
for position, folder_name in enumerate(train_folder):
    folder_features = process_files(directory_paths[folder_name])
    train_frames.append(folder_features if position == 0 else folder_features.iloc[1:])

# ElmNo only identifies the element; it is removed so that it is not used as a feature.
combined_train = pd.concat(train_frames, ignore_index=True).drop(columns=["ElmNo"])

#テストデータセット

In [21]:
# Build the test feature table by stacking the per-folder feature frames.
# NOTE(review): every folder after the first loses its first row (the lowest ElmNo, since
# process_files sorts by ElmNo) through .iloc[1:]. With a single test folder this never
# triggers - confirm it is intended before adding more test folders.
test_frames = []
for position, folder_name in enumerate(test_folder):
    folder_features = process_files(directory_paths[folder_name])
    test_frames.append(folder_features if position == 0 else folder_features.iloc[1:])

# ElmNo only identifies the element; it is removed so that it is not used as a feature.
combined_test = pd.concat(test_frames, ignore_index=True).drop(columns=["ElmNo"])

#データフレームを出力

In [22]:
# Write both feature tables to output_dir, once as CSV and once as Excel.
export_targets = (
    ("train_X.csv", "train_X.xlsx", combined_train),
    ("test_X.csv", "test_X.xlsx", combined_test),
)

# CSV (Shift_JIS encoded, without the index column)
for csv_name, _, table in export_targets:
    table.to_csv(os.path.join(output_dir, csv_name), index=False, encoding='shift_jis')

# Excel (without the index column)
for _, xlsx_name, table in export_targets:
    table.to_excel(os.path.join(output_dir, xlsx_name), index=False)

#学習/テストデータセットを変数へ格納

In [23]:
# Load the feature tables written above together with the label files from df/y_data/.
# All four files are read as Shift_JIS; the unpacking order matches the list order.
dataset_paths = [
    path + "df/train_X.csv",
    path + "df/y_data/train_y.csv",
    path + "df/test_X.csv",
    path + "df/y_data/test_y.csv",
]
train_X, train_y, test_X, test_y = [
    pd.read_csv(dataset_path, encoding="shift_jis") for dataset_path in dataset_paths
]

#ランダムサーチ

In [24]:
# Search space for the random search. Single-element lists are effectively fixed settings;
# the arrays and ranges are the values the sampler chooses from.
param_grid = dict(
    objective=['binary'],
    metric=['binary_error'],
    boosting_type=['gbdt'],
    learning_rate=np.linspace(0.01, 0.2, 20),     # 20 evenly spaced values, 0.01 .. 0.2
    feature_fraction=np.linspace(0.6, 1.0, 10),   # 10 evenly spaced values, 0.6 .. 1.0
    num_leaves=range(20, 100, 10),                # 20, 30, ..., 90
    min_data_in_leaf=range(10, 50, 5),            # 10, 15, ..., 45
    early_stopping_round=[20],
    verbose=[-1],
)

In [25]:
# Sampling: draw the hyper-parameter candidates for the random search.
# The count is kept in n_search_iter rather than `iter`, which would shadow the builtin
# iter() for every later cell of the notebook.
n_search_iter = 50
random_params = list(ParameterSampler(param_grid, n_iter=n_search_iter, random_state=1))

# Best candidate seen so far; lower score is better, so start from +inf.
# best_params is initialised too so that it is always defined when printed.
best_model = None
best_params = None
best_score = float('inf')

In [26]:
# Random search (main run): train one LightGBM model per sampled parameter set and keep
# the one with the lowest binary_error on the validation set.
# NOTE(review): the validation set used for early stopping and for picking the best model
# is the test set itself, and the accuracy metrics further down are computed on that same
# data, so they are likely optimistic. Consider a separate validation split.

# Reset the tracking state here so that re-running this cell alone starts a clean search
# and best_params is always defined when printed.
best_model = None
best_params = None
best_score = float('inf')

for params in random_params:

    # Datasets are rebuilt for every candidate, as in the original.
    # NOTE(review): presumably because dataset-level parameters (e.g. min_data_in_leaf)
    # differ per candidate - confirm before hoisting this out of the loop.
    train = lgb.Dataset(train_X, label=train_y.values.ravel())
    test = lgb.Dataset(test_X, label=test_y.values.ravel(), reference=train)

    # Training
    model = lgb.train(
        params=params,
        train_set=train,
        valid_sets=[test],
        valid_names=['valid'],
        num_boost_round=200
    )

    score = model.best_score['valid']['binary_error']
    if score < best_score:
        best_score, best_model, best_params = score, model, params

print(best_params)
print(best_score)

{'verbose': -1, 'objective': 'binary', 'num_leaves': 60, 'min_data_in_leaf': 45, 'metric': 'binary_error', 'learning_rate': 0.17, 'feature_fraction': 1.0, 'early_stopping_round': 20, 'boosting_type': 'gbdt'}
0.07142857142857142


# 判定結果出力

In [27]:
# Predict on the test set with the best model found by the random search.
# These two lines must run: the metrics cell that follows reads `pred`. With the whole cell
# commented out, a fresh kernel fails there with NameError.
pred_prob = best_model.predict(test_X, num_iteration=best_model.best_iteration)
pred = (pred_prob > 0.5).astype(int)  # class 1 when the predicted probability exceeds 0.5

# Optional export of the per-sample predictions (left disabled, as in the original).
# Uncomment to write prediction_results.csv; the target folder must already exist.
# path2 = "/content/drive/MyDrive/ML Model/pred/"

# result_df = pd.DataFrame({
#     "actual": test_y.values.ravel(),
#     "pred_prob": pred_prob,
#     "pred": pred
# })

# output_file = os.path.join(path2, "prediction_results.csv")
# result_df.to_csv(output_file, index=False)

#精度確認

In [28]:
# Report accuracy, precision, recall and F1 of the thresholded predictions on the test set.
# Each metric is computed and printed in turn, in this order.
metric_table = (
    ("正解率", accuracy_score),
    ("適合率", precision_score),
    ("再現率", recall_score),
    ("F値", f1_score),
)
for metric_label, metric_fn in metric_table:
    print(f"{metric_label} : {metric_fn(test_y, pred)}")

正解率 : 0.9285714285714286
適合率 : 0.875
再現率 : 0.9245283018867925
F値 : 0.8990825688073395
