In [99]:
import japanize_matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from adjustText import adjust_text
from pandas import json_normalize
from pysr import PySRRegressor # 初回のimportでは時間がかかる
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [100]:
plt.rcParams["font.size"] = 20
plt.rcParams["axes.labelsize"] = 20
plt.rcParams["xtick.labelsize"] = 20
plt.rcParams["ytick.labelsize"] = 20
plt.rcParams["axes.titlesize"] = 20
plt.rcParams["font.family"] = "IPAexGothic"  #'Times New Roman'

In [101]:
class CFG:
    input_dir = "../input"
    output_dir = "../output"
    random_seed = 42
    exp_no = "symboric"
    target = "粘度"

In [102]:
# データフレームの読み込み
train_df = pd.read_csv(f"{CFG.input_dir}/chapter2_demo_data_postprocess.csv")
# 備考にrefと書かれているデータを削除
train_df = train_df[train_df["備考"] != "ref"]
# インデックスを振り直す
train_df = train_df.reset_index(drop=True)

train_df

Unnamed: 0,サンプルID,サンプル作成日,担当者,備考,材料１,材料２,材料３,材料４,材料５,材料６,...,乾燥温度,乾燥時間,擦過回数,擦過圧力,耐擦過性(n10,画像濃度,粘度,表面張力,保存後粘度,保存後状態
0,DP_001,45310,高岡,,46.9,40,0,2,0,2.0,...,25,,10,5,2.4,1.06,9.0,35,,
1,DP_002,45310,高岡,,47.4,40,0,2,0,1.5,...,25,,10,5,2.4,1.07,9.0,38,,
2,DP_003,45310,高岡,,47.7,40,0,2,0,1.2,...,25,,10,5,2.4,1.07,9.0,40,,
3,DP_004,45310,高岡,,47.9,40,0,2,0,1.0,...,25,,10,5,2.4,1.08,9.0,41,,
4,DP_005,45310,高岡,,48.4,40,0,2,0,0.5,...,25,,10,5,2.4,1.09,9.0,44,,
5,DP_007,45321,福原,,47.4,40,0,2,0,0.0,...,25,,10,5,2.7,1.05,9.1,37,,
6,DP_008,45321,福原,,47.4,40,0,2,0,0.0,...,25,,10,5,2.7,1.02,9.1,39,,
7,DP_009,45321,福原,,45.4,40,0,2,0,1.5,...,25,,10,5,3.1,1.07,10.3,36,,
8,DP_010,45321,福原,,43.4,40,0,2,0,1.5,...,25,,10,5,3.6,1.07,11.5,34,,
9,DP_011,45321,福原,,41.4,40,0,2,0,1.5,...,25,,10,5,4.0,1.07,12.7,32,,


In [103]:
# train_dfの列名をリストで取得
columns = train_df.columns
# train_dfの列名を表示
columns

Index(['サンプルID', 'サンプル作成日', '担当者', '備考', '材料１', '材料２', '材料３', '材料４', '材料５',
       '材料６', '材料７', '材料８', '材料９', '材料１０', '材料１１', '材料１２', '材料１３', '塗布量',
       '乾燥方式', '乾燥温度', '乾燥時間', '擦過回数', '擦過圧力', '耐擦過性(n10', '画像濃度', '粘度',
       '表面張力', '保存後粘度', '保存後状態'],
      dtype='object')

In [104]:
# 欠損のある列を表示
train_df.isnull().sum()

サンプルID       0
サンプル作成日      0
担当者          0
備考          30
材料１          0
材料２          0
材料３          0
材料４          0
材料５          0
材料６          0
材料７          0
材料８          0
材料９          0
材料１０         0
材料１１         0
材料１２         0
材料１３         0
塗布量          0
乾燥方式         0
乾燥温度         0
乾燥時間        13
擦過回数         0
擦過圧力         0
耐擦過性(n10     0
画像濃度         0
粘度           0
表面張力         0
保存後粘度       24
保存後状態       24
dtype: int64

In [105]:
numeric_parameter_cols = [
    # 'サンプルID', 'サンプル名', 'サンプル作成日', '担当者', '備考',
    "材料１",
    "材料２",
    "材料３",
    "材料４",
    "材料５",
    "材料６",
    "材料７",
    "材料８",
    "材料９",
    "材料１０",
    "材料１１",
    "材料１２",
    "材料１３",
    # "塗布量",
    # "乾燥方式",
    # "乾燥温度",
    # '乾燥時間',
    # "擦過回数",
    # "擦過圧力",
    # '耐擦過性(n10','画像濃度', '粘度', '表面張力', '保存後粘度'
]

In [106]:
# 変数名が日本語だとエラーが出るため、変数名を英語に変換、材料１〜材料１３をmaterial1〜material13に変換、粘度をviscosityに変換
train_df.columns = [
    "sample_id",
    "sample_name",
    "sample_creation_date",
    "person_in_charge",
    "remarks",
    "material1",
    "material2",
    "material3",
    "material4",
    "material5",
    "material6",
    "material7",
    "material8",
    "material9",
    "material10",
    "material11",
    "material12",
    "material13",
    "coating_amount",
    "drying_method",
    "drying_temperature",
    "drying_time",
    "abrasion_count",
    "abrasion_pressure",
    "abrasion_resistance",
    "image_density",
    "viscosity",
    "surface_tension",
    "viscosity_after_storage",
]

train_df

Unnamed: 0,sample_id,sample_name,sample_creation_date,person_in_charge,remarks,material1,material2,material3,material4,material5,...,drying_method,drying_temperature,drying_time,abrasion_count,abrasion_pressure,abrasion_resistance,image_density,viscosity,surface_tension,viscosity_after_storage
0,DP_001,45310,高岡,,46.9,40,0,2,0,2.0,...,25,,10,5,2.4,1.06,9.0,35,,
1,DP_002,45310,高岡,,47.4,40,0,2,0,1.5,...,25,,10,5,2.4,1.07,9.0,38,,
2,DP_003,45310,高岡,,47.7,40,0,2,0,1.2,...,25,,10,5,2.4,1.07,9.0,40,,
3,DP_004,45310,高岡,,47.9,40,0,2,0,1.0,...,25,,10,5,2.4,1.08,9.0,41,,
4,DP_005,45310,高岡,,48.4,40,0,2,0,0.5,...,25,,10,5,2.4,1.09,9.0,44,,
5,DP_007,45321,福原,,47.4,40,0,2,0,0.0,...,25,,10,5,2.7,1.05,9.1,37,,
6,DP_008,45321,福原,,47.4,40,0,2,0,0.0,...,25,,10,5,2.7,1.02,9.1,39,,
7,DP_009,45321,福原,,45.4,40,0,2,0,1.5,...,25,,10,5,3.1,1.07,10.3,36,,
8,DP_010,45321,福原,,43.4,40,0,2,0,1.5,...,25,,10,5,3.6,1.07,11.5,34,,
9,DP_011,45321,福原,,41.4,40,0,2,0,1.5,...,25,,10,5,4.0,1.07,12.7,32,,


In [107]:
# numeric_parameter_colsも英語に変換
numeric_parameter_cols = [
    "material1",
    "material2",
    "material3",
    "material4",
    "material5",
    "material6",
    "material7",
    "material8",
    "material9",
    "material10",
    "material11",
    "material12",
    "material13",
]

# CFG.targetをviscosityに変更
CFG.target = "viscosity"

In [122]:
# シンボリック回帰のモデルを作成
model = PySRRegressor()

# シンボリック回帰のモデルをtrain_dfで学習
model.fit(train_df[numeric_parameter_cols], train_df[CFG.target])

# シンボリック回帰のモデルを使って予測
y_pred_train = model.predict(train_df[numeric_parameter_cols])
# y_pred_valid = model.predict(test_df[numeric_parameter_cols])

# MAEを計算
mae_train = mean_absolute_error(train_df[CFG.target], y_pred_train)
# mae_valid = mean_absolute_error(test_df[CFG.target], y_pred_valid)

# RMSEを計算
rmse_train = np.sqrt(mean_squared_error(train_df[CFG.target], y_pred_train))
# rmse_valid = np.sqrt(mean_squared_error(test_df[CFG.target], y_pred_valid))

# R2を計算
r2_train = r2_score(train_df[CFG.target], y_pred_train)
# r2_valid = r2_score(test_df[CFG.target], y_pred_valid)

# 結果を表示
print(f"train MAE: {mae_train:.4f}")
# print(f"valid MAE: {mae_valid:.4f}")
print(f"train RMSE: {rmse_train:.4f}")
# print(f"valid RMSE: {rmse_valid:.4f}")
print(f"train R2: {r2_train:.4f}")
# print(f"valid R2: {r2_valid:.4f}")

[ Info: Started!


train MAE: 1.5244
train RMSE: 2.2351
train R2: 0.5113


In [123]:
model

In [124]:
model.get_best()

complexity                                                       5
loss                                                      4.995554
score                                                     0.294351
equation                     (42.266666 - material10) - material11
sympy_format                  -material10 - material11 + 42.266666
lambda_format    PySRFunction(X=>-material10 - material11 + 42....
Name: 2, dtype: object

In [126]:
model.equations_["equation"][8]

'((((material6 + material7) + material5) * -2.8930063) + 0.19284849) + ((46.336163 - material6) - (material11 + material10))'

In [125]:
# model.equations_["equation"][2]
model.get_best()["equation"]

'(42.266666 - material10) - material11'