In [67]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from rdkit import Chem
from rdkit.Chem import Descriptors
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [60]:
# Names of all RDKit descriptors, in the order they appear in Descriptors.descList.
# Each descList entry starts with the descriptor name; only that first field is kept.
descriptor_names = [name for name, *_ in Descriptors.descList]

# Convert a SMILES string to its canonical form
def canonicalize_smiles(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or None if it is unusable.

    Parameters
    ----------
    smiles : str
        SMILES string to canonicalize.

    Returns
    -------
    str or None
        The result of ``Chem.MolToSmiles(mol, canonical=True)``, or None when
        ``smiles`` is not a string (e.g. a missing value read from a CSV) or
        when RDKit cannot parse it into a molecule.
    """
    # Missing cells show up as None or float NaN; the RDKit parser expects a
    # string, so treat anything else as "no molecule" instead of letting it raise.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    # A failed parse is signalled by None, so test for None explicitly rather
    # than relying on the truthiness of the Mol object.
    if mol is None:
        return None
    return Chem.MolToSmiles(mol, canonical=True)

# Compute every RDKit descriptor for one molecule
def compute_all_descriptors(smiles):
    """Compute all descriptors listed in ``descriptor_names`` for one SMILES.

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule.

    Returns
    -------
    list or None
        One value per entry of ``descriptor_names`` (same order), or None when
        ``smiles`` is not a string or RDKit cannot parse it.
    """
    # canonicalize_smiles() yields None for unparsable molecules and CSV gaps
    # arrive as NaN; bail out before handing a non-string to the RDKit parser.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    # Look each descriptor function up by name on the Descriptors module.
    return [getattr(Descriptors, name)(mol) for name in descriptor_names]

In [None]:
#--------------------------
# Preprocessing
#--------------------------

# Load the data
df = pd.read_csv("data/iupac_high-confidence_v2_2.csv")
print(df.shape)

# Strip the qualifier symbols (>, <, ~) in a single pass.
# NOTE(review): after this step a bounded value such as ">12" is treated as exactly 12.
df["pka_value"] = (
    df["pka_value"].astype(str).str.replace(r"[<>~]", "", regex=True).str.strip()
)

# Convert to numbers (anything that cannot be parsed becomes NaN)
df["pka_value"] = pd.to_numeric(df["pka_value"], errors='coerce')

# Drop the rows whose pKa could not be parsed. The explicit .copy() makes df an
# independent frame so the column assignments below do not trigger
# SettingWithCopyWarning.
df = df.dropna(subset=["pka_value"]).copy()

# Make sure pKa is stored as float
df["pka_value"] = df["pka_value"].astype(float)

# Add the new column "canonical_SMILES"
df["canonical_SMILES"] = df["SMILES"].apply(canonicalize_smiles)

# Check the data
print(df.shape)

# Count duplicated canonical SMILES
duplicates = df["canonical_SMILES"].duplicated().sum()
print(f"🔍 重複している分子の数: {duplicates}")

# Remove molecules that failed to canonicalize (canonical_SMILES is missing),
# then drop duplicates, keeping only the first record of each molecule.
# Without the dropna, one missing-SMILES row would survive de-duplication and
# break descriptor computation downstream.
# NOTE(review): keep="first" discards any further pKa values recorded for the
# same molecule.
df_unique = (
    df.dropna(subset=["canonical_SMILES"])
    .drop_duplicates(subset="canonical_SMILES", keep="first")
)

# Check the data
print(f"✅ 重複除去後のデータ数: {df_unique.shape[0]}")
print(df.shape,df_unique.shape)

(24222, 21)
(24199, 22)
🔍 重複している分子の数: 13512
✅ 重複除去後のデータ数: 10687
(24199, 22) (10687, 22)


In [None]:
# Compute one descriptor row per unique molecule and collect the rows into a DataFrame
X = pd.DataFrame(
    list(map(compute_all_descriptors, df_unique["canonical_SMILES"])),
    columns=descriptor_names,
)
# Append the pKa target positionally: X has a fresh 0..n-1 index, so the raw
# values are used rather than the index-carrying Series from df_unique.
X = X.assign(pKa=df_unique["pka_value"].values)
display(X.head())




Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,pKa
0,6.277778,6.277778,0.166667,0.166667,0.297364,6.25,58.084,52.036,58.053098,24,...,0,0,0,0,0,0,0,0,0,12.4
1,4.5,4.5,1.5,1.5,0.384658,3.0,31.058,26.018,31.042199,14,...,0,0,0,0,0,0,0,0,0,10.657
2,9.465278,9.465278,0.055556,-0.467593,0.394253,7.4,74.083,68.035,74.048013,30,...,0,0,0,0,0,0,0,0,0,7.95
3,9.819444,9.819444,0.045139,-0.351852,0.196057,7.666667,89.098,82.042,89.058912,36,...,0,0,0,0,0,0,0,0,0,7.69
4,7.5,7.5,0.125,0.125,0.379084,5.0,56.068,52.036,56.037448,22,...,0,0,0,0,0,0,0,0,0,5.34


In [50]:
# Standardize the descriptor columns only; the pKa target is left unscaled.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed further down, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.loc[:, X.columns != "pKa"])

# Rebuild a labelled DataFrame from the scaled array and re-attach the target
X_scaled_df = pd.DataFrame(X_scaled, columns=descriptor_names).assign(pKa=X["pKa"])
display(X_scaled_df.head())

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,pKa
0,-0.838742,-0.838742,-0.566061,0.477775,-1.945845,-1.044799,-1.771145,-1.772523,-1.771375,-1.712793,...,-0.102118,-0.084035,-0.053735,-0.038722,-0.078226,-0.016757,-0.07567,-0.122112,-0.104882,12.4
1,-1.443548,-1.443548,2.920097,1.431139,-1.329124,-1.554287,-2.123637,-2.125234,-2.124226,-2.080298,...,-0.102118,-0.084035,-0.053735,-0.038722,-0.078226,-0.016757,-0.07567,-0.122112,-0.104882,10.657
2,0.245657,0.245657,-0.856574,0.024266,-1.261336,-0.864518,-1.562474,-1.555633,-1.562428,-1.492291,...,-0.102118,-0.084035,-0.053735,-0.038722,-0.078226,-0.016757,-0.07567,-0.122112,-0.104882,7.95
3,0.366145,0.366145,-0.88381,0.107023,-2.66156,-0.822714,-1.366638,-1.365748,-1.366337,-1.271788,...,-0.102118,-0.084035,-0.053735,-0.038722,-0.078226,-0.016757,-0.07567,-0.122112,-0.104882,7.69
4,-0.422938,-0.422938,-0.675004,0.447983,-1.368508,-1.240756,-1.797439,-1.772523,-1.797706,-1.786294,...,-0.102118,-0.084035,-0.053735,-0.038722,-0.078226,-0.016757,-0.07567,-0.122112,-0.104882,5.34


In [61]:
# Boolean mask over rows: True where the row has a NaN in at least one column
nan_rows = X_scaled_df.isnull().any(axis="columns")

In [63]:
# Attach the canonical SMILES positionally (raw array, so no index alignment happens).
# This adds a string column to X_scaled_df in place.
X_scaled_df["canonical_SMILES"] = df_unique["canonical_SMILES"].to_numpy()


In [66]:
# Keep only the rows that have no missing value in any column
X_scaled_df_cleaned = X_scaled_df.dropna(axis="index", how="any")

# Split the surviving rows into the target and the descriptor matrix
y_cleaned = X_scaled_df_cleaned.loc[:, "pKa"]
X_cleaned = X_scaled_df_cleaned.drop(columns=["pKa", "canonical_SMILES"])

print(f"✅ NaN を削除したデータ数: {len(X_cleaned)}")

✅ NaN を削除したデータ数: 10569


In [71]:
# Compute the correlation matrix over the numeric columns (descriptors + pKa)
corr_matrix = X_scaled_df_cleaned.drop(columns=["canonical_SMILES"]).corr()

# Rank descriptors by absolute correlation with pKa.
# The target is removed by label instead of assuming it sorts into position 0,
# and descriptors with an undefined (NaN) correlation are excluded from the ranking.
# NOTE(review): the ranking uses every row, including those later held out for
# testing, so the selection step has seen the test data.
pka_abs_corr = corr_matrix["pKa"].drop(labels="pKa").abs().dropna()
top_features = pka_abs_corr.sort_values(ascending=False).index[:20]  # top 20

# Use only the selected features
X_selected = X_scaled_df_cleaned[top_features]
y = X_scaled_df_cleaned["pKa"]

In [72]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, random_state=42, test_size=0.2
)

# RBF-kernel support vector regressor (fit() returns the fitted estimator itself)
model_svm = SVR(C=10, gamma="scale", kernel="rbf").fit(X_train, y_train)

# Predict pKa for the held-out rows
y_pred_svm = model_svm.predict(X_test)

# Report mean squared error and R² on the held-out rows
print(
    "SVM - MSE: {:.3f}, R²: {:.3f}".format(
        mean_squared_error(y_test, y_pred_svm),
        r2_score(y_test, y_pred_svm),
    )
)


SVM - MSE: 9.673, R²: 0.447


In [None]:
# Select the top-ranked descriptors together with the target column.
# Column selection needs one flat list of labels; indexing with
# `[top_features, "pKa"]` passes a tuple and raises instead of selecting columns.
# Note that X_selected now contains the target "pKa" as well as the features.
X_selected = X_scaled_df_cleaned[[*top_features, "pKa"]]


In [None]:
# NOTE(review): the star import is kept because cells outside this view may use
# other PyCaret names; prefer explicit imports in the notebook's import cell.
from pycaret.regression import *

# PyCaret setup (numeric columns only).
# X_scaled_df may carry the string column "canonical_SMILES" added in an earlier
# cell; select_dtypes keeps the numeric descriptors and the pKa target so the
# SMILES text is not passed to PyCaret as a feature.
regression_setup = setup(
    data=X_scaled_df.select_dtypes(include="number"),
    target="pKa",
    normalize=True,  # normalize the features
    train_size=0.8,  # fraction of rows used for training
    session_id=42
)
