In [87]:
import os
import warnings
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

warnings.filterwarnings("ignore")

PROCESSED_DATA_PATH = '../data/processed/'

### Настройка и Загрузка

In [88]:
# Загрузка обработанных данных
try:
    train_df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'train_processed.csv'))
    test_df = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'test_processed.csv'))
    print(f"Размер тренировочных данных: {train_df.shape}")
    print(f"Размер тестовых данных: {test_df.shape}")
except FileNotFoundError as e:
    print("Ошибка: Убедитесь, что файлы train_processed.csv и test_processed.csv находятся в папке ../data/processed/")
    print("Пожалуйста, запустите сначала ноутбук 1_data_preparation.ipynb.")
    raise

Размер тренировочных данных: (2064, 221)
Размер тестовых данных: (888, 221)


In [89]:

# добавим общие импорты, чтобы не дублировать их в ячейках и снизить предупреждения
import numpy as np


### RDkit Features

In [90]:
from typing import List

def rdkit_descriptors(smiles: List[str]) -> pd.DataFrame:
    if isinstance(smiles, pd.Series):
        idx = smiles.index
        smiles_list = smiles.tolist()
    else:
        idx = None
        smiles_list = list(smiles)

    calc = MoleculeDescriptors.MolecularDescriptorCalculator(
        [x[0] for x in Descriptors._descList]
    )
    desc_names = calc.GetDescriptorNames()

    Mol_descriptors = []
    for smi in smiles_list:
        mol = None
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            mol = None

        if mol is None:
            # Сохраняем NaN строку чтобы размерность совпадала с входом
            Mol_descriptors.append([np.nan] * len(desc_names))
            continue

        try:
            mol_h = Chem.AddHs(mol)
            descriptors_vals = calc.CalcDescriptors(mol_h)
            Mol_descriptors.append(descriptors_vals)
        except Exception:
            Mol_descriptors.append([np.nan] * len(desc_names))

    df = pd.DataFrame(Mol_descriptors, columns=desc_names)
    df = df.apply(pd.to_numeric, errors='coerce')
    if idx is not None:
        df.index = idx
    print("Initial shape:", df.shape)

    return df

In [91]:
train_df_rdkit = rdkit_descriptors(train_df.SMILES)
train_df_rdkit

Initial shape: (2064, 217)


Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,,,,,,,,,,,...,,,,,,,,,,
2060,,,,,,,,,,,...,,,,,,,,,,
2061,,,,,,,,,,,...,,,,,,,,,,
2062,,,,,,,,,,,...,,,,,,,,,,


In [92]:
test_df_rdkit = rdkit_descriptors(test_df.SMILES)
test_df_rdkit

Initial shape: (888, 217)


Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,,,,,,,,,,,...,,,,,,,,,,
884,,,,,,,,,,,...,,,,,,,,,,
885,,,,,,,,,,,...,,,,,,,,,,
886,,,,,,,,,,,...,,,,,,,,,,


### Mordred Features

In [93]:
from typing import List

import pandas as pd
import warnings
import importlib

mordred_mod = None
try:
    mordred_mod = importlib.import_module("mordred")
    Calculator = getattr(mordred_mod, "Calculator")
    descriptors = getattr(mordred_mod, "descriptors")
    _HAS_MORDRED = True
except Exception:
    _HAS_MORDRED = False
    warnings.warn("mordred is not installed; mordred_descriptors will use an RDKit-based fallback.")

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors


def mordred_descriptors(smiles: List[str]) -> pd.DataFrame:

    if isinstance(smiles, pd.Series):
        idx = smiles.index
        smiles_list = smiles.tolist()
    else:
        idx = None
        smiles_list = list(smiles)

    if not _HAS_MORDRED:
        calc = MoleculeDescriptors.MolecularDescriptorCalculator(
            [x[0] for x in Descriptors._descList]
        )
        desc_names = calc.GetDescriptorNames()

        desc_rows = []
        for smi in smiles_list:
            try:
                mol = Chem.MolFromSmiles(smi)
            except Exception:
                mol = None

            if mol is None:
                desc_rows.append([np.nan] * len(desc_names))
                continue

            try:
                vals = calc.CalcDescriptors(mol)
                desc_rows.append(vals)
            except Exception:
                desc_rows.append([np.nan] * len(desc_names))

        df = pd.DataFrame(desc_rows, columns=desc_names)
        df = df.apply(pd.to_numeric, errors="coerce")
        if idx is not None:
            df.index = idx
        print("Fallback (RDKit) mordred-like descriptors shape:", df.shape)
        return df

    calc = Calculator(descriptors, ignore_3D=True)

    desc_names = None
    desc_rows = []

    for smi in smiles_list:
        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            mol = None

        if mol is None:

            desc_rows.append(None)
            continue

        try:
            mol_h = Chem.AddHs(mol)

            single_df = calc.pandas([mol_h])
            single_df = single_df.apply(pd.to_numeric, errors="coerce")
            if desc_names is None:
                desc_names = single_df.columns.tolist()
            desc_rows.append(single_df.iloc[0].tolist())
        except Exception:
            desc_rows.append(None)

    if desc_names is None:

        calc_rd = MoleculeDescriptors.MolecularDescriptorCalculator(
            [x[0] for x in Descriptors._descList]
        )
        desc_names = calc_rd.GetDescriptorNames()

    final_rows = []
    for r in desc_rows:
        if r is None:
            final_rows.append([np.nan] * len(desc_names))
        else:
            if len(r) != len(desc_names):
                row = list(r)[: len(desc_names)] + [np.nan] * max(0, len(desc_names) - len(r))
                final_rows.append(row)
            else:
                final_rows.append(r)

    df = pd.DataFrame(final_rows, columns=desc_names)
    df = df.apply(pd.to_numeric, errors="coerce")
    if idx is not None:
        df.index = idx
    print("Initial shape:", df.shape)

    return df

In [94]:
train_df_mordred = mordred_descriptors(train_df.SMILES)
train_df_mordred

Fallback (RDKit) mordred-like descriptors shape: (2064, 217)


Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,,,,,,,,,,,...,,,,,,,,,,
2060,,,,,,,,,,,...,,,,,,,,,,
2061,,,,,,,,,,,...,,,,,,,,,,
2062,,,,,,,,,,,...,,,,,,,,,,


In [95]:
test_df_mordred = mordred_descriptors(test_df.SMILES)
test_df_mordred

Fallback (RDKit) mordred-like descriptors shape: (888, 217)


Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,,,,,,,,,,,...,,,,,,,,,,
884,,,,,,,,,,,...,,,,,,,,,,
885,,,,,,,,,,,...,,,,,,,,,,
886,,,,,,,,,,,...,,,,,,,,,,


### MorganFp

In [96]:
from typing import List

from rdkit.Chem import AllChem
from rdkit import DataStructs


def morgan_fpts(
    smiles: List[str], rad: int = 2, bits: int = 1024
) -> pd.DataFrame:
    morgan_fpts = []
    if isinstance(smiles, pd.Series):
        idx = smiles.index
        smiles_list = smiles.tolist()
    else:
        idx = None
        smiles_list = list(smiles)

    for smi in smiles_list:
        if pd.isna(smi):
            morgan_fpts.append(np.zeros(bits, dtype=int))
            continue

        try:
            mol = Chem.MolFromSmiles(smi)
        except Exception:
            mol = None

        if mol is None:
            morgan_fpts.append(np.zeros(bits, dtype=int))
            continue

        try:
            fpts = AllChem.GetMorganFingerprintAsBitVect(mol, radius=rad, nBits=bits)
            arr = np.zeros((bits,), dtype=int)
            DataStructs.ConvertToNumpyArray(fpts, arr)
            morgan_fpts.append(arr.copy())
        except Exception:
            morgan_fpts.append(np.zeros(bits, dtype=int))

    df = pd.DataFrame(morgan_fpts, columns=[f"MorgFPT_{i}" for i in range(bits)])
    if idx is not None:
        df.index = idx
    return df


In [97]:
Morgan_fingerprints_train = morgan_fpts(train_df.SMILES, rad=2, bits=1024)
Morgan_fingerprints_train

Unnamed: 0,MorgFPT_0,MorgFPT_1,MorgFPT_2,MorgFPT_3,MorgFPT_4,MorgFPT_5,MorgFPT_6,MorgFPT_7,MorgFPT_8,MorgFPT_9,...,MorgFPT_1014,MorgFPT_1015,MorgFPT_1016,MorgFPT_1017,MorgFPT_1018,MorgFPT_1019,MorgFPT_1020,MorgFPT_1021,MorgFPT_1022,MorgFPT_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2060,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
Morgan_fingerprints_test = morgan_fpts(test_df.SMILES, rad=2, bits=1024)
Morgan_fingerprints_test

Unnamed: 0,MorgFPT_0,MorgFPT_1,MorgFPT_2,MorgFPT_3,MorgFPT_4,MorgFPT_5,MorgFPT_6,MorgFPT_7,MorgFPT_8,MorgFPT_9,...,MorgFPT_1014,MorgFPT_1015,MorgFPT_1016,MorgFPT_1017,MorgFPT_1018,MorgFPT_1019,MorgFPT_1020,MorgFPT_1021,MorgFPT_1022,MorgFPT_1023
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
885,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### AvalonFp

In [99]:
import numpy as np
import pandas as pd
from typing import List, Union
from rdkit import Chem
from rdkit.Avalon import pyAvalonTools
from tqdm.auto import tqdm


def generate_AVfpts(smiles_list: Union[List[str], pd.Series], bits: int = 4096) -> pd.DataFrame:
    """
    Вычисляет Avalon fingerprints для списка SMILES-строк
    и удаляет пустые/постоянные признаки.

    Args:
        smiles_list (List[str] или pd.Series): список SMILES
        bits (int): размерность фингерпринтов

    Returns:
        pd.DataFrame: таблица бинарных признаков (без константных столбцов)
    """
    if isinstance(smiles_list, pd.Series):
        smiles_list = smiles_list.tolist()

    avalon_fpts = []
    failed_idx = []

    for i, smi in enumerate(tqdm(smiles_list, desc="AvalonFP")):
        # обработка NaN
        if pd.isna(smi):
            avalon_fpts.append(np.zeros(bits, dtype=int))
            failed_idx.append(i)
            continue

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            avalon_fpts.append(np.zeros(bits, dtype=int))
            failed_idx.append(i)
            continue

        fpt = pyAvalonTools.GetAvalonFP(mol, nBits=bits)
        avalon_fpts.append(np.array(fpt))

    df = pd.DataFrame(
        avalon_fpts, 
        columns=[f"Col_A_{i+1}" for i in range(bits)]
    )

    if failed_idx:
        print(f"[!] Не удалось распарсить {len(failed_idx)} SMILES. "
              f"Индексы (пример): {failed_idx[:10]}")

    return df


In [100]:
Avalon_fingerprints_train = generate_AVfpts(train_df["SMILES"], bits=4096)
Avalon_fingerprints_train

AvalonFP:   0%|          | 0/2064 [00:00<?, ?it/s]

[!] Не удалось распарсить 2064 SMILES. Индексы (пример): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Unnamed: 0,Col_A_1,Col_A_2,Col_A_3,Col_A_4,Col_A_5,Col_A_6,Col_A_7,Col_A_8,Col_A_9,Col_A_10,...,Col_A_4087,Col_A_4088,Col_A_4089,Col_A_4090,Col_A_4091,Col_A_4092,Col_A_4093,Col_A_4094,Col_A_4095,Col_A_4096
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2060,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2061,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2062,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [101]:
Avalon_fingerprints_test = generate_AVfpts(test_df["SMILES"], bits=4096)
Avalon_fingerprints_test

AvalonFP:   0%|          | 0/888 [00:00<?, ?it/s]

[!] Не удалось распарсить 888 SMILES. Индексы (пример): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


Unnamed: 0,Col_A_1,Col_A_2,Col_A_3,Col_A_4,Col_A_5,Col_A_6,Col_A_7,Col_A_8,Col_A_9,Col_A_10,...,Col_A_4087,Col_A_4088,Col_A_4089,Col_A_4090,Col_A_4091,Col_A_4092,Col_A_4093,Col_A_4094,Col_A_4095,Col_A_4096
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
885,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
886,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [102]:
full_train_df = pd.concat([train_df, train_df_rdkit, train_df_mordred, Morgan_fingerprints_train, Avalon_fingerprints_train], axis=1)
full_test_df = pd.concat([test_df, test_df_rdkit, test_df_mordred, Morgan_fingerprints_test, Avalon_fingerprints_test], axis=1)

In [103]:
full_train_df

Unnamed: 0,PubChem_ID,taste_cluster,SMILES,SMILES_standardized,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,...,Col_A_4087,Col_A_4088,Col_A_4089,Col_A_4090,Col_A_4091,Col_A_4092,Col_A_4093,Col_A_4094,Col_A_4095,Col_A_4096
0,95609,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,104224,0.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,5284499,6.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,10886,4.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,12978217,3.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,229385,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2060,62387,2.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2061,8878,7.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2062,61262,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [104]:
full_test_df

Unnamed: 0,PubChem_ID,taste_cluster,SMILES,SMILES_standardized,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,...,Col_A_4087,Col_A_4088,Col_A_4089,Col_A_4090,Col_A_4091,Col_A_4092,Col_A_4093,Col_A_4094,Col_A_4095,Col_A_4096
0,32594,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,7130,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,7455,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,31226,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,17000,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883,26334,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
884,104421,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
885,21263168,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
886,5368236,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


### Приведем иксы

In [105]:
full_train_df = full_train_df.drop(columns=[c for c in ['PubChem_ID', 'taste_cluster', 'SMILES_st'] if c in full_train_df.columns])
full_test_df = full_test_df.drop(columns=[c for c in ['PubChem_ID', 'SMILES_st'] if c in full_test_df.columns])

In [106]:
full_train_df = full_train_df.select_dtypes(include=[np.number])
full_test_df = full_test_df.select_dtypes(include=[np.number])

In [107]:
def process_nan_train(
    df_train: pd.DataFrame,
    threshold: float,
    fill_method: str,
    fill_value: float | int = 0,
):
    """
    Обработка NaN на train:
    - выбираем колонки, где доля NaN < threshold
    - считаем значения для заполнения NaN (по train)
    Возвращаем:
        df_train_clean, cols_to_keep, fill_values
    """
    # доля NaN только по train
    nan_ratio = df_train.isna().mean()
    cols_to_keep = nan_ratio[nan_ratio <= threshold].index.tolist()

    df_train_reduced = df_train[cols_to_keep].copy()

    if fill_method == "zero":
        fill_values = 0

    elif fill_method == "mean":
        fill_values = df_train_reduced.mean(numeric_only=True)

    elif fill_method == "median":
        fill_values = df_train_reduced.median(numeric_only=True)

    elif fill_method == "constant":
        fill_values = fill_value

    else:
        raise ValueError(
            f"Неизвестный метод fill_method='{fill_method}'. "
            "Используйте: 'zero', 'mean', 'median', 'constant'."
        )

    df_train_clean = df_train_reduced.fillna(fill_values)

    return df_train_clean, cols_to_keep, fill_values


def process_nan_apply(
    df: pd.DataFrame,
    cols_to_keep,
    fill_values,
):
    """
    Применяет параметры, посчитанные на train:
    - выбирает те же cols_to_keep
    - заполняет NaN теми же fill_values
    """
    df_reduced = df[cols_to_keep].copy()
    df_clean = df_reduced.fillna(fill_values)
    return df_clean


In [108]:
# --- NaN обработка для threshold=0.5 ---
full_train_df_processed_nan_50, cols_50, fill_vals_50 = process_nan_train(
    full_train_df,
    threshold=0.5,
    fill_method="zero"
)

full_test_df_processed_nan_50 = process_nan_apply(
    full_test_df,
    cols_to_keep=cols_50,
    fill_values=fill_vals_50,
)

full_train_df_processed_nan_50.shape, full_test_df_processed_nan_50.shape


((2064, 5771), (888, 5771))

In [109]:
full_train_df_processed_nan_80, cols_80, fill_vals_80 = process_nan_train(
    full_train_df,
    threshold=0.2,
    fill_method="zero",
)
full_test_df_processed_nan_80 = process_nan_apply(
    full_test_df,
    cols_to_keep=cols_80,
    fill_values=fill_vals_80,
)

full_train_df_processed_nan_80.shape, full_test_df_processed_nan_80.shape

((2064, 5771), (888, 5771))

In [110]:
full_train_df_processed_nan_100, cols_100, fill_vals_100 = process_nan_train(
    full_train_df,
    threshold=0,
    fill_method="zero",
)
full_test_df_processed_nan_100 = process_nan_apply(
    full_test_df,
    cols_to_keep=cols_100,
    fill_values=fill_vals_100,
)

full_train_df_processed_nan_100.shape, full_test_df_processed_nan_100.shape

((2064, 5771), (888, 5771))

### Обработка константных столбцов

In [111]:
const_mask_100 = full_train_df_processed_nan_100.nunique(dropna=False) > 1

full_train_df_processed_100 = full_train_df_processed_nan_100.loc[:, const_mask_100]
full_test_df_processed_100  = full_test_df_processed_nan_100.loc[:, const_mask_100]

full_train_df_processed_100.shape, full_test_df_processed_100.shape

((2064, 0), (888, 0))

### train test

In [112]:
from sklearn.model_selection import train_test_split

# Use explicit target Series taken from train_df to avoid unresolved-name issues
if 'taste_cluster' not in train_df.columns:
    raise KeyError("Колонка 'taste_cluster' отсутствует в train_df")

y_all = train_df.loc[full_train_df_processed_100.index, 'taste_cluster']

X_train, X_test, y_train, y_test = train_test_split(
    full_train_df_processed_100,
    y_all,
    test_size=0.2,
    random_state=42,
    stratify=y_all
)

### Scaler + обработка признаков с высокой корреляцией

In [113]:
'''# 1) Обучаем скейлер и PCA на train
X_train_pca, scaler, pca = apply_scaler_pca(
    X_train,
    scaler_name="standard",
    use_pca=True,
    scaler=None,
    pca=None
)

# 2) Применяем тот же scaler и pca к test
X_test_pca, _, _ = apply_scaler_pca(
    X_test,
    scaler_name="standard",
    use_pca=True,
    scaler=scaler,
    pca=pca
)

# 3) Применяем тот же scaler и pca к full_test_df_processed_
orig_test_pca, _, _ = apply_scaler_pca(
    full_test_df_processed_100,
    scaler_name="standard",
    use_pca=True,
    scaler=scaler,
    pca=pca
)'''

'# 1) Обучаем скейлер и PCA на train\nX_train_pca, scaler, pca = apply_scaler_pca(\n    X_train,\n    scaler_name="standard",\n    use_pca=True,\n    scaler=None,\n    pca=None\n)\n\n# 2) Применяем тот же scaler и pca к test\nX_test_pca, _, _ = apply_scaler_pca(\n    X_test,\n    scaler_name="standard",\n    use_pca=True,\n    scaler=scaler,\n    pca=pca\n)\n\n# 3) Применяем тот же scaler и pca к full_test_df_processed_\norig_test_pca, _, _ = apply_scaler_pca(\n    full_test_df_processed_100,\n    scaler_name="standard",\n    use_pca=True,\n    scaler=scaler,\n    pca=pca\n)'

In [114]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    MaxAbsScaler,
    Normalizer
)
from sklearn.decomposition import PCA


def apply_scaler_pca(
    df: pd.DataFrame,
    scaler_name: str,
    use_pca: bool = True,
    scaler=None,
    pca=None,
    explained_variance: float = 0.9,
    corr_threshold: float | None = None,
):
    """
    Применяет выбранный скейлер и (опционально) PCA к числовым данным DataFrame

    Особенности
    - Можно задать долю объяснённой дисперсии для PCA (explained_variance, от 0 до 1)
    - Можно задать порог корреляции corr_threshold
      PCA применяется только к тем числовым признакам, у которых есть |corr| >= corr_threshold
      хотя бы с одним другим признаком
      Остальные числовые признаки просто масштабируются (если выбран скейлер) и остаются как есть

    Параметры
        df (pd.DataFrame): входной датафрейм
        scaler_name (str): название скейлера
            standard  - StandardScaler
            minmax    - MinMaxScaler
            robust    - RobustScaler
            maxabs    - MaxAbsScaler
            normalize - Normalizer
            none      - не применять скейлер
        use_pca (bool): применять ли PCA
        scaler: уже обученный скейлер (для test)
                Если None — обучается на df
        pca: уже обученный PCA (для test)
             Если None и use_pca=True — обучается на df
        explained_variance (float): доля объяснённой дисперсии для PCA (0 < v <= 1)
        corr_threshold (float|None): порог по |corr| (0–1)
            Если None — PCA применяется ко всем числовым признакам
            Если задан — PCA применяется только к признакам, у которых есть
            хотя бы одна корреляция по ��одулю >= corr_threshold с другим числовым признаком

    Возвращает
        df_out (pd.DataFrame): преобразованный датафрейм
            - нечисловые столбцы сохраняются как есть
            - числовые, не попавшие в PCA, остаются (в масштабированном виде, если был скейлер)
            - вместо числовых, попавших в PCA, появляются PC1, PC2, ...
        scaler: объект обученного скейлера (или None)
        pca: объект обученного PCA (или None)
             В него дополнительно дописывается атрибут pca_input_features_
             список имен признаков, поданных на вход PCA
    """

    # Копируем, чтобы не трогать исходный df
    df_out = df.copy()

    # Явно берём только числовые столбцы
    X_num_df = df_out.select_dtypes(include=["number"])
    numeric_cols = X_num_df.columns.tolist()

    # Если числовых признаков нет — просто ��озвращаем как есть
    if len(numeric_cols) == 0:
        return df_out, None, None

    # Массив числовых данных
    X_num = X_num_df.to_numpy()

    # ---------- ШАГ 1. Масштабирование ----------
    if scaler_name == "none":
        X_scaled = X_num
        scaler_out = None
    else:
        if scaler is None:
            # TRAIN — инициализация и обучение скейлера
            if scaler_name == "standard":
                scaler = StandardScaler()
            elif scaler_name == "minmax":
                scaler = MinMaxScaler()
            elif scaler_name == "robust":
                scaler = RobustScaler()
            elif scaler_name == "maxabs":
                scaler = MaxAbsScaler()
            elif scaler_name == "normalize":
                scaler = Normalizer()
            else:
                raise ValueError(
                    f"Неизвестный scaler_name='{scaler_name}'. "
                    f"Доступные: standard, minmax, robust, maxabs, normalize, none"
                )
            X_scaled = scaler.fit_transform(X_num)
        else:
            # TEST — используем ��же обученный скейлер
            X_scaled = scaler.transform(X_num)

        scaler_out = scaler

    # Если PCA не нужен — просто возвращаем масштабированные числовые фичи
    if not use_pca:
        df_out[numeric_cols] = X_scaled
        return df_out, scaler_out, None

    # DataFrame после scale (важно: те же numeric_cols, что и в X_num_df)
    X_scaled_df = pd.DataFrame(X_scaled, columns=numeric_cols, index=df_out.index)

    # ---------- ШАГ 2. PCA (TRAIN / TEST) ----------
    if pca is None:
        # TRAIN режим — выбираем признаки для PCA по corr_threshold

        if corr_threshold is None:
            # Все числовые признаки идут в PCA
            pca_input_cols = numeric_cols
        else:
            # Корреляционная матрица по масштабированным признакам
            corr = X_scaled_df.corr().abs()

            # Обнуляем диагональ (корреляция признака с самим собой)
            np.fill_diagonal(corr.values, 0.0)

            # Признак считается "высоко коррелированным", если есть хотя бы один
            # другой признак с |corr| >= corr_threshold
            mask_high_corr = (corr >= corr_threshold).any(axis=1)
            pca_input_cols = corr.index[mask_high_corr].tolist()

        # Если ни один признак не попал под критерий — просто вернём масштабированные признаки
        if len(pca_input_cols) == 0:
            df_out[numeric_cols] = X_scaled
            return df_out, scaler_out, None

        # Обучаем PCA на выбранных признаках
        pca = PCA(n_components=explained_variance)
        X_pca = pca.fit_transform(X_scaled_df[pca_input_cols])

        # Сохраняем список входных признаков внутрь PCA,
        # чтобы на тесте использовать точно тот же набор
        pca.pca_input_features_ = list(pca_input_cols)

    else:
        # TEST режим — используем уже обученный PCA
        if not hasattr(pca, "pca_input_features_"):
            raise AttributeError(
                "Переданный PCA не содержит атрибута 'pca_input_features_'. "
                "Убедитесь, что он был обучен через эту же функцию"
            )

        pca_input_cols = list(pca.pca_input_features_)

        # Проверяем, что все нужные признаки есть в текущих данных
        missing = set(pca_input_cols) - set(numeric_cols)
        if missing:
            raise ValueError(
                f"В данных нет признаков, использовавшихся для PCA при обучении: {missing}"
            )

        # Применяем PCA к тем же самым колонкам
        X_pca = pca.transform(X_scaled_df[pca_input_cols])

    pca_out = pca

    # ---------- ШАГ 3. Собираем итоговый DataFrame ----------
    n_components = X_pca.shape[1]
    pca_cols = [f"PC{i+1}" for i in range(n_components)]
    df_pca = pd.DataFrame(X_pca, columns=pca_cols, index=df_out.index)

    # Числовые признаки, которые не пошли в PCA
    numeric_rest = [c for c in numeric_cols if c not in pca_input_cols]

    # Нечисловые столбцы
    df_non_num = df_out.drop(columns=numeric_cols)

    # Итоговый датафрейм:
    #   нечисловые + числовые без PCA (масштабированные) + PCA-компоненты
    df_out = pd.concat(
        [df_non_num, X_scaled_df[numeric_rest], df_pca],
        axis=1
    )

    return df_out, scaler_out, pca_out


In [115]:
X_train

1577
1893
895
1479
1857
...
256
1799
831
387
1944


In [116]:
X_test

429
917
1842
204
1649
...
343
1400
1676
1580
13


In [117]:
# TRAIN
X_train_pca, scaler, pca = apply_scaler_pca(
    X_train,
    scaler_name="standard",
    use_pca=True,
    explained_variance=0.9,      # сколько дисперсии объяснять
    corr_threshold=0.75,         # какие признаки считать "высоко коррелированными"
)

# TEST — те же scaler и pca
X_test_pca, _, _ = apply_scaler_pca(
    X_test,
    scaler_name="standard",
    use_pca=True,
    scaler=scaler,
    pca=pca,
)
# FULL TEST — те же scaler и pca
orig_test_pca, _, _ = apply_scaler_pca(
    full_test_df_processed_100,
    scaler_name="standard",
    use_pca=True,
    scaler=scaler,
    pca=pca,
)

In [118]:
X_train_pca

1577
1893
895
1479
1857
...
256
1799
831
387
1944


In [119]:
X_test_pca

429
917
1842
204
1649
...
343
1400
1676
1580
13


In [120]:
orig_test_pca

0
1
2
3
4
...
883
884
885
886
887


In [123]:
import joblib
import json
import os

os.makedirs("artifacts", exist_ok=True)

if 'scaler' in globals() and scaler is not None:
    joblib.dump(scaler, "artifacts/scaler_fp.joblib")

if 'pca' in globals() and pca is not None:
    joblib.dump(pca, "artifacts/pca_fp.joblib")

if 'X_train' in globals():
    numeric_cols = X_train.select_dtypes(include=["int64", "float64"]).columns.tolist()
else:
    numeric_cols = []
with open("artifacts/numeric_cols_fp.json", "w", encoding="utf-8") as f:
    json.dump(numeric_cols, f, indent=2, ensure_ascii=False)

n_comp = None
if 'pca' in globals() and pca is not None:
    n_comp = getattr(pca, 'n_components_', None)
if n_comp is None:
    pca_cols = []
else:
    pca_cols = [f"PC{i+1}" for i in range(n_comp)]
with open("artifacts/pca_cols_fp.json", "w", encoding="utf-8") as f:
    json.dump(pca_cols, f, indent=2, ensure_ascii=False)

### Сохраняем все для дальнейшего применения

In [124]:
X_train_pca.to_csv(os.path.join(PROCESSED_DATA_PATH, 'X_train_fp.csv'), index=False)
X_test_pca.to_csv(os.path.join(PROCESSED_DATA_PATH, 'X_test_fp.csv'), index=False)
y_train.to_csv(os.path.join(PROCESSED_DATA_PATH, 'y_train_fp.csv'), index=False)
y_test.to_csv(os.path.join(PROCESSED_DATA_PATH, 'y_test_fp.csv'), index=False)
orig_test_pca.to_csv(os.path.join(PROCESSED_DATA_PATH, 'orig_test_fp.csv'), index=False)