# Working with data

## Data load

In [1]:
def tr_datetime_preprocess(tr_datetime):
    """Convert a ``"<day> HH:MM:SS"`` timestamp string into a number of seconds.

    The input is split on whitespace into a day counter and a clock part,
    and the clock part is split on ``":"`` into hours, minutes and seconds.
    Every component is parsed with ``int`` and no range check is applied, so
    out-of-range clock values (for example a seconds field of ``60``) simply
    carry over into the total.

    Pure integer arithmetic is used so the function does not depend on any
    import made in another notebook cell.

    Args:
        tr_datetime: String of the form ``"<day> HH:MM:SS"``.

    Returns:
        int: ``day * 86400 + HH * 3600 + MM * 60 + SS``.

    Raises:
        ValueError: If the string does not have exactly two whitespace-separated
            parts, the clock part does not have exactly three ``":"``-separated
            fields, or a field is not an integer literal.
    """
    days, hms = tr_datetime.split()
    hh, mm, ss = hms.split(":")

    # Horner-style accumulation: days -> hours -> minutes -> seconds.
    return ((int(days) * 24 + int(hh)) * 60 + int(mm)) * 60 + int(ss)

In [2]:
import pandas as pd
import numpy as np
from datetime import timedelta
from ptls.preprocessing import PandasDataPreprocessor
import torch
from sklearn.model_selection import train_test_split

# --- Load the raw tables -------------------------------------------------------
transactions = pd.read_csv("https://huggingface.co/datasets/dllllb/transactions-gender/resolve/main/transactions.csv.gz?download=true", compression="gzip")
targets = pd.read_csv("https://huggingface.co/datasets/dllllb/transactions-gender/resolve/main/gender_train.csv?download=true")
# Rows with any missing field are discarded before the table is split into sources.
transactions = transactions.dropna().reset_index(drop=True)

# Customer counts (identifier spelling kept as-is in case other code refers to these names).
n_cutomers = len(pd.unique(transactions["customer_id"]))
n_labeling_cutomers = len(pd.unique(targets["customer_id"]))

# --- Build two partially overlapping "sources" from the same transactions ------
# Each source keeps a different subset of columns and loses a different random
# subset of rows.
sourceA = transactions[["customer_id", "tr_datetime", "mcc_code", "term_id"]]
sourceB = transactions[["customer_id", "tr_datetime", "tr_type", "amount"]]

# A seeded generator makes the dropped rows identical on every Restart & Run All.
rng = np.random.default_rng(42)
sourceA_drop_indices = rng.choice(sourceA.index.to_numpy(), 130000, replace=False)
sourceB_drop_indices = rng.choice(sourceB.index.to_numpy(), 420000, replace=False)

sourceA = sourceA.drop(sourceA_drop_indices).reset_index(drop=True)
sourceB = sourceB.drop(sourceB_drop_indices).reset_index(drop=True)

# Number of distinct values per column; handed to run_grid_search further down and
# used as embedding-table sizes ("in") by the legacy training cell.
# NOTE(review): if the preprocessor reserves an extra id (padding / unknown), the
# tables may need n_unique + 1 rows -- TODO confirm against the ptls documentation.
mcc_code_in = len(np.unique((sourceA["mcc_code"])))
term_id_in = len(np.unique((sourceA["term_id"])))
tr_type_in = len(np.unique((sourceB["tr_type"])))

print("mcc_code_in:", mcc_code_in)
print("term_id_in:", term_id_in)
print("tr_type_in", tr_type_in)

# "<day> HH:MM:SS" strings -> integer seconds.
sourceA["tr_datetime"] = sourceA["tr_datetime"].apply(tr_datetime_preprocess)
sourceB["tr_datetime"] = sourceB["tr_datetime"].apply(tr_datetime_preprocess)

# --- Per-source preprocessing ---------------------------------------------------
sourceA_preprocessor = PandasDataPreprocessor(
    col_id="customer_id",
    col_event_time="tr_datetime",
    event_time_transformation="none",
    cols_category=["mcc_code", "term_id"],
    return_records=False,
)

# NOTE(review): tr_type is declared numerical here, yet the legacy training cell
# below configures an embedding table for it -- confirm which treatment is intended.
sourceB_preprocessor = PandasDataPreprocessor(
    col_id="customer_id",
    col_event_time="tr_datetime",
    event_time_transformation="none",
    cols_numerical=["tr_type", "amount"],
    return_records=False,
)

processed_sourceA = sourceA_preprocessor.fit_transform(sourceA)
processed_sourceB = sourceB_preprocessor.fit_transform(sourceB)

# Prefix every column except the join key with its source name so that the two
# frames can be merged without column clashes.
processed_sourceA.columns = [
    "sourceA_" + str(col) if str(col) != "customer_id" else str(col)
    for col in processed_sourceA.columns
]

processed_sourceB.columns = [
    "sourceB_" + str(col) if str(col) != "customer_id" else str(col)
    for col in processed_sourceB.columns
]

# The outer join keeps customers that appear in only one source; their cells for
# the other source come back as NaN and are replaced by empty tensors.
joined_data = processed_sourceA.merge(processed_sourceB, how="outer", on="customer_id")


def _empty_if_missing(value):
    """Return an empty tensor for a missing cell, otherwise the cell unchanged."""
    return torch.tensor([]) if pd.isna(value) else value


# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1);
# fall back to applymap on older pandas versions.
if hasattr(pd.DataFrame, "map"):
    joined_data = joined_data.map(_empty_if_missing)
else:
    joined_data = joined_data.applymap(_empty_if_missing)

# --- Train / validation / test split (rows of joined_data) -----------------------
train_df, test_df = train_test_split(joined_data,
                                     test_size=0.1,
                                     random_state=42)
train_df, valid_df = train_test_split(train_df,
                                      test_size=0.1,
                                      random_state=42)

# Sanity output: train/test index overlap, unique customers per split, and
# uniqueness of the test index.
print(train_df.index.intersection(test_df.index))
print(train_df['customer_id'].unique().shape,
      test_df['customer_id'].unique().shape)
print(np.unique(test_df.index.values).shape, test_df.shape)
print(test_df.index)

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# One dict per row, keyed by column name.
train_dict = train_df.to_dict("records")
valid_dict = valid_df.to_dict("records")
test_dict = test_df.to_dict("records")

# Feature names per source, written without the "sourceA_" / "sourceB_" prefixes.
source_features = {
    "sourceA": ["event_time", "mcc_code", "term_id"],
    "sourceB": ["event_time", "tr_type", "amount"]
}

mcc_code_in: 184
term_id_in: 435186
tr_type_in 73
Index([], dtype='int64')
(12126,) (1498,)
(1498,) (1498, 9)
Index([11668,  9806,  9868,  5598,  7091,  6558,  9940,  3107, 11498,  1056,
       ...
       11258, 11788,  3689, 11292,  9088,  9578,   388,  6800, 13811, 10999],
      dtype='int64', length=1498)


  joined_data = joined_data.applymap(lambda x: torch.tensor([]) if pd.isna(x) else x)


In [3]:
import sys

# Put the graph-embedding metrics directory on the import path -- presumably so
# that later cells / modules can import from it (TODO confirm). The membership
# check keeps re-runs of this cell from appending duplicate entries.
_metrics_dir = "google-research/graph_embedding/metrics"
if _metrics_dir not in sys.path:
    sys.path.append(_metrics_dir)

In [5]:
import os

# Directory handed to the training code as the checkpoint location; it is
# created up front (together with any missing parents) and reused if present.
checkpoints_path = "gender/checkpoints1"
os.makedirs(name=checkpoints_path, exist_ok=True)

In [6]:
import logging

# 'logs/gender1' is created as before; nothing in this cell writes there --
# presumably it is used elsewhere, TODO confirm.
os.makedirs('logs/gender1', exist_ok=True)

# The log file itself lives in 'logs/gender', so that directory has to exist
# before FileHandler opens the file.
_log_file = "logs/gender/fraction_experiment.log"
os.makedirs(os.path.dirname(_log_file), exist_ok=True)

logger = logging.getLogger("my_logger")
logger.setLevel(logging.INFO)

# Remove handlers left over from an earlier run of this cell and close them so
# their file descriptors are released and messages are not written twice.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()

# Explicit UTF-8: the messages logged in this notebook contain non-ASCII text.
file_handler = logging.FileHandler(_log_file, encoding="utf-8")
formatter = logging.Formatter('%(asctime)s [%(levelname)s] %(message)s')
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.info("🔧 Логгер настроен вручную")

In [11]:
# Baseline hyper-parameter values (passed to create_params_grid together with
# the candidate lists below).
fixed_params = dict(
    batch_size=64,
    learning_rate=0.001,
    split_count=3,
    cnt_min=10,
    cnt_max=50,
    embedding_dim=16,  # embedding dimensionality
    category_embedding_dim=8,  # embedding dimensionality for categories
    hidden_size=128,  # default hidden-layer size
)

# Candidate values to sweep over for each hyper-parameter.
variable_params = dict(
    batch_size=[32, 64, 128],
    learning_rate=[0.001, 0.005, 0.01, 0.05],
    split_count=[3, 5, 7],
    cnt_min=[10, 15, 20],
    cnt_max=[80, 100, 150],
    embedding_dim=[8, 16, 24, 32],
    category_embedding_dim=[8, 16, 24],
    hidden_size=[64, 128, 256],
)
# Reduced grid for a quick smoke run:
# variable_params = {
#      "batch_size": [32]
# }

In [86]:
# fixed_params = {
#     "batch_size": 64,
#     "learning_rate": 0.001,
#     "split_count": 3,
#     "cnt_min": 10,
#     "cnt_max": 50,
#     "embedding_dim": 16,  
#     "category_embedding_dim": 8,  
# }

# # hidden_size values to sweep over
# hidden_sizes = [64, 256, 512, 1024, 1468, 2048]  # From small values up to very large ones

# # Build the hyperparameter grid to sweep over
# all_hyperparameter_grids = [
#     {**fixed_params, "hidden_size": h_size} for h_size in hidden_sizes
# ]

In [12]:
%load_ext autoreload
%autoreload 2
from run_exp import create_params_grid, run_grid_search
import run_metrics, run_models
all_hyperparameter_grids = create_params_grid(fixed_params, variable_params)
out_prefix = r"csv_results/gender_sample_fraction1"
sample_fractions = np.linspace(1/20, 1, 5)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Launch the sweep over every grid entry and every sample fraction, using the
# data splits, vocabulary sizes, logger and output locations prepared above.
run_grid_search(
    all_hyperparameter_grids,
    sample_fractions,
    train_dict,
    valid_dict,
    test_dict,
    targets,
    checkpoints_path,
    source_features,
    logger,
    mcc_code_in,
    term_id_in,
    tr_type_in,
    num_epochs=30,
    col_id="customer_id",
    target_col='gender',
    out_prefix=out_prefix,
)

Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/home/dpetrovitch/venv/lib/python3.12/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /home/dpetrovitch/experiments/gender/checkpoints1 exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type                                  | Params | Mode 
-------------------------------------------------------------------------------------
0 | _loss              | ContrastiveLoss                       | 0      | train
1 | _seq_encoder       | MultiModalSortTimeSeqEncoderContainer | 14.0 M | train
2 | _validation_metric | BatchRecallTopK                       | 0      | train
3 | _head              | Head                                  | 0      | train
-------------------------------------------------------------------------------------
14.0 M    Trainable

Sanity Checking: |                                                               | 0/? [00:00<?, ?it/s]

/home/dpetrovitch/venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.
/home/dpetrovitch/venv/lib/python3.12/site-packages/pytorch_lightning/utilities/data.py:123: Your `IterableDataset` has `__len__` defined. In combination with multi-process data loading (when num_workers > 1), `__len__` could be inaccurate if each worker is not configured independently to avoid having duplicate data.
/home/dpetrovitch/venv/lib/python3.12/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=31` in the `DataLoader` to improve performance.


Training: |                                                                      | 0/? [00:00<?, ?it/s]

Validation: |                                                                    | 0/? [00:00<?, ?it/s]

Metric valid/recall_top_k improved. New best score: 0.301


Validation: |                                                                    | 0/? [00:00<?, ?it/s]

Metric valid/recall_top_k improved by 0.030 >= min_delta = 0.0. New best score: 0.331


In [None]:

    # NOTE(review): this cell is indented as if it had been lifted out of a function
    # body, and it uses names that no cell shown in this notebook defines or imports
    # (time, ModelKeeper, train_data_in, valid_data_in, test_data_in, col_id,
    # eval_many_embs) -- presumably a scratch copy of logic that lives in the imported
    # run_exp / run_models modules; TODO confirm before relying on it.
    # cur_time is assigned here but never read within this cell.
    cur_time = time()
    model_keeper = ModelKeeper()    
    # Embeddings accumulated across all configurations, evaluated together at the end.
    all_embs = []    

    for param in all_hyperparameter_grids:            
        # Each grid entry is indexed as a pair: per the log message, param[0] names the
        # one hyper-parameter that varies and param[1] is the parameter dict.
        logger.info(f'All params are frozen except {param[0]}')
        params = param[1]
        
        logger.info(f"Testing parameters: {params}")
        model_keeper.create_datasets(train_data_in, valid_data_in, params, 
                            source_features, col_id=col_id)
        model_keeper.train_model(params, checkpoints_path=checkpoints_path)
       
        embs = model_keeper.calc_embs_from_trained(test_data_in, model_out_name="emb")
        # In-place extension: assumes embs is a list (or other iterable) of
        # embedding sets -- TODO confirm against ModelKeeper.
        all_embs += embs

    eval_many_embs(all_embs, targets)

In [None]:
# Legacy inline sweep: for every sample fraction and every grid entry, train a
# CoLES model, evaluate each saved checkpoint, and append one CSV row per checkpoint.
#
# NOTE(review): this cell uses names that no cell shown in this notebook defines or
# imports (time, create_datasets, TrxEncoder, MultiModalSortTimeSeqEncoderContainer,
# RnnEncoder, CoLESModule, partial, EarlyStopping, ModelCheckpoint, pl, custom_logger,
# num_epochs, glob, gc, evaluate_model, inf_test_loader, columns) -- confirm they are
# provided before running it on a fresh kernel.
# NOTE(review): sample_fraction is only passed to evaluate_model, yet every model is
# retrained for each fraction -- confirm that this is intended.
cur_time = time()

for sample_fraction in np.linspace(1/20, 1, 5):
    print(f"{sample_fraction:.2f}")
    logger.info(f"{sample_fraction=:.2f}")
    # File name carries the fraction with trailing zeros (and a trailing dot) stripped.
    output_csv = f"csv_results/gender_sample_fraction_{sample_fraction:.3f}".rstrip('0').rstrip('.') + ".csv"
    # to_csv does not create missing parent directories, so make sure it exists.
    os.makedirs(os.path.dirname(output_csv), exist_ok=True)

    for param in all_hyperparameter_grids:

        # Each grid entry is read as (name of the varied parameter, parameter dict).
        logger.info(f'All params are frozen except {param[0]}')
        params = param[1]

        logger.info(f"Testing parameters: {params}")

        train_loader = create_datasets(train_dict, valid_dict, params, source_features)

        # NOTE(review): embedding widths and the projection size are hard-coded below;
        # params["embedding_dim"] and params["category_embedding_dim"] are not read when
        # the encoders are built -- confirm whether those sweeps are meant to apply here.
        sourceA_encoder_params = dict(
            embeddings_noise=0.003,
            linear_projection_size=64,
            embeddings={
                "mcc_code": {"in": mcc_code_in, "out": 32},
                "term_id": {"in": term_id_in, "out": 32},
            },
        )

        sourceB_encoder_params = dict(
            embeddings_noise=0.003,
            linear_projection_size=64,
            embeddings={
                "tr_type": {"in": tr_type_in, "out": 32},
            },
            numeric_values={"amount": "identity"},
        )

        sourceA_encoder = TrxEncoder(**sourceA_encoder_params)
        sourceB_encoder = TrxEncoder(**sourceB_encoder_params)

        seq_encoder = MultiModalSortTimeSeqEncoderContainer(
            trx_encoders={
                "sourceA": sourceA_encoder,
                "sourceB": sourceB_encoder,
            },
            input_size=64,
            hidden_size=params["hidden_size"],  # only the current hidden_size value is used
            seq_encoder_cls=RnnEncoder,
            type="gru",
        )

        model = CoLESModule(
            seq_encoder=seq_encoder,
            optimizer_partial=partial(torch.optim.Adam, lr=params["learning_rate"]),
            lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.5),
        )

        early_stopping_callback = EarlyStopping(
            monitor="loss",
            patience=5,
            mode="min",
            verbose=True
        )

        # Shared stem of the checkpoint file names for this configuration; it is used
        # both when saving and when globbing, so the two cannot drift apart.
        ckpt_stem = (
            f"model_{params['batch_size']}_{params['learning_rate']}_{params['split_count']}"
            f"_{params['cnt_min']}_{params['cnt_max']}_{params['hidden_size']}"
        )

        checkpoint_callback = ModelCheckpoint(
            dirpath=checkpoints_path,
            filename=ckpt_stem + "{epoch:02d}",
            save_top_k=-1,
            every_n_epochs=1,
        )

        # Train the model
        pl_trainer = pl.Trainer(
            callbacks=[checkpoint_callback, early_stopping_callback, custom_logger],
            default_root_dir=checkpoints_path,
            check_val_every_n_epoch=1,
            max_epochs=num_epochs,
            accelerator="gpu",
            devices=1,
            enable_progress_bar=True,
            precision=16
        )
        model.train()
        pl_trainer.fit(model, train_loader)

        # Fall back to the full epoch budget when early stopping recorded no epoch.
        early_stop_epoch = custom_logger.early_stopping_epoch
        if early_stop_epoch is None:
            early_stop_epoch = num_epochs

        # Process the checkpoints
        # NOTE(review): the glob also matches any older files with the same stem that
        # are still in checkpoints_path -- confirm the directory is clean beforehand.
        checkpoint_files = glob.glob(f"{checkpoints_path}/{ckpt_stem}*.ckpt")
        checkpoint_files.sort()
        # Elapsed time is measured from the start of the whole cell, not of this run.
        logger.info(f"Elapsed time: {time() - cur_time:.2f} seconds")

        logger.info(f'Early stop is {early_stop_epoch}')

        for i, checkpoint in enumerate(checkpoint_files):
            logger.info(f"Processing checkpoint number {i}")
            model = CoLESModule.load_from_checkpoint(checkpoint, seq_encoder=seq_encoder)

            # Compute metrics and timings
            metrics, times, accuracy = evaluate_model(model, pl_trainer, inf_test_loader, sample_fraction=sample_fraction)
            metrics_flattened = {f"metric_{k}": round(v, 4) for k, v in metrics.items()}
            times_flattened = {f"time_{k}": round(v, 4) for k, v in times.items()}

            # Collect the results; epoch_num is the position in the sorted file list.
            new_result = {
                **params,
                "checkpoint": checkpoint,
                "epoch_num": int(i),
                "accuracy": accuracy,
                **metrics_flattened,
                **times_flattened,
                "early_stop_epoch": int(early_stop_epoch),
                "sample_fraction": sample_fraction
            }

            # Persist to CSV: write the header once, then append one row per checkpoint.
            results = pd.DataFrame([new_result], columns=columns)

            if not os.path.exists(output_csv):
                pd.DataFrame(columns=columns).to_csv(output_csv, mode="w", index=False, header=True)

            results.to_csv(output_csv, mode="a", header=False, index=False)

            del metrics, accuracy, new_result
            torch.cuda.empty_cache()
            gc.collect()

        # Checkpoints are only needed for the evaluation above; delete them afterwards.
        for checkpoint in checkpoint_files:
            os.remove(checkpoint)

        del model
        del train_loader
        torch.cuda.empty_cache()
        gc.collect()
    logger.info(f"results for {sample_fraction=} complete")

print("Optimization complete!")

0.05


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type                                  | Params | Mode 
-------------------------------------------------------------------------------------
0 | _loss              | ContrastiveLoss                       | 0      | train
1 | _seq_encoder       | MultiModalSortTimeSeqEncoderContainer | 14.0 M | train
2 | _validation_metric | BatchRecallTopK                       | 0      | train
3 | _head              | Head                                  | 0      | train
-------------------------------------------------------------------------------------
14.0 M    Trainable params
0         Non-trainable params
14.0 M    Total params
56.048    Total estimated model params size (MB)
27        Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                                                                       | 0/?…

Training: |                                                                                              | 0/?…

Validation: |                                                                                            | 0/?…

Metric loss improved. New best score: 17.697


Validation: |                                                                                            | 0/?…

Metric loss improved by 0.888 >= min_delta = 0.0. New best score: 16.808


Validation: |                                                                                            | 0/?…

Metric loss improved by 2.969 >= min_delta = 0.0. New best score: 13.839


Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Monitored metric loss did not improve in the last 5 records. Best score: 13.839. Signaling Trainer to stop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 31.3527 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0039 | ⏱ 0.0000 сек
🧠 alpha_req                      = 3.5649 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.2144 | ⏱ 0.0000 сек
🧠 ne_sum                         = 7.5380 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.6821 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 126.4459 | ⏱ 0.0122 сек
🧠 ripser_sum_H0                  = 124.1731 | ⏱ 0.0122 сек
🧠 ripser_sum_H1                  = 2.2728 | ⏱ 0.0122 сек


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 31.7480 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0039 | ⏱ 0.0000 сек
🧠 alpha_req                      = 3.4903 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.1966 | ⏱ 0.0000 сек
🧠 ne_sum                         = 8.1196 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.6993 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 153.3335 | ⏱ 0.0110 сек
🧠 ripser_sum_H0                  = 150.3634 | ⏱ 0.0110 сек
🧠 ripser_sum_H1                  = 2.9701 | ⏱ 0.0110 сек


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 32.9319 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0041 | ⏱ 0.0000 сек
🧠 alpha_req                      = 3.3856 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.1977 | ⏱ 0.0000 сек
🧠 ne_sum                         = 9.0111 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.6955 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 175.2336 | ⏱ 0.0105 сек
🧠 ripser_sum_H0                  = 171.5645 | ⏱ 0.0105 сек
🧠 ripser_sum_H1                  = 3.6690 | ⏱ 0.0105 сек


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 33.4202 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0045 | ⏱ 0.0000 сек
🧠 alpha_req                      = 3.2757 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.1920 | ⏱ 0.0000 сек
🧠 ne_sum                         = 9.1144 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.7014 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 183.6301 | ⏱ 0.0123 сек
🧠 ripser_sum_H0                  = 180.0147 | ⏱ 0.0123 сек
🧠 ripser_sum_H1                  = 3.6153 | ⏱ 0.0123 сек


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 33.9344 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0044 | ⏱ 0.0000 сек
🧠 alpha_req                      = 3.1619 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.1849 | ⏱ 0.0000 сек
🧠 ne_sum                         = 9.3910 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.7097 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 192.5663 | ⏱ 0.0109 сек
🧠 ripser_sum_H0                  = 188.4109 | ⏱ 0.0109 сек
🧠 ripser_sum_H1                  = 4.1554 | ⏱ 0.0109 сек


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 35.3055 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0050 | ⏱ 0.0000 сек
🧠 alpha_req                      = 3.0453 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.1962 | ⏱ 0.0000 сек
🧠 ne_sum                         = 9.2820 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.6959 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 200.5509 | ⏱ 0.0127 сек
🧠 ripser_sum_H0                  = 196.5061 | ⏱ 0.0127 сек
🧠 ripser_sum_H1                  = 4.0449 | ⏱ 0.0127 сек


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 34.5874 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0048 | ⏱ 0.0000 сек
🧠 alpha_req                      = 3.0329 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.1816 | ⏱ 0.0000 сек
🧠 ne_sum                         = 9.0161 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.7130 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 203.1643 | ⏱ 0.0114 сек
🧠 ripser_sum_H0                  = 199.1070 | ⏱ 0.0114 сек
🧠 ripser_sum_H1                  = 4.0572 | ⏱ 0.0114 сек


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]


Predicting: |                                                                                            | 0/?…


📊 Средние значения метрик и время вычисления:
🧠 rankme                         = 35.5268 | ⏱ 0.0001 сек
🧠 coherence                      = 1.0000 | ⏱ 0.0000 сек
🧠 pseudo_condition_number        = 0.0051 | ⏱ 0.0000 сек
🧠 alpha_req                      = 2.9750 | ⏱ 0.0001 сек
🧠 stable_rank                    = 1.1939 | ⏱ 0.0000 сек
🧠 ne_sum                         = 8.8178 | ⏱ 0.0008 сек
🧠 self_clustering                = 0.6983 | ⏱ 0.0001 сек
🧠 ripser                         = nan | ⏱ nan сек
🧠 ripser_sum                     = 211.2965 | ⏱ 0.0155 сек
🧠 ripser_sum_H0                  = 206.7317 | ⏱ 0.0155 сек
🧠 ripser_sum_H1                  = 4.5648 | ⏱ 0.0155 сек


Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name               | Type                                  | Params | Mode 
-------------------------------------------------------------------------------------
0 | _loss              | ContrastiveLoss                       | 0      | train
1 | _seq_encoder       | MultiModalSortTimeSeqEncoderContainer | 14.0 M | train
2 | _validation_metric | BatchRecallTopK                       | 0      | train
3 | _head              | Head                                  | 0      | train
-------------------------------------------------------------------------------------
14.0 M    Trainable params
0         Non-trainable params
14.0 M    Total params
56.048    Total estimated model params size (MB)
27        Modules in train mode
0         Modules in eval mode


Sanity Checking: |                                                                                       | 0/?…

Training: |                                                                                              | 0/?…

Validation: |                                                                                            | 0/?…

Metric loss improved. New best score: 19.845


Validation: |                                                                                            | 0/?…

Metric loss improved by 0.569 >= min_delta = 0.0. New best score: 19.276


Validation: |                                                                                            | 0/?…

Metric loss improved by 3.807 >= min_delta = 0.0. New best score: 15.469


Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Metric loss improved by 0.669 >= min_delta = 0.0. New best score: 14.800


Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Metric loss improved by 0.494 >= min_delta = 0.0. New best score: 14.306


Validation: |                                                                                            | 0/?…

Metric loss improved by 0.355 >= min_delta = 0.0. New best score: 13.950


Validation: |                                                                                            | 0/?…

Metric loss improved by 0.178 >= min_delta = 0.0. New best score: 13.773


Validation: |                                                                                            | 0/?…

Validation: |                                                                                            | 0/?…

Validation: |          | 0/? [00:00<?, ?it/s]

## Try Optuna and recall TopK

In [None]:
# import optuna
# import pandas as pd
# import torch
# import gc
# from tqdm import tqdm

# # File for saving the results
# output_csv = "optuna_hyperparameter_by_topK.csv"

# # Define the hyperparameter search ranges
# def define_search_space(trial):
#     return {
#         "batch_size": trial.suggest_categorical("batch_size", [32, 64]),
#         "learning_rate": trial.suggest_loguniform("learning_rate", 1e-4, 5e-2),
#         "hidden_size": trial.suggest_categorical("hidden_size", [64, 128]),
#         "embedding_dim": trial.suggest_categorical("embedding_dim", [8, 16, 32]),
#         "category_embedding_dim": trial.suggest_categorical("category_embedding_dim", [4, 8, 16]),
#         "split_count": trial.suggest_categorical("split_count", [2, 3, 5]),
#         "cnt_min": trial.suggest_categorical("cnt_min", [5, 10, 20]),
#         "cnt_max": trial.suggest_categorical("cnt_max", [50, 80, 100]),
#     }

# # Metrics that drive the hyperparameter selection
# metric_names = [
#     "rankme", "coherence", "pseudo_condition_number",
#     "alpha_req", "stable_rank", "ne_sum", "self_clustering", "ripser"
# ]

# # List that collects all results
# optuna_results = []

In [None]:
# Use the first CUDA device when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [None]:
# from ptls.data_load.padded_batch import PaddedBatch


# class CustomCoLESModule(CoLESModule):
#     def __init__(self, custom_metric_name, *args, **kwargs):
#         super().__init__(*args, **kwargs)
#         self.custom_metric_name = custom_metric_name
#         # model.to(device)


#     def validation_step(self, batch, batch_idx):
#         print("valedation step")
#         x, y = batch
#         y = y.to(self.device)
#         for key in x:
#             if isinstance(x[key], PaddedBatch):
#                 # 🔄 Create a new PaddedBatch with seq_lens and payload on the target device
#                 x[key] = PaddedBatch(
#                     payload={k: v.to(self.device) for k, v in x[key].payload.items()},
#                     length=x[key].length.to(self.device)  # ⚠️ Use length instead of seq_lens
#                 )
#             else:
#                 print(f"⚠️ [WARNING] Expected PaddedBatch but got {type(x[key])} for {key}")

#         print(f"Model is on device: {next(self.parameters()).device}")
#         print(f"x is on device: {[x[k].device for k in x]}")
#         print(f"y is on device: {y.device}")

#         y_hat = self(x)

#         # Compute loss (assuming classification task)
#         loss = torch.nn.functional.cross_entropy(y_hat, y)

#         metric_value, _, _ = compute_metrics(model, pl_trainer, inf_test_loader, selected_metrics=[self.custom_metric_name])

#         print(f"[DEBUG] Logging metric: valid/{self.custom_metric_name} = {metric_value[self.custom_metric_name]}")

#         self.trainer.logger.log_metrics({f"valid/{self.custom_metric_name}": metric_value[self.custom_metric_name]}, step=self.current_epoch)

#         return {"loss": loss, self.custom_metric_name: metric_value[self.custom_metric_name]}

In [None]:
# Recursively deletes the checkpoint directory at a hard-coded Kaggle path.
# NOTE(review): the rest of this notebook keeps checkpoints in `checkpoints_path`
# ("gender/checkpoints1") -- presumably a leftover from a Kaggle session; TODO confirm.
! rm -rf /kaggle/working/checkpoints

In [None]:
! rm /kaggle/working/optuna_best_trials_accuracy.csv

In [None]:
# import optuna
# import time
# import pandas as pd
# import torch
# import gc
# import os
# import glob
# from functools import partial
# from pytorch_lightning import Trainer
# from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
# from tqdm import tqdm
# from time import time

# # File for logging the results
# output_csv = "optuna_results.csv"

# # Metrics to evaluate
# metric_names = [
#     "rankme", "coherence", "pseudo_condition_number",
#     "alpha_req", "stable_rank", "ne_sum", "self_clustering", "ripser"
# ]

# optuna_columns = [
#     *fixed_params.keys(), "checkpoint", "epoch_num", "accuracy", "topk_accuracy", "early_stop_epoch", "hidden_size",
# ] + [
#     "metric_" + key for key in [
#         "rankme", "coherence", "pseudo_condition_number", 
#         "alpha_req", "stable_rank", "ne_sum", "self_clustering", "ripser"
#     ]
# ] + ["time_" + key for key in [
#     "rankme", "coherence", "pseudo_condition_number", 
#     "alpha_req", "stable_rank", "ne_sum", "self_clustering", "ripser"
# ]]


# def objective(trial):
#     # print(f'dealing with metric {metric_name}')
#     torch.cuda.empty_cache()
#     gc.collect()

#     params = define_search_space(trial)

#     # === Dataset ===
#     data_module = create_datasets(train_dict, valid_dict, params, source_features)

#     # === Encoders ===
#     sourceA_encoder = TrxEncoder(
#         embeddings_noise=0.003,
#         linear_projection_size=64,
#         embeddings={
#             "mcc_code": {"in": mcc_code_in, "out": 32},
#             "term_id": {"in": term_id_in, "out": 32},
#         },
#     )

#     sourceB_encoder = TrxEncoder(
#         embeddings_noise=0.003,
#         linear_projection_size=64,
#         embeddings={
#             "tr_type": {"in": tr_type_in, "out": 32},
#         },
#         numeric_values={"amount": "identity"},
#     )

#     seq_encoder = MultiModalSortTimeSeqEncoderContainer(
#         trx_encoders={"sourceA": sourceA_encoder, "sourceB": sourceB_encoder},
#         input_size=64,
#         hidden_size=params["hidden_size"],
#         seq_encoder_cls=RnnEncoder,
#         type="gru",
#     )

#     model = CoLESModule(
#         seq_encoder=seq_encoder,
#         optimizer_partial=partial(torch.optim.Adam, lr=params["learning_rate"]),
#         lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.5),
#     )

#     # === Callbacks ===
#     early_stopping_callback = EarlyStopping(
#         monitor=f"valid/recall_top_k", patience=5, mode="max", verbose=True
#     )

#     checkpoint_callback = ModelCheckpoint(
#         dirpath=checkpoints_path,
#         filename=f"model_optuna_trial_{trial.number}_epoch={{epoch:02d}}",
#         save_top_k=-1,
#         monitor="valid/recall_top_k",
#         mode="max",
#     )

#     trainer = pl.Trainer(
#         callbacks=[checkpoint_callback, early_stopping_callback, custom_logger],
#         default_root_dir=checkpoints_path,
#         check_val_every_n_epoch=1,
#         max_epochs= 1, # num_epochs,
#         accelerator="gpu",
#         devices=1,
#         enable_progress_bar=True,
#         precision=16,
#     )

#     trainer.fit(model, datamodule=data_module)

#     early_stop_epoch = custom_logger.early_stopping_epoch or num_epochs

#     # === Evaluate checkpoints ===
#     checkpoint_files = sorted(
#         glob.glob(f"{checkpoints_path}/model_optuna_trial_{trial.number}_epoch=*.ckpt")
#     )

#     best_acc = float("-inf")

#     for i, checkpoint in enumerate(checkpoint_files):
#         model = CoLESModule.load_from_checkpoint(checkpoint, seq_encoder=seq_encoder)
#         metrics, times, acc = evaluate_model(model, trainer, inf_train_loader)

#         metrics_flattened = {f"metric_{k}": round(v, 4) for k, v in metrics.items()}
#         times_flattened = {f"time_{k}": round(v, 4) for k, v in times.items()}
#         trainer = pl.Trainer(accelerator="gpu", devices=1)

#         val_metrics = trainer.validate(model=model, datamodule=data_module)
#         recall_top_k = val_metrics[0].get("valid/recall_top_k", None)
#         result = {
#             **params,
#             "checkpoint": checkpoint,
#             "epoch_num": i,
#             "accuracy": acc,
#             "topk_accuracy": recall_top_k,
#             **metrics_flattened,
#             **times_flattened,
#             "early_stop_epoch": early_stop_epoch,
#         }

#         results_df = pd.DataFrame([result], columns=optuna_columns)

#         if not os.path.exists(output_csv):
#             pd.DataFrame(columns=optuna_columns).to_csv(output_csv, index=False, header=True)
#         results_df.to_csv(output_csv, mode="a", header=False, index=False)

#         # current_metric_value = metrics.get(metric_name, float("-inf"))
#         best_acc = max(best_acc, acc)

#         del model, result, metrics
#         torch.cuda.empty_cache()
#         gc.collect()

#     for ckpt in checkpoint_files:
#         os.remove(ckpt)

#     return best_acc


In [None]:
# import optuna
# import pandas as pd
# import os
# from time import time
# from functools import partial

# # Optuna parameters
# num_trials = 10
# cur_time = time()

# # Path to the file where the best results are saved
# best_trials_csv = "optuna_best_trials_accuracy.csv"

# # Create the file if it does not exist yet
# if not os.path.exists(best_trials_csv):
#     pd.DataFrame(columns=["value", *fixed_params.keys()]).to_csv(best_trials_csv, index=False)

# # Optimize for accuracy only
# study = optuna.create_study(direction="maximize")  # accuracy is maximized directly
# study.optimize(objective, n_trials=num_trials)     # objective returns the best accuracy itself (not 1-accuracy or -accuracy), matching direction="maximize"

# # Retrieve the best result
# best_trial = study.best_trial
# best_result = {
#     "value": best_trial.value,
#     **best_trial.params
# }

# # Save the best result
# df_best = pd.DataFrame([best_result])
# df_best.to_csv(best_trials_csv, mode="a", header=False, index=False)

# # Logs
# print(f"✅ Optimization completed (direction: maximize)")
# print(f"⏱️ Time passed: {time() - cur_time:.2f} sec")
# print(f"🥇 Best trial value: {best_trial.value}")
# print(f"📊 Params: {best_trial.params}")


## Evaluate the model with the best hyperparameters

In [None]:
# Path of the CSV to load (file gender_tr_optuna_best_params.csv under /kaggle/input).
input_csv = (
    "/kaggle/input/gender-tr-best-params/"
    "gender_tr_optuna_best_params.csv"
)
# Read the CSV into a DataFrame using pandas' default parsing options.
best_trials_df = pd.read_csv(filepath_or_buffer=input_csv)

In [None]:
# Turn the current index into a regular "index" column, then shift the column
# labels one step along the chain:
# index -> metric -> value -> batch_size -> learning_rate -> hidden_size.
# NOTE(review): presumably the CSV header is misaligned with its data columns — confirm against the file.
_shifted_labels = {
    "index": "metric",
    "metric": "value",
    "value": "batch_size",
    "batch_size": "learning_rate",
    "learning_rate": "hidden_size",
}
best_trials_df = best_trials_df.reset_index().rename(columns=_shifted_labels)

In [None]:
# Swap two pairs of column labels:
# cnt_min <-> embedding_dim and category_embedding_dim <-> cnt_max.
_label_pairs = [("cnt_min", "embedding_dim"), ("category_embedding_dim", "cnt_max")]
_swapped_labels = {
    **dict(_label_pairs),
    **{second: first for first, second in _label_pairs},
}
best_trials_df = best_trials_df.rename(columns=_swapped_labels)

In [None]:
# Display the table after the column relabelling done in the cells above.
best_trials_df

In [None]:
# Recursively delete /kaggle/working/checkpoints; -f keeps the command silent when the directory is absent.
! rm -rf /kaggle/working/checkpoints

In [None]:
! rm /kaggle/working/optuna_best_metrics_eval.csv

In [None]:
# import pandas as pd
# import torch
# import gc
# import os
# import glob
# from functools import partial
# from time import time
# from pytorch_lightning import Trainer
# from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
# from itertools import islice



# checkpoints_path = "checkpoints"
# os.makedirs(checkpoints_path, exist_ok=True)


# # Columns used for storing the results
# columns = list(best_trials_df.columns) + [
#     "checkpoint", "epoch_num", "accuracy", "early_stop_epoch"
# ] + [f"metric_{m}" for m in metric_names] + [f"time_{m}" for m in metric_names]

# for idx, row in islice(best_trials_df.iterrows(), 2, None):
#     metric_name = row["metric"]
#     print(f"\n=== Processing best params for metric: {metric_name} ===")
#     output_csv = f"optuna_best_metrics_eval_{metric_name}.csv"

#     print(row)
    
#     # Collect the parameters
#     params = {
#         "batch_size": int(row["batch_size"]),
#         "learning_rate": float(row["learning_rate"]),
#         "split_count": int(row["split_count"]),
#         "cnt_min": int(row["cnt_min"]),
#         "cnt_max": int(row["cnt_max"]),
#         "embedding_dim": int(row["embedding_dim"]),
#         "category_embedding_dim": int(row["category_embedding_dim"]),
#         "hidden_size": int(row["hidden_size"]),  # add here if it is present in the CSV
#     }

#     # Load the dataset
#     train_loader = create_datasets(train_dict, valid_dict, params, source_features)

#     # Create the encoders
#     sourceA_encoder = TrxEncoder(
#         embeddings={"mcc_code": {"in": mcc_code_in, "out": 32}, "term_id": {"in": term_id_in, "out": 32}},
#         embeddings_noise=0.003,
#         linear_projection_size=64,
#     )
#     sourceB_encoder = TrxEncoder(
#         embeddings={"tr_type": {"in": tr_type_in, "out": 32}},
#         numeric_values={"amount": "identity"},
#         embeddings_noise=0.003,
#         linear_projection_size=64,
#     )

#     seq_encoder = MultiModalSortTimeSeqEncoderContainer(
#         trx_encoders={"sourceA": sourceA_encoder, "sourceB": sourceB_encoder},
#         input_size=64,
#         hidden_size=params["hidden_size"],
#         seq_encoder_cls=RnnEncoder,
#         type="gru",
#     )

#     model = CoLESModule(
#         seq_encoder=seq_encoder,
#         optimizer_partial=partial(torch.optim.Adam, lr=params["learning_rate"]),
#         lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.5),
#     )

#     early_stopping_callback = EarlyStopping(
#         monitor="loss", patience=5, mode="min", verbose=True
#     )
#     checkpoint_callback = ModelCheckpoint(
#         dirpath=checkpoints_path,
#         filename=f"best_{metric_name}_trial_{idx}_epoch={{epoch:02d}}",
#         save_top_k=-1,
#         every_n_epochs=1,
#     )

#     trainer = Trainer(
#         callbacks=[checkpoint_callback, early_stopping_callback],
#         default_root_dir=checkpoints_path,
#         check_val_every_n_epoch=1,
#         max_epochs=30,
#         accelerator="gpu",
#         devices=1,
#         enable_progress_bar=True,
#         precision=16
#     )

#     # Training
#     trainer.fit(model, train_loader)
#     early_stop_epoch = getattr(trainer.logger, "early_stopping_epoch", None) or num_epochs

#     # Compute the metrics
#     checkpoint_files = sorted(
#         glob.glob(f"{checkpoints_path}/best_{metric_name}_trial_{idx}_epoch=*.ckpt")
#     )
#     model.cpu()
#     del model
#     torch.cuda.empty_cache()

#     for i, checkpoint in enumerate(checkpoint_files):
#         print(f"Evaluating checkpoint #{i}")
#         model = CoLESModule.load_from_checkpoint(checkpoint, seq_encoder=seq_encoder)
#         metrics, times, accuracy = evaluate_model(model, trainer)

#         row_result = {
#             **params,
#             "metric": metric_name,
#             "checkpoint": checkpoint,
#             "epoch_num": i,
#             "accuracy": accuracy,
#             "early_stop_epoch": early_stop_epoch,
#             **{f"metric_{k}": round(v, 4) for k, v in metrics.items()},
#             **{f"time_{k}": round(v, 4) for k, v in times.items()}
#         }

#         # Save the result
#         result_df = pd.DataFrame([row_result], columns=columns)
#         if not os.path.exists(output_csv):
#             pd.DataFrame(columns=columns).to_csv(output_csv, index=False)
#         result_df.to_csv(output_csv, mode="a", index=False, header=False)

#         del model, result_df
#         torch.cuda.empty_cache()
#         gc.collect()

#     # Remove the checkpoints
#     for ckpt in checkpoint_files:
#         os.remove(ckpt)
#     del trainer, train_loader, seq_encoder
#     torch.cuda.empty_cache()
#     gc.collect()

# print("✅ Evaluation of best params complete.")

In [None]:
# import os

# # Load the saved results if the file already exists
# if os.path.exists(output_csv):
#     print('exists')
#     processed_df = pd.read_csv(output_csv)
    
#     processed_params = [
#     tuple(row[["batch_size", "learning_rate", "split_count", "cnt_min", "cnt_max", "embedding_dim", "category_embedding_dim", "hidden_size"]])
#     for _, row in processed_df.iterrows()
# ]

# else:
#     processed_params = set()
#     print("don't exists")

In [None]:
# remaining_hyperparameter_grids = []

# for variable_param_name, hyperparameter_grid in all_hyperparameter_grids:
#     # Convert hyperparameter_grid into a tuple of values (significant parameters only)
#     param_values_tuple = tuple(hyperparameter_grid.values())  # Without sorting!
#     if param_values_tuple not in processed_params:
#         print(param_values_tuple)
#         remaining_hyperparameter_grids.append((variable_param_name, hyperparameter_grid))
#     else:
#         print(1)

# print(f"Remaining hyperparameter sets to process: {len(remaining_hyperparameter_grids)}")


In [None]:
# # Run training only for the remaining hyperparameter sets
# for variable_param_name, params in remaining_hyperparameter_grids:
#     print(f'variable param is {variable_param_name}')
#     print(f"Processing parameters: {params}")

#     train_loader = create_datasets(train_dict, valid_dict, params, source_features)

#     sourceA_encoder_params = dict(
#         embeddings_noise=0.003,
#         linear_projection_size=64,
#         embeddings={
#             "mcc_code": {"in": mcc_code_in, "out": 32},
#             "term_id": {"in": term_id_in, "out": 32},
#         },
#     )

#     sourceB_encoder_params = dict(
#         embeddings_noise=0.003,
#         linear_projection_size=64,
#         embeddings={
#             "tr_type": {"in": tr_type_in, "out": 32},
#         },
#         numeric_values={"amount": "identity"},
#     )

#     sourceA_encoder = TrxEncoder(**sourceA_encoder_params)
#     sourceB_encoder = TrxEncoder(**sourceB_encoder_params)

#     seq_encoder = MultiModalSortTimeSeqEncoderContainer(
#         trx_encoders={
#             "sourceA": sourceA_encoder,
#             "sourceB": sourceB_encoder,
#         },
#         input_size=64,
#         hidden_size=params["hidden_size"],
#         seq_encoder_cls=RnnEncoder,
#         type="gru",
#     )

#     model = CoLESModule(
#         seq_encoder=seq_encoder,
#         optimizer_partial=partial(torch.optim.Adam, lr=params["learning_rate"]),
#         lr_scheduler_partial=partial(torch.optim.lr_scheduler.StepLR, step_size=10, gamma=0.5),
#     )

#     early_stopping_callback = EarlyStopping(
#         monitor="loss",  # Track the loss (noted as the validation loss — TODO confirm)
#         patience=5,  # Number of epochs without improvement before stopping
#         mode="min",  # The loss must be minimized
#         verbose=True
#     )

#     checkpoint_callback = ModelCheckpoint(
#         dirpath=checkpoints_path,
#         filename=f"model_{params['batch_size']}_{params['learning_rate']}_{params['split_count']}_{params['cnt_min']}_{params['cnt_max']}_{params['hidden_size']}{{epoch:02d}}",
#         save_top_k=-1,
#         every_n_epochs=1,
#     )

#     # Train the model
#     pl_trainer = pl.Trainer(
#         callbacks=[checkpoint_callback, early_stopping_callback, custom_logger],
#         default_root_dir=checkpoints_path,
#         check_val_every_n_epoch=1,
#         max_epochs=num_epochs,
#         accelerator="gpu",
#         devices=1,
#         enable_progress_bar=True,
#         precision=16
#     )
#     model.train()
#     pl_trainer.fit(model, train_loader)

#     early_stop_epoch = custom_logger.early_stopping_epoch
#     if early_stop_epoch is None:
#         early_stop_epoch = num_epochs

#     # Process the checkpoints
#     checkpoint_files = glob.glob(f"{checkpoints_path}/model_{params['batch_size']}_{params['learning_rate']}_{params['split_count']}_{params['cnt_min']}_{params['cnt_max']}_{params['hidden_size']}*.ckpt")
#     checkpoint_files.sort()
#     print(f"Elapsed time: {time() - cur_time:.2f} seconds")

#     print(f'Early stop is {early_stop_epoch}')

#     for i, checkpoint in enumerate(checkpoint_files):
#         print(f"Processing checkpoint number {i}")
#         model = CoLESModule.load_from_checkpoint(checkpoint, seq_encoder=seq_encoder)

#         metrics, times, accuracy = evaluate_model(model)
#         metrics_flattened = {f"metric_{k}": round(v, 4) for k, v in metrics.items()}
#         times_flattened = {f"time_{k}": round(v, 4) for k, v in times.items()}

#         new_result = {
#             **params,
#             "checkpoint": checkpoint,
#             "epoch_num": int(i),
#             "accuracy": accuracy,
#             **metrics_flattened,
#             **times_flattened,
#             "early_stop_epoch": int(early_stop_epoch)
#         }

#         new_result["epoch_num"] = int(new_result["epoch_num"])

#         results = pd.DataFrame([new_result], columns=columns)
#         print('----------')
#         print(results)

#         if not os.path.exists(output_csv):  # Check whether the file already exists
#             pd.DataFrame(columns=columns).to_csv(output_csv, mode="w", index=False, header=True)
        
#         results.to_csv(output_csv, mode="a", header=False, index=False)

#         del metrics, accuracy, new_result
#         torch.cuda.empty_cache()
#         gc.collect()

#     print(f"Removing checkpoints for parameters: {params}")
#     for checkpoint in checkpoint_files:
#         os.remove(checkpoint)

#     del model
#     del train_loader
#     torch.cuda.empty_cache()
#     gc.collect()

# print("Optimization complete!")

In [None]:
# df_results = pd.DataFrame(results)
# df_results.to_csv("CoLES_hyperparameter_results.csv", index=False)
# print("Results saved to CoLES_hyperparameter_results.csv")

### Calculate correlations

In [None]:
hyperparameters = ["batch_size", "learning_rate", "split_count", "cnt_min", "cnt_max"]


def _report_rankme_correlation(column, label):
    """Print and return the Pearson correlation between RankMe and one column.

    Args:
        column: name of the ``df_results`` column correlated with ``df_results["rankme"]``.
        label: text used for that column in the printed report.

    Returns:
        A ``(coefficient, p_value)`` tuple as unpacked from ``stats.pearsonr``.
    """
    coefficient, p_value = stats.pearsonr(df_results["rankme"], df_results[column])
    print(f"Correlation between RankMe and {label}: {coefficient:.4f}")
    print(f"P-value: {p_value:.4e}\n")
    return coefficient, p_value


# Correlation between RankMe and accuracy.
rankme_accuracy_corr, rankme_accuracy_pval = _report_rankme_correlation("accuracy", "Accuracy")

# Correlation between RankMe and each hyperparameter.
for param in hyperparameters:
    corr, pval = _report_rankme_correlation(param, param)

### Plot accuracy and RankMe against each hyperparameter

In [None]:
hyperparameters = ["batch_size", "learning_rate", "split_count", "cnt_min", "cnt_max"]

# One scatter figure per hyperparameter, with accuracy on the y-axis.
for param in hyperparameters:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(df_results[param], df_results["accuracy"], alpha=0.7, label="Accuracy")
    ax.set_xlabel(param)
    ax.set_ylabel("Accuracy")
    ax.set_title(f"Accuracy vs {param}")
    ax.grid(True)
    ax.legend()
    plt.show()

In [None]:
# One scatter figure per hyperparameter, with RankMe on the y-axis.
for param in hyperparameters:
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(df_results[param], df_results["rankme"], alpha=0.7, label="RankMe", color="orange")
    ax.set_xlabel(param)
    ax.set_ylabel("RankMe")
    ax.set_title(f"RankMe vs {param}")
    ax.grid(True)
    ax.legend()
    plt.show()