In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import sys, os
from tqdm import tqdm

from pathlib import Path

sys.path.insert(0, os.path.join(Path('.').resolve().parent.parent))

from fusionlib.predicts import predict # Функция, позволяет получить предсказание нейронки.
from fusionlib.check_budget import check_budget # функция проверки бюджета. Проверяйте допустимость решения до сабмита

In [2]:
BUDGET = 10
from fusionlib.utils import custom_read_csv
from fusionlib.validation import validate_results
import json

def validate_model_(predict_method: object, validation_df_path: str, bins_path: str, 
                    model_path: str, targets_path: str, quantiles_path: str, random_seed=20230206, n_splits=5) -> float:
    
    # загрузка необходимых данных
    targets = pd.read_csv(targets_path)
    with open(quantiles_path, 'r') as f:
        quantiles = json.load(f)
    df_transactions = custom_read_csv(validation_df_path)
    # получение скоров
    result_clear = predict_method(validation_df_path, bins_path, model_path, random_seed=random_seed)
    threshold = result_clear.target.max() / 2 
    poor_user = result_clear.user_id.loc[result_clear.target.argmin()]
    hero_user = result_clear.user_id.loc[result_clear.target.argmax()]


    one_idx = result_clear.index[result_clear.target > threshold]  # Эти пользователи похожи на Героя
    zero_idx = result_clear.index[result_clear.target <= threshold] # А эти на Неудачника

    users = result_clear.user_id.values
    one_users = users[one_idx]
    zero_users = users[zero_idx]

    for user in tqdm(users):
        if user in one_users:
            copy_from = poor_user # похожим на Героя скопируем 10 последних транзакций Неудачника
        else:
            copy_from = hero_user # А похожим на Неудачника наоборот

        idx_to = df_transactions.index[df_transactions.user_id == user][-BUDGET:]
        idx_from = df_transactions.index[df_transactions.user_id == copy_from][-BUDGET:]
        sign_to = np.sign(df_transactions.loc[idx_to, "transaction_amt"].values)
        sign_from = np.sign(df_transactions.loc[idx_from, "transaction_amt"].values)
        sign_mask = (sign_to == sign_from)
        df_transactions.loc[idx_to[sign_mask], "mcc_code"] = df_transactions.loc[idx_from[sign_mask], "mcc_code"].values
        df_transactions.loc[idx_to[sign_mask], "transaction_amt"] = df_transactions.loc[idx_from[sign_mask], "transaction_amt"].values
    
    fraud_df = df_transactions
    result_fraud = predict_method(fraud_df, bins_path, model_path, random_seed=random_seed)

    result_clear = result_clear.rename(columns={'target': 'clear_proba'})
    result_fraud = result_fraud.rename(columns={'target': 'fraud_proba'})
    
    mean_harm_roc_auc = validate_results(result_clear, result_fraud, targets, n_splits=n_splits)
    return mean_harm_roc_auc

In [3]:
validation_df_path = '/Users/xxx/Documents/Programming/vtb-data-fusion-2023/vtb-data-fusion-2023-defence/data/validation_clear_transactions.csv'
bins_path = "/Users/xxx/Documents/Programming/vtb-data-fusion-2023/vtb-data-fusion-2023-defence/models/nn_bins.pickle" # путь до файла с бинами после тренировки модели (nn_bins.pickle)
model_path = "/Users/xxx/Documents/Programming/vtb-data-fusion-2023/vtb-data-fusion-2023-defence/models/nn_weights.ckpt" # путь до файла с весами нейронной сети (nn_weights.ckpt)
quantiles_path = "/Users/xxx/Documents/Programming/vtb-data-fusion-2023/vtb-data-fusion-2023-defence/misc/quantiles.json" # путь до файла с квантилями для таргета (quantiles.pickle)
targets_path = '/Users/xxx/Documents/Programming/vtb-data-fusion-2023/data/target_finetune.csv'


In [4]:
from fusionlib.predicts import reliable_predict

In [5]:
%%time
mean_harm_roc_auc = validate_model_(reliable_predict, validation_df_path, bins_path, model_path, targets_path, quantiles_path)

Global seed set to 20230206
100%|██████████| 2124/2124 [00:34<00:00, 60.74it/s]
Global seed set to 20230206


CPU times: user 9min 9s, sys: 1min 52s, total: 11min 2s
Wall time: 6min 33s


In [6]:
public_mean_harm_roc_auc = 0.705638	
print(f'Разница лидерборда и локальной валидации: {round(abs(public_mean_harm_roc_auc - mean_harm_roc_auc), 4)}')

Разница лидерборда и локальной валидации: 0.0103


In [1]:
mean_harm_roc_auc

NameError: name 'mean_harm_roc_auc' is not defined

In [9]:
%%time
validate_model_(predict, validation_df_path, bins_path, model_path, targets_path, quantiles_path)

Global seed set to 20230206
100%|██████████| 2124/2124 [00:30<00:00, 68.72it/s]
Global seed set to 20230206


CPU times: user 1min 27s, sys: 15.7 s, total: 1min 43s
Wall time: 1min 17s


0.6981571189787755