In [1]:
import os
import sys
from pathlib import Path
from typing import Callable

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Put the directory two levels above the current working directory on the import
# path; this must run before the `fusionlib` imports below.
sys.path.insert(0, os.path.join(Path('.').resolve().parent.parent))

from fusionlib.check_budget import check_budget  # budget check: verify a solution is admissible before submitting
from fusionlib.model import predict  # returns the neural network's predictions
from fusionlib.validation import validate_results

In [2]:
# Load the original (non-attacked) validation transactions and preview the last five rows.
clear_df = pd.read_csv(
    '/Users/xxx/Documents/Programming/vtb-data-fusion-2023/vtb-data-fusion-2023-defence/data/validation_clear_transactions.csv'
)
clear_df.tail()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
2123695,868878,5977,48,-4.504974,2020-10-31 12:20:28
2123696,868878,5411,48,-290.05145,2020-11-01 10:46:24
2123697,868878,6011,48,5304.162,2020-11-01 10:53:18
2123698,868878,5039,48,-3614.7114,2020-11-02 16:45:54
2123699,868878,5039,48,-4592.269,2020-11-16 17:36:50


In [3]:
# Load the attacked validation transactions and preview the last five rows.
fraud_df = pd.read_csv(
    '/Users/xxx/Documents/Programming/vtb-data-fusion-2023/vtb-data-fusion-2023-defence/data/validation_fraud_transactions.csv'
)
fraud_df.tail()

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm
637195,868878,5411,48,-132.14314,2020-10-31 12:20:28
637196,868878,5699,48,-420.82767,2020-11-01 10:46:24
637197,868878,6011,48,5304.162,2020-11-01 10:53:18
637198,868878,5691,48,-1392.5892,2020-11-02 16:45:54
637199,868878,5651,48,-1497.7368,2020-11-16 17:36:50


In [4]:
# Root of the local checkout. Set the FUSION_PROJECT_ROOT environment variable to
# relocate every path below without editing the notebook; the default reproduces
# the original hard-coded locations exactly.
PROJECT_ROOT = os.environ.get(
    'FUSION_PROJECT_ROOT',
    '/Users/xxx/Documents/Programming/vtb-data-fusion-2023',
)
DEFENCE_ROOT = f'{PROJECT_ROOT}/vtb-data-fusion-2023-defence'

clear_df_path = f'{DEFENCE_ROOT}/data/validation_clear_transactions.csv'
fraud_df_path = f'{DEFENCE_ROOT}/data/validation_fraud_transactions.csv'
bins_path = f'{DEFENCE_ROOT}/models/nn_bins.pickle'  # bins file saved after model training (nn_bins.pickle)
model_path = f'{DEFENCE_ROOT}/models/nn_weights.ckpt'  # neural network weights (nn_weights.ckpt)
quantiles_path = f'{DEFENCE_ROOT}/misc/quantiles.json'  # target quantiles (quantiles.json)
BUDGET = 10  # allowed number of transaction changes per user

In [5]:
%%time
# There are no labels for the transactions being attacked, but the model is available,
# so score the original transactions with it: the probability that a user belongs to class 1.
result_clear = predict(
    clear_df_path, bins_path, model_path, random_seed=20230206
).rename(columns={'target': 'clear_proba'})

Global seed set to 20230206


CPU times: user 31 s, sys: 6.79 s, total: 37.8 s
Wall time: 25.2 s


In [6]:
%%time
# Score the attacked transactions with the same model, bins and seed.
result_fraud = predict(
    fraud_df_path, bins_path, model_path, random_seed=20230206
).rename(columns={'target': 'fraud_proba'})

Global seed set to 20230206


CPU times: user 29.7 s, sys: 6.05 s, total: 35.8 s
Wall time: 21.5 s


In [17]:
# Per-user labels used as ground truth for the ROC AUC computations.
targets_path = '/Users/xxx/Documents/Programming/vtb-data-fusion-2023/data/target_finetune.csv'
targets = pd.read_csv(targets_path)

# Keep users scored on both datasets, then attach their labels
# (left join: a user missing from `targets` would get a NaN target).
result = (
    result_clear
    .merge(result_fraud, how='inner', on='user_id')
    .merge(targets, how='left', on='user_id')
)
result.head()

Unnamed: 0,user_id,clear_proba,fraud_proba,target
0,626,0.18711,0.314553,0
1,925,0.003806,0.011549,0
2,939,0.006336,0.019569,0
3,1158,0.060767,0.125721,0
4,1259,0.040798,0.121531,0


In [8]:
# Manual cross-validated metric: for every stratified fold, take the harmonic mean
# of the ROC AUC on the original predictions and on the attacked predictions.
# Only the held-out indices of each split are needed, so the train indices are discarded.
skf = StratifiedKFold(n_splits=5)
roc_aucs = []
for _, test_index in skf.split(result, result.target):
    fold = result.iloc[test_index]
    roc_auc_original = roc_auc_score(fold.target, fold.clear_proba)
    roc_auc_attacked = roc_auc_score(fold.target, fold.fraud_proba)
    # Harmonic mean of the two scores.
    roc_aucs.append(2 / (1 / roc_auc_original + 1 / roc_auc_attacked))

In [9]:
# Average of the per-fold harmonic-mean ROC AUC values.
np.asarray(roc_aucs).mean()

0.7052626665791947

In [10]:
# Score the original/attacked predictions with the library helper
# (`validate_results` is imported in the notebook's imports cell).
mean_harm_roc_auc = validate_results(result_clear, result_fraud, targets, n_splits=5)
print('-------------------- result -------------------- ')
print(f'mean_harm_roc_auc={mean_harm_roc_auc}')

Validation iter=0, roc_auc=0.7168839395598398
Validation iter=1, roc_auc=0.84083689352608
Validation iter=2, roc_auc=0.6489882309306312
Validation iter=3, roc_auc=0.6169727769969544
Validation iter=4, roc_auc=0.6644221565017926
-------------------- result -------------------- 
mean_harm_roc_auc=0.6976207995030597


In [13]:
# Leaderboard score (hard-coded) to compare against the local validation result;
# the printed value is leaderboard minus local.
public_mean_harm_roc_auc = 0.700928
print(f'Разница лидерборда и локальной валидации: {public_mean_harm_roc_auc - mean_harm_roc_auc}')

Разница лидерборда и локальной валидации: 0.0033072004969403324


In [12]:
# Optional convenience: the targets can be given either as a pd.DataFrame or as the
# path to an already saved csv. Here the csv path is passed, reusing `targets_path`
# rather than repeating the literal.
validate_results(result_clear, result_fraud, targets_path)

Validation iter=0, roc_auc=0.7168839395598398
Validation iter=1, roc_auc=0.84083689352608
Validation iter=2, roc_auc=0.6489882309306312
Validation iter=3, roc_auc=0.6169727769969544
Validation iter=4, roc_auc=0.6644221565017926


0.6976207995030597

# Метод скоринга кастомных predict методов

Чтобы снизить порог входа в задачу, разработана функция оценки модели защиты `validate_model`, которая позволяет не углубляться в механику атаки. Чтобы оценить качество очередной версии своей модели, достаточно передать в эту функцию свой predict-метод.

In [16]:
def validate_model(predict_method: Callable[..., pd.DataFrame], clear_df_path: str,
                   fraud_df_path: str, bins_path: str, model_path: str, targets_path: str, random_seed=20230206, n_splits=5) -> float:
    """Score a custom ``predict`` implementation on original and attacked data.

    ``predict_method`` is called twice, first on ``clear_df_path`` and then on
    ``fraud_df_path``, each time as
    ``predict_method(path, bins_path, model_path, random_seed=random_seed)``.
    The ``target`` column of each returned frame is renamed to ``clear_proba`` /
    ``fraud_proba`` and both frames are handed to ``validate_results`` together
    with the targets read from ``targets_path``.

    Args:
        predict_method: Callable with the calling convention shown above that
            returns a DataFrame containing a ``target`` column.
        clear_df_path: Path to the original transactions file.
        fraud_df_path: Path to the attacked transactions file.
        bins_path: Forwarded unchanged to ``predict_method``.
        model_path: Forwarded unchanged to ``predict_method``.
        targets_path: Path to the csv with the targets.
        random_seed: Seed forwarded to both ``predict_method`` calls.
        n_splits: Number of splits forwarded to ``validate_results``.

    Returns:
        The value returned by ``validate_results``.

    Raises:
        ValueError: If a frame returned by ``predict_method`` has no ``target``
            column. Without this check the rename would silently do nothing and
            the failure would surface later, inside ``validate_results``.
    """
    # Load the targets used for scoring.
    targets = pd.read_csv(targets_path)

    # Get the scores: original data first, then attacked data.
    scored = {}
    for proba_column, transactions_path in (
        ('clear_proba', clear_df_path),
        ('fraud_proba', fraud_df_path),
    ):
        prediction = predict_method(transactions_path, bins_path, model_path, random_seed=random_seed)
        if 'target' not in prediction.columns:
            raise ValueError(
                f"predict_method output for {transactions_path!r} has no 'target' column; "
                f"got columns {list(prediction.columns)}"
            )
        scored[proba_column] = prediction.rename(columns={'target': proba_column})

    return validate_results(scored['clear_proba'], scored['fraud_proba'], targets, n_splits=n_splits)

In [18]:
# Score the baseline `predict` through the helper, passing every argument by keyword.
validate_model(
    predict_method=predict,
    clear_df_path=clear_df_path,
    fraud_df_path=fraud_df_path,
    bins_path=bins_path,
    model_path=model_path,
    targets_path=targets_path,
)

Global seed set to 20230206
Global seed set to 20230206


Validation iter=0, roc_auc=0.7168839395598398
Validation iter=1, roc_auc=0.84083689352608
Validation iter=2, roc_auc=0.6489882309306312
Validation iter=3, roc_auc=0.6169727769969544
Validation iter=4, roc_auc=0.6644221565017926


0.6976207995030597

In [14]:
# Read the original transactions with explicit dtypes and parsed timestamps,
# drop rows with missing values, then derive calendar features from the timestamp.
source_file = clear_df_path
transactions_loaded = pd.read_csv(
    source_file,
    parse_dates=["transaction_dttm"],
    dtype={"user_id": int, "mcc_code": int, "currency_rk": int, "transaction_amt": float},
)
transactions_complete = transactions_loaded.dropna()
timestamps = transactions_complete["transaction_dttm"].dt
df_transactions = transactions_complete.assign(
    hour=timestamps.hour,
    day=timestamps.dayofweek,  # day of week, Monday=0
    month=timestamps.month,
    number_day=timestamps.day,  # day of month
)

In [15]:
# Display the prepared transactions frame with the derived hour/day/month/number_day columns.
df_transactions

Unnamed: 0,user_id,mcc_code,currency_rk,transaction_amt,transaction_dttm,hour,day,month,number_day
1500,626,5912,48,-586.027470,2021-02-22 09:40:53,9,0,2,22
1501,626,5411,48,-50.992100,2021-02-22 19:56:32,19,0,2,22
1502,626,6011,48,-3318.621800,2021-02-23 06:29:28,6,1,2,23
1503,626,5411,48,-582.319640,2021-02-23 06:35:36,6,1,2,23
1504,626,5411,48,-461.845340,2021-02-23 19:31:53,19,1,2,23
...,...,...,...,...,...,...,...,...,...
2123695,868878,5977,48,-4.504974,2020-10-31 12:20:28,12,5,10,31
2123696,868878,5411,48,-290.051450,2020-11-01 10:46:24,10,6,11,1
2123697,868878,6011,48,5304.162000,2020-11-01 10:53:18,10,6,11,1
2123698,868878,5039,48,-3614.711400,2020-11-02 16:45:54,16,0,11,2
