# Importing libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import os
import gc

# Loading data

In [None]:
# Directory that holds the competition CSV files.
# Set the IEEE_FRAUD_DATA_DIR environment variable to use another location
# without editing this cell. Locations used so far:
#   /home/hxastur/vscode_projects/ieee-cis-fraud-detection/competition_data
#   /home/hxastur/vscode-projects/ieee-cis-fraud-detection/competition_data
#   /kaggle/input/ieee-fraud-detection
data_dir = os.environ.get(
    "IEEE_FRAUD_DATA_DIR",
    "/home/hxastur/vscode_projects/ieee-cis-fraud-detection/competition_data",
)
# Path of the competition archive inside data_dir.
data_dir_zip = f"{data_dir}/ieee-fraud-detection.zip"

In [None]:
# Full paths of the four competition tables inside data_dir.
(
    train_identity_path,
    train_transaction_path,
    test_identity_path,
    test_transaction_path,
) = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        "train_identity.csv",
        "train_transaction.csv",
        "test_identity.csv",
        "test_transaction.csv",
    )
)

In [None]:
# Read the four CSV tables with polars, in the same order as the paths above.
train_identity, train_transaction, test_identity, test_transaction = map(
    pl.read_csv,
    (
        train_identity_path,
        train_transaction_path,
        test_identity_path,
        test_transaction_path,
    ),
)

In [None]:
# Left-join the identity table onto the transaction table by TransactionID,
# so every transaction row is kept even when it has no identity record.
join_options = {"on": "TransactionID", "how": "left"}
train_df = train_transaction.join(train_identity, **join_options)
test_df = test_transaction.join(test_identity, **join_options)
print(f"Dataset sizes | train: {train_df.shape}, test: {test_df.shape}")

In [None]:
# Count the column names that train and test share before any renaming.
common_id_cols = set(train_df.columns).intersection(test_df.columns)
print(len(common_id_cols))

In [None]:
# Test columns whose name starts with "id" and contains a hyphen are renamed
# to the underscore form (presumably to match the train naming; the shared
# column count printed below should confirm this).
hyphenated_id_names = [
    name for name in test_df.columns if name.startswith("id") and "-" in name
]
test_df = test_df.rename(
    {name: name.replace("-", "_") for name in hyphenated_id_names}
)
# Recount the shared column names after the rename.
common_id_cols = set(train_df.columns).intersection(test_df.columns)
print(len(common_id_cols))

# Feature Engineering

In [None]:
# train_df = train_df.with_columns(
#     pl.col("id_30")
#     .str.split(" ")
#     .list.slice(0, pl.col("id_30").str.split(" ").list.len() - 1)
#     .list.join("_")
#     .alias("OS_id_30")
# )
# train_df = train_df.with_columns(
#     pl.col("id_30")
#     .str.split(" ")
#     .list.tail(1)
#     .list.get(0, null_on_oob=True)
#     .alias("Version_id_30")
# )
# train_df = train_df.with_columns(
#     pl.col("DeviceInfo").str.split("/").list.get(0).alias("DeviceName")
# )
# train_df = train_df.with_columns(
#     pl.col("DeviceInfo")
#     .str.split("/")
#     .list.get(1, null_on_oob=True)
#     .alias("DeviceVersion")
# )
# train_df.filter(pl.col("DeviceInfo").is_not_null())[
#     "DeviceInfo", "DeviceName", "DeviceVersion", "OS_id_30", "Version_id_30"
# ].head()

# Preparing data for modelling

In [None]:
def _mostly_null_columns(frame, max_null_share=0.9):
    """Return the columns of ``frame`` whose share of nulls exceeds ``max_null_share``."""
    row_total = frame.shape[0]
    return [
        name
        for name in frame.columns
        if frame[name].null_count() / row_total > max_null_share
    ]


# Columns that are more than 90 % null, computed separately for each split.
many_null_cols_train = _mostly_null_columns(train_df)
many_null_cols_test = _mostly_null_columns(test_df)


def get_big_top_value_cols(df, threshold):
    """Return the columns of ``df`` that are dominated by a single value.

    A column is reported when the count of its most frequent value divided by
    the number of rows is strictly greater than ``threshold``.

    Args:
        df: Frame exposing ``columns``, ``height`` and column access by name;
            each column must provide ``value_counts(sort=True)`` with a
            ``"count"`` column (used in this notebook with polars DataFrames).
        threshold: Share of rows (0..1) the top value must exceed.

    Returns:
        List of column names, in ``df.columns`` order. Empty when ``df`` has
        no rows.
    """
    # The row count does not depend on the column, so read it once.
    total = df.height
    if total == 0:
        # Without rows no value can dominate; returning early also avoids
        # indexing into an empty value_counts result and dividing by zero.
        return []

    # NOTE(review): polars' value_counts appears to count nulls as a value, so
    # a mostly-null column would be reported here as well - confirm.
    return [
        col
        for col in df.columns
        if df[col].value_counts(sort=True)["count"][0] / total > threshold
    ]


# Columns whose most frequent value covers more than 90 % of the rows,
# computed separately for the train and test frames.
big_top_value_cols_train, big_top_value_cols_test = (
    get_big_top_value_cols(frame, 0.9) for frame in (train_df, test_df)
)

In [None]:
# Columns that must survive regardless of their statistics: the target and
# the two transaction keys that the modelling cells sort by and drop
# explicitly later in this notebook.
protected_cols = {"isFraud", "TransactionDT", "TransactionID"}

# Union of every column flagged in either split, minus the protected ones.
# A set difference is used instead of list.remove() so that a target column
# that was not flagged does not raise ValueError; sorting keeps the order
# reproducible between runs.
cols_to_drop = sorted(
    set(
        many_null_cols_train
        + many_null_cols_test
        + big_top_value_cols_train
        + big_top_value_cols_test
    )
    - protected_cols
)

In [None]:
# Apply the same column removal to both splits.
train_df, test_df = (frame.drop(cols_to_drop) for frame in (train_df, test_df))

In [None]:
# Names of the columns treated as categorical, assembled group by group.
# The resulting order is: id_12..id_38, device / product / card type / email
# columns, card1-3 and card5, addr1-2, the remaining M flags, and finally the
# numbered e-mail domain parts.
cat_cols = (
    [f"id_{idx}" for idx in range(12, 39)]
    + [
        "DeviceType",
        "DeviceInfo",
        "ProductCD",
        "card4",
        "card6",
        "M4",
        "P_emaildomain",
        "R_emaildomain",
    ]
    + [f"card{idx}" for idx in (1, 2, 3, 5)]
    + ["addr1", "addr2"]
    + [f"M{idx}" for idx in (1, 2, 3, 5, 6, 7, 8, 9)]
    + [
        f"{prefix}_emaildomain_{idx}"
        for prefix in ("P", "R")
        for idx in (1, 2, 3)
    ]
)

In [None]:
# Print the shape of the value-count table of every categorical column.
# cat_cols also names columns that may be missing from train_df (removed by
# the column-dropping cell, or never created, such as the *_emaildomain_N
# parts). Selecting a missing column raises, so those are reported and skipped.
available_cols = set(train_df.columns)
for column in cat_cols:
    if column not in available_cols:
        print(f"column: {column}, not present in train_df")
        continue
    print(
        f"column: {column}, value counts shape: {train_df.select(pl.col(column).value_counts()).shape}"
    )

In [None]:
# Values that occur in only one of the P_emaildomain / R_emaildomain columns.
p_domains = set(train_df["P_emaildomain"].value_counts()["P_emaildomain"])
r_domains = set(train_df["R_emaildomain"].value_counts()["R_emaildomain"])
p_domains.symmetric_difference(r_domains)

In [None]:
# Show how often each R_emaildomain value occurs in the training data.
train_df.get_column("R_emaildomain").value_counts()

In [None]:
# Sort the training rows by TransactionDT exactly once and derive both the
# features and the target from that single sorted frame. Sorting twice, as
# before, relies on two independent sorts ordering tied timestamps
# identically for X and y to stay row-aligned.
train_sorted = train_df.sort("TransactionDT")
X = train_sorted.drop(["isFraud", "TransactionDT", "TransactionID"])
y = train_sorted["isFraud"]

# Test features keep the original test row order.
X_test = test_df.drop("TransactionDT", "TransactionID")

# Release the full training frames; only X and y are needed from here on.
del train_df, train_sorted

# Keep just the two key columns of the test frame. The previous code rebound
# test_df to a literal list of lists, which discarded the frame entirely.
test_df = test_df.select(["TransactionDT", "TransactionID"])

In [None]:
def clean_inf_nan(df):
    """Return ``df`` with infinite values in Float32/Float64 columns set to null.

    Only ``is_infinite()`` is checked, so NaN values and every non-float
    column are left untouched. When there are no float columns the frame is
    returned with no columns modified.
    """
    float_names = [
        name
        for name, dtype in zip(df.columns, df.dtypes)
        if dtype in (pl.Float32, pl.Float64)
    ]

    replacements = []
    for name in float_names:
        values = pl.col(name)
        replacements.append(
            pl.when(values.is_infinite()).then(None).otherwise(values).alias(name)
        )
    return df.with_columns(replacements)


# Replace infinities in both feature frames (train first, then test).
X, X_test = (clean_inf_nan(frame) for frame in (X, X_test))

# Ask the garbage collector to reclaim the frames that were just replaced.
gc.collect()

# Modelling

In [None]:
# folds = TimeSeriesSplit(n_splits=n_fold)
# folds = KFold(n_splits=n_fold)

In [None]:
# LightGBM training parameters, forwarded unchanged to lgb.train.
# NOTE(review): bagging_fraction is set without bagging_freq; per the LightGBM
# parameter docs bagging is only performed when bagging_freq > 0 - confirm
# whether bagging is intended here.
params = dict(
    # Device selection
    device="gpu",
    gpu_platform_id=0,
    gpu_device_id=0,
    # Tree shape and sampling
    num_leaves=64,
    min_child_weight=0.03454472573214212,
    feature_fraction=0.3797454081646243,
    bagging_fraction=0.4181193142567742,
    min_data_in_leaf=200,
    # Objective and boosting
    objective="binary",
    max_depth=8,
    learning_rate=0.006883242363721497,
    boosting_type="gbdt",
    bagging_seed=11,
    metric="auc",
    verbosity=-1,
    # Regularisation and seeding
    reg_alpha=0.3899927210061127,
    reg_lambda=0.6485237330340494,
    random_state=47,
)

# Number of cross-validation folds.
n_fold = 5

In [None]:
def train_lgbm_cv(X_pd, y_pd, X_test_pd, params, n_fold=5, cat_cols_idx=None):
    """Train LightGBM with shuffled K-fold CV and average the test predictions.

    For every fold a booster is trained with early stopping, its predictions
    on the validation rows are stored as out-of-fold predictions, and its
    predictions on the test rows are added to a running average.

    Args:
        X_pd: Training features. Accessed through ``.iloc``, ``.columns`` and
            ``.shape``, so a pandas DataFrame is expected.
        y_pd: Training target aligned row-by-row with ``X_pd``; accessed
            through ``.iloc`` (pandas Series expected).
        X_test_pd: Test features passed to ``Booster.predict``.
        params: LightGBM parameter dict forwarded to ``lgb.train``.
        n_fold: Number of K-fold splits.
        cat_cols_idx: Forwarded as ``categorical_feature`` to ``lgb.Dataset``.

    Returns:
        Tuple ``(y_preds, y_oof, feature_importances, mean_auc)``:
        ``y_preds`` is the test prediction averaged over the folds, ``y_oof``
        holds the out-of-fold predictions for the training rows,
        ``feature_importances`` is a DataFrame with a ``feature`` column plus
        one ``fold_<i>`` column per fold, and ``mean_auc`` is the mean of the
        per-fold validation AUC values.
    """
    # Imported locally: the notebook's import cell only brings in
    # sklearn.model_selection and sklearn.preprocessing, so roc_auc_score
    # would otherwise be an undefined name.
    from sklearn.metrics import roc_auc_score

    folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)
    y_preds = np.zeros(X_test_pd.shape[0])
    y_oof = np.zeros(X_pd.shape[0])
    scores = []

    feature_importances = pd.DataFrame({"feature": X_pd.columns})

    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_pd, y_pd)):
        X_train, X_valid = X_pd.iloc[train_index], X_pd.iloc[valid_index]
        y_train, y_valid = y_pd.iloc[train_index], y_pd.iloc[valid_index]

        # Build the datasets, passing the categorical features (if any)
        train_data = lgb.Dataset(
            X_train,
            label=y_train,
            categorical_feature=cat_cols_idx,
            free_raw_data=False,  # keep the raw data so the dataset can be reused
        )
        valid_data = lgb.Dataset(
            X_valid,
            label=y_valid,
            categorical_feature=cat_cols_idx,
            reference=train_data,
        )

        clf = lgb.train(
            params,
            train_data,
            num_boost_round=10000,
            valid_sets=[train_data, valid_data],
            callbacks=[lgb.log_evaluation(200), lgb.early_stopping(500, verbose=False)],
        )

        # Feature importance of this fold
        feature_importances[f"fold_{fold_n + 1}"] = clf.feature_importance()

        # Out-of-fold predictions and the fold score
        y_pred_valid = clf.predict(X_valid)
        y_oof[valid_index] = y_pred_valid
        auc = roc_auc_score(y_valid, y_pred_valid)
        scores.append(auc)
        print(f"Fold {fold_n + 1} | AUC: {auc:.6f}")

        # Each fold contributes 1 / n_fold of the final test prediction
        y_preds += clf.predict(X_test_pd) / n_fold

        # Free the per-fold objects before the next fold
        del clf, train_data, valid_data, X_train, X_valid, y_train, y_valid
        gc.collect()

    mean_auc = np.mean(scores)
    oof_auc = roc_auc_score(y_pd, y_oof)
    print(f"\nMean AUC = {mean_auc:.6f}")
    print(f"Out-of-Folds AUC = {oof_auc:.6f}")

    return y_preds, y_oof, feature_importances, mean_auc

In [None]:
# Number of CV folds; it must equal the n_fold used for training so that the
# fold_<i> columns selected below exist in feature_importances.
n_fold = 5

# NOTE(review): `sub`, `y_preds` and `feature_importances` are not assigned
# anywhere in the visible notebook. Presumably `sub` is the sample submission
# frame and the other two come from train_lgbm_cv(...) - confirm and add the
# missing cell before running this one.

# Save the submission
sub["isFraud"] = y_preds
sub.to_csv("submission.csv", index=False)

# Average the importance over the folds
fold_cols = [f"fold_{i}" for i in range(1, n_fold + 1)]
feature_importances["average"] = feature_importances[fold_cols].mean(axis=1)

# Save the importances
feature_importances.to_csv("feature_importances.csv", index=False)

# Plot the 50 most important features. matplotlib is used directly because
# seaborn (`sns`) is never imported in this notebook, so sns.barplot raised
# NameError.
top_features = feature_importances.sort_values(by="average", ascending=False).head(50)
plt.figure(figsize=(16, 16))
plt.barh(top_features["feature"], top_features["average"])
# barh draws the first row at the bottom; flip so the top feature is on top.
plt.gca().invert_yaxis()
plt.xlabel("average")
plt.ylabel("feature")
plt.title(f"50 TOP feature importance over {n_fold} folds average")
plt.tight_layout()
plt.show()

In [None]:
# skf = StratifiedGroupKFold(n_splits=n_fold, shuffle=True, random_state=42)
# for train_idx, valid_idx in skf.split(train_df, train_df.isFraud.values):