# Importing libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import TimeSeriesSplit, KFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import os
import gc

# Loading data

In [None]:
# Directory that holds the competition CSV files. Locations used so far:
#   /home/hxastur/vscode_projects/ieee-cis-fraud-detection/competition_data
#   /home/hxastur/vscode-projects/ieee-cis-fraud-detection/competition_data
#   /kaggle/input/ieee-fraud-detection
# Set the IEEE_FRAUD_DATA_DIR environment variable to point somewhere else
# without editing this cell; when it is unset (or empty) the original local
# path is used.
data_dir = (
    os.environ.get("IEEE_FRAUD_DATA_DIR")
    or "/home/hxastur/vscode_projects/ieee-cis-fraud-detection/competition_data"
)
# Path of the competition archive inside the data directory.
data_dir_zip = f"{data_dir}/ieee-fraud-detection.zip"

In [None]:
# Full paths of the four competition CSV files inside data_dir.
(
    train_identity_path,
    train_transaction_path,
    test_identity_path,
    test_transaction_path,
) = (
    os.path.join(data_dir, csv_name)
    for csv_name in (
        "train_identity.csv",
        "train_transaction.csv",
        "test_identity.csv",
        "test_transaction.csv",
    )
)

In [None]:
# Load each CSV into its own polars DataFrame, in the same order as the
# path variables.
train_identity, train_transaction, test_identity, test_transaction = (
    pl.read_csv(csv_path)
    for csv_path in (
        train_identity_path,
        train_transaction_path,
        test_identity_path,
        test_transaction_path,
    )
)

In [None]:
# Attach the identity table to the transaction table on TransactionID.
# A left join keeps every transaction, including those with no identity row.
id_join = {"on": "TransactionID", "how": "left"}
train_df = train_transaction.join(train_identity, **id_join)
test_df = test_transaction.join(test_identity, **id_join)
print(f"Dataset sizes | train: {train_df.shape}, test: {test_df.shape}")

In [None]:
# Count the column names that the train and test frames share.
# NOTE: despite its name, this set holds every shared column, not only the
# id columns.
common_id_cols = set(train_df.columns) & set(test_df.columns)
print(len(common_id_cols))

In [None]:
# In the test frame, replace "-" with "_" in every column whose name starts
# with "id" (presumably so the names line up with the train frame — the
# count printed below shows whether the overlap grew).
test_df = test_df.rename(
    {
        col: col.replace("-", "_")
        for col in test_df.columns
        if "-" in col and col.startswith("id")
    }
)
# Recount the shared column names after the rename.
common_id_cols = set(train_df.columns) & set(test_df.columns)
print(len(common_id_cols))

# Feature Engineering

# Preparing data for modelling

In [None]:
# Columns whose share of null entries exceeds the limit, computed separately
# for the train and test frames.
null_ratio_limit = 0.9
train_rows = train_df.shape[0]
test_rows = test_df.shape[0]

many_null_cols_train = [
    name
    for name in train_df.columns
    if train_df[name].null_count() / train_rows > null_ratio_limit
]
many_null_cols_test = [
    name
    for name in test_df.columns
    if test_df[name].null_count() / test_rows > null_ratio_limit
]


def get_big_top_value_cols(df, threshold):
    """Return the columns of *df* dominated by a single value.

    A column is reported when its most frequent value accounts for more than
    ``threshold`` (a fraction between 0 and 1) of all rows.

    Args:
        df: Frame to inspect; must expose ``columns``, ``height`` and
            per-column ``value_counts(sort=True)`` with a ``"count"`` field.
        threshold: Strict lower bound on the top value's share of rows.

    Returns:
        List of column names, in the frame's column order. An empty frame
        yields an empty list.
    """
    # The row count is the same for every column, so read it once.
    total = df.height
    if total == 0:
        # No rows means no value counts to index into and nothing to flag.
        return []

    # value_counts(sort=True) puts the most frequent value first.
    # NOTE(review): null presumably counts as a value here, so mostly-null
    # columns can be flagged too — confirm against the polars docs.
    return [
        col
        for col in df.columns
        if df[col].value_counts(sort=True)["count"][0] / total > threshold
    ]


# Columns where one value fills more than 90% of the rows, for train and
# test respectively.
dominance_threshold = 0.9
big_top_value_cols_train, big_top_value_cols_test = (
    get_big_top_value_cols(frame, dominance_threshold)
    for frame in (train_df, test_df)
)

In [None]:
# Union of every column flagged as mostly-null or single-value-dominated in
# either frame.
drop_candidates = set().union(
    many_null_cols_train,
    many_null_cols_test,
    big_top_value_cols_train,
    big_top_value_cols_test,
)
# Never drop the target. discard() is a no-op when isFraud was not flagged,
# whereas list.remove() would raise ValueError in that case.
drop_candidates.discard("isFraud")
# Sorted so the list is identical from run to run (set order is not).
cols_to_drop = sorted(drop_candidates)

In [None]:
# Remove the flagged columns from both frames.
train_df, test_df = (frame.drop(cols_to_drop) for frame in (train_df, test_df))

In [None]:
# Names of the columns to be treated as categorical and integer-encoded.
cat_cols = [
    # id_12 .. id_38
    *(f"id_{num}" for num in range(12, 39)),
    "DeviceType",
    "DeviceInfo",
    "ProductCD",
    "card4",
    "card6",
    "M4",
    "P_emaildomain",
    "R_emaildomain",
    "card1",
    "card2",
    "card3",
    "card5",
    "addr1",
    "addr2",
    "M1",
    "M2",
    "M3",
    "M5",
    "M6",
    "M7",
    "M8",
    "M9",
    # P_emaildomain_1..3 followed by R_emaildomain_1..3
    *(
        f"{prefix}_emaildomain_{part}"
        for prefix in ("P", "R")
        for part in (1, 2, 3)
    ),
]

In [None]:
# Replace each categorical column with an integer code shared by train and
# test.
for col in cat_cols:
    # Skip names missing from either frame (for instance columns removed by
    # the earlier drop step).
    if col not in train_df.columns or col not in test_df.columns:
        continue

    encoded = f"{col}_encoded"

    # Stack train on top of test so both frames use one code table.
    combined = pl.concat(
        [train_df.select(col), test_df.select(col)], how="vertical"
    )

    # Keep one row per distinct value; its code is the row position of the
    # value's first occurrence in the stacked column.
    indexed = combined.with_row_index()
    first_rows = indexed.unique(subset=col, keep="first")
    mapping = first_rows.with_columns(
        pl.col("index").cast(pl.UInt32).alias(encoded)
    ).select(col, encoded)

    # Swap the raw column for its code; the encoded column takes over the
    # original name.
    train_df = train_df.join(mapping, on=col, how="left")
    train_df = train_df.drop(col).rename({encoded: col})

    test_df = test_df.join(mapping, on=col, how="left")
    test_df = test_df.drop(col).rename({encoded: col})

In [None]:
# Sort the training rows by time once and derive both the features and the
# target from that single sorted frame. Two independent sorts are not
# guaranteed to order rows with equal TransactionDT the same way, which could
# misalign X and y.
train_sorted = train_df.sort("TransactionDT")
X = train_sorted.drop(["isFraud", "TransactionDT", "TransactionID"])
y = train_sorted["isFraud"]

# Test features: everything except the time and id columns.
X_test = test_df.drop("TransactionDT", "TransactionID")

# The full training frames are no longer needed.
del train_df, train_sorted

# Keep only the time and id columns of the test frame. The previous code
# rebound test_df to a nested list of the two column names, which discarded
# the data.
test_df = test_df.select("TransactionDT", "TransactionID")

In [None]:
def clean_inf_nan(df):
    """Return *df* with infinite values in float columns replaced by null.

    Only Float32 and Float64 columns are touched; all other columns and all
    finite values are passed through unchanged. This is the polars
    counterpart of pandas ``df.replace([np.inf, -np.inf], np.nan)``.
    """
    float_names = df.select(pl.col(pl.Float32, pl.Float64)).columns

    replacements = []
    for name in float_names:
        values = pl.col(name)
        # Null out +/-inf, keep every other value as it is.
        replacements.append(
            pl.when(values.is_infinite()).then(None).otherwise(values).alias(name)
        )
    return df.with_columns(replacements)


# Strip infinite values from the train and test feature frames, then ask the
# garbage collector to reclaim whatever the earlier cells released.
X, X_test = map(clean_inf_nan, (X, X_test))

gc.collect()

# Modelling

In [None]:
# Number of cross-validation folds.
n_fold = 5
# Plain K-fold is the splitter in effect. Without shuffling it cuts the rows
# into contiguous index blocks. To validate strictly forward in time instead,
# use TimeSeriesSplit(n_splits=n_fold) here — assign only one splitter, since
# a second assignment silently replaces the first.
folds = KFold(n_splits=n_fold)

In [None]:
# One seed shared by every random component of LightGBM.
lgb_seed = 1337

# LightGBM parameters for the binary fraud classifier, scored with AUC.
# The tuned values from the Bayesian-optimisation result
# (LGB_BO.max["params"]: min_data_in_leaf, num_leaves, learning_rate,
# min_child_weight, bagging_fraction, feature_fraction, reg_lambda,
# reg_alpha, max_depth) are not applied here.
param_lgb = {
    "objective": "binary",
    "save_binary": True,
    **dict.fromkeys(
        (
            "seed",
            "feature_fraction_seed",
            "bagging_seed",
            "drop_seed",
            "data_random_seed",
        ),
        lgb_seed,
    ),
    "boosting_type": "gbdt",
    "verbose": 1,
    "is_unbalance": False,
    "boost_from_average": True,
    "metric": "auc",
}

In [None]:
# Stratified, group-aware splitter with shuffling and a fixed random seed,
# using the same fold count as n_fold.
# NOTE(review): nothing visible here calls skf.split yet and no grouping
# column is chosen; StratifiedGroupKFold.split presumably needs a `groups`
# argument — confirm before use.
skf = StratifiedGroupKFold(n_splits=n_fold, shuffle=True, random_state=42)
# NOTE(review): the draft loop below refers to train_df, which an earlier
# cell removes with `del train_df`; it would need X / y (or a kept copy of
# the frame) to run.
# for train_idx, valid_idx in skf.split(train_df, train_df.isFraud.values):