In [10]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import TimeSeriesSplit, KFold
import os

In [3]:
data_dir = '/kaggle/input/ieee-fraud-detection'

In [4]:
train_identity_path = os.path.join(data_dir, "train_identity.csv")
train_transaction_path = os.path.join(data_dir, "train_transaction.csv")
test_identity_path = os.path.join(data_dir, "test_identity.csv")
test_transaction_path = os.path.join(data_dir, "test_transaction.csv")

In [5]:
train_identity = pl.read_csv(train_identity_path)
train_transaction = pl.read_csv(train_transaction_path)
test_identity = pl.read_csv(test_identity_path)
test_transaction = pl.read_csv(test_transaction_path)

In [6]:
train_df = train_transaction.join(train_identity,on='TransactionID',how='left')
test_df=test_transaction.join(test_identity,on='TransactionID',how='left')
print(f"Dataset sizes | train: {train_df.shape}, test: {test_df.shape}")

Dataset sizes | train: (590540, 434), test: (506691, 433)


# Feature Engineering

# Preparing data for modelling

In [23]:
many_null_cols_train = [col for col in train_df.columns if train_df[col].null_count() / train_df.shape[0] > 0.9]
many_null_cols_test = [col for col in test_df.columns if test_df[col].null_count() / test_df.shape[0] > 0.9]

def get_big_top_value_cols(df, threshold):
    big_top_cols = []
    for col in df.columns:
        val_counts = df[col].value_counts(sort=True)
        total = df.height
        top_freq = val_counts['count'][0] / total
        if top_freq > threshold:
            big_top_cols.append(col)
    return big_top_cols

big_top_value_cols_train = get_big_top_value_cols(train_df,0.9)
big_top_value_cols_test = get_big_top_value_cols(test_df,0.9)
    

In [28]:
cols_to_drop = list(set(many_null_cols_train + many_null_cols_test + big_top_value_cols_train + big_top_value_cols_test))
cols_to_drop.remove('isFraud')
cols_to_drop_train = [col for col in cols_to_drop if col in train_df.columns]
cols_to_drop_test = [col for col in cols_to_drop if col in test_df.columns]
len(cols_to_drop_train), len(cols_to_drop_test)

(82, 81)

In [33]:
cols_to_drop_diff = [col for col in cols_to_drop_train if col not in cols_to_drop_test]
cols_to_drop_diff.extend([col for col in cols_to_drop_test if col not in cols_to_drop_train])
cols_to_drop_diff

['id_24',
 'id_23',
 'id_25',
 'id_22',
 'id_27',
 'id_08',
 'id_18',
 'id_07',
 'id_21',
 'id_26',
 'id-07',
 'id-25',
 'id-24',
 'id-27',
 'id-22',
 'id-08',
 'id-21',
 'id-23',
 'id-26']

In [34]:
train_df = train_df.drop(cols_to_drop_train)
test_df = test_df.drop(cols_to_drop_test)

In [35]:
cat_cols = ['id_12', 'id_13', 'id_14', 'id_15', 'id_16', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21', 'id_22', 'id_23', 'id_24', 'id_25', 'id_26', 'id_27', 'id_28', 'id_29',
            'id_30', 'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4','P_emaildomain',
            'R_emaildomain', 'card1', 'card2', 'card3',  'card5', 'addr1', 'addr2', 'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
            'P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3', 'R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
for col in cat_cols:
    if col in train.columns:
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))
        test[col] = le.transform(list(test[col].astype(str).values))   

NameError: name 'train' is not defined

# Modelling

In [None]:
n_fold = 5
folds = TimeSeriesSplit(n_splits = n_fold)
folds = KFold(n_splits = n_fold)