Let's perform EDA and basic training using data from the [IEEE-CIS Fraud Detection](https://www.kaggle.com/competitions/ieee-fraud-detection/) Kaggle competition.

In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dataprep.eda import create_report

In [None]:
# Read the raw transaction and identity tables for each split.
train_df = pd.read_csv("./data/train_transaction.csv")
train_id = pd.read_csv("./data/train_identity.csv")
test_df = pd.read_csv("./data/test_transaction.csv")
test_id = pd.read_csv("./data/test_identity.csv")

# Left-join identity columns onto the transactions by TransactionID; transactions
# without a matching identity row are kept and get NaN in the identity columns.
train_df = train_df.merge(train_id, on="TransactionID", how="left")
test_df = test_df.merge(test_id, on="TransactionID", how="left")

print(train_df.shape)
train_df.head()

In [None]:
# Fraction of missing entries in every column of the training frame.
n_rows = len(train_df)
is_na = train_df.isna().sum() / n_rows

print(f"{(is_na > 0).sum()} out of {len(is_na)} columns have missing values")

# One bar per column (in column order); the bar height is the missing fraction.
bar_positions = np.arange(len(is_na))
plt.figure(figsize=(15, 4))
ax = plt.bar(bar_positions, is_na.values)

There are variables with tons of missing data. We have to account for this when creating our models. Let's take a look at the target variable.

In [None]:
# Class balance of the target: number of rows per isFraud value.
ax = sns.countplot(data=train_df, x="isFraud")

# Series.sum() adds the column in one vectorised pass instead of iterating it
# element by element in Python; int() keeps the plain integer the builtin sum produced.
frauds = int(train_df["isFraud"].sum())
print(f"Train: there are {frauds} frauds, {frauds/len(train_df):.3f} of total")

In [None]:
# Distribution of TransactionAmt for each isFraud value. The y-axis is limited
# to the 0-1000 range, so anything above 1000 is cut off from view.
ax = sns.boxplot(x="isFraud", y="TransactionAmt", data=train_df)
_ = ax.set_ylim(0, 1000)

In [None]:
# Mean of isFraud per ProductCD value, reshaped into a two-column frame for plotting.
fraud_per_productcd = (
    train_df.groupby("ProductCD")["isFraud"]
    .mean()
    .to_frame(name="fraud_proportion")
    .reset_index()
)
fraud_per_productcd.columns = ["ProductCD", "Fraud Proportion"]

ax = sns.barplot(data=fraud_per_productcd, x="ProductCD", y="Fraud Proportion")

According to information available [here](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203), card1-card6 are categorical variables. Let's convert all categorical variables to their appropriate dtype.

In [None]:
# Columns to treat as categorical. Several of them hold numeric codes, so pandas
# would otherwise load them as plain numbers.
categorical_cols = [
    "ProductCD", "card1", "card2", "card3", "card4", "card5", "card6",
    "addr1", "addr2", "P_emaildomain", "R_emaildomain",
    "M1", "M2", "M3", "M4", "M5", "M6", "M7", "M8", "M9",
    "DeviceType", "DeviceInfo",
    "id_12", "id_13", "id_14", "id_15", "id_16", "id_17", "id_18", "id_19",
    "id_20", "id_21", "id_22", "id_23", "id_24", "id_25", "id_26", "id_27",
    "id_28", "id_29", "id_30", "id_31", "id_32", "id_33", "id_34", "id_35",
    "id_36", "id_37", "id_38",
]

# NOTE(review): the competition's test identity file reportedly names these
# columns `id-XX` rather than `id_XX` — confirm against your copy of the data.
# Renaming is a no-op when the names already match, and it keeps the per-column
# lookup on test_df below from raising KeyError.
test_df.columns = [
    name.replace("id-", "id_", 1) if name.startswith("id-") else name
    for name in test_df.columns
]

for col in categorical_cols:
    train_df[col] = train_df[col].astype("category")
    # Convert the test frame's OWN column. Assigning train_df[col] here would
    # overwrite the test features with index-aligned training values.
    test_df[col] = test_df[col].astype("category")

In [None]:
VARIABLE = "card4"

# Mean of isFraud for every value of the selected column, as a two-column frame.
fraud_per_card = (
    train_df.groupby(VARIABLE)["isFraud"]
    .mean()
    .to_frame(name="fraud_proportion")
    .reset_index()
)
fraud_per_card.columns = [VARIABLE, "Fraud Proportion"]

ax = sns.barplot(data=fraud_per_card, x=VARIABLE, y="Fraud Proportion")

In [None]:
VARIABLE = "card6"

# Mean of isFraud for every value of the selected column, as a two-column frame.
fraud_per_card = (
    train_df.groupby(VARIABLE)["isFraud"]
    .mean()
    .to_frame(name="fraud_proportion")
    .reset_index()
)
fraud_per_card.columns = [VARIABLE, "Fraud Proportion"]

ax = sns.barplot(data=fraud_per_card, x=VARIABLE, y="Fraud Proportion")

In [None]:
def plot_top_fraud_proportion(variable: str, top_n: int = 20) -> None:
    """Bar-plot the fraud proportion of the most frequent values of a column.

    Groups the module-level ``train_df`` by ``variable``, computes the mean of
    ``isFraud`` and the row count for every value, keeps the ``top_n`` values
    with the most rows, and draws one bar per value ordered from most to least
    frequent.

    Args:
        variable: Name of the ``train_df`` column to group by.
        top_n: Number of most frequent values to show. Defaults to 20.

    Returns:
        None. The plot is drawn on a new matplotlib figure.
    """
    # observed=True aggregates only the categories that actually occur. Stating
    # it explicitly pins the behaviour across pandas versions, whose default for
    # categorical groupers is changing, and avoids the related FutureWarning.
    fraud_stats = (
        train_df.groupby(variable, observed=True)["isFraud"]
        .agg(["mean", "count"])
        .reset_index()
    )
    fraud_stats.columns = [variable, "Fraud Proportion", "Count"]
    top_values = fraud_stats.nlargest(top_n, "Count").copy()

    # Drop the categories that did not make the cut so the axis only carries the
    # plotted values. Columns that are not categorical have nothing to drop.
    if isinstance(top_values[variable].dtype, pd.CategoricalDtype):
        top_values[variable] = top_values[variable].cat.remove_unused_categories()

    plt.figure(figsize=(12, 3))
    ax = sns.barplot(
        data=top_values,
        x=variable,
        y="Fraud Proportion",
        order=top_values[variable],
    )
    ax.tick_params(axis="x", rotation=90)

plot_top_fraud_proportion("card3")

In [None]:
# Fraud proportion for the most frequent card5 values.
plot_top_fraud_proportion("card5")

In [None]:
# Fraud proportion for the most frequent P_emaildomain values.
plot_top_fraud_proportion("P_emaildomain")

In [None]:
# Fraud proportion for the most frequent R_emaildomain values.
plot_top_fraud_proportion("R_emaildomain")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Split the training frame into the feature matrix and the target column.
X = train_df.drop(columns="isFraud")
y = train_df["isFraud"]

# Replace every categorical column with integer labels, fitting a fresh
# encoder for each column.
for col in categorical_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])

In [None]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer, roc_auc_score
import lightgbm as lgbm


FOLDS = 5

lgbm_params = {
    "n_estimators": 250,
    "random_state": 33,
    "verbose": 0,
}

model = lgbm.LGBMClassifier(**lgbm_params)

# Hold out the final 10% of rows for validation. shuffle=False keeps the row
# order intact, so the TimeSeriesSplit below really trains on earlier rows and
# scores on later ones, and the split is identical on every run. The default
# shuffled split has no fixed seed and would scramble that ordering.
# NOTE(review): assumes the rows of X are in chronological order as loaded —
# confirm, and sort by the time column first if they are not.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.9, shuffle=False
)

splitter = TimeSeriesSplit(n_splits=FOLDS)
fold_scores = []
for count, (train_idx, val_idx) in enumerate(splitter.split(X_train, y_train)):
    X_tr, X_vl = X_train.iloc[train_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model.fit(X_tr, y_tr)
    # ROC AUC on the positive-class probability, computed directly rather than
    # through make_scorer(needs_proba=True), which recent scikit-learn releases
    # no longer accept.
    score = roc_auc_score(y_vl, model.predict_proba(X_vl)[:, 1])
    fold_scores.append(score)
    print(f'{count} CV - score: {round(score, 4)}')

score_mean = sum(fold_scores) / len(fold_scores)
print(f'Mean ROC AUC: {score_mean:.4f}')

In [None]:
# Refit on the full training portion, then score the held-out validation rows.
model.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is the second class.
val_proba = model.predict_proba(X_val)
y_pred = val_proba[:, 1]

score = roc_auc_score(y_val, y_pred)
print(f"Validation ROC AUC score: {score:.4f}")

In [None]:
# Create the output directory first so saving also works on a fresh checkout
# where `models/` does not exist yet.
os.makedirs("models", exist_ok=True)
# Persist the fitted model's underlying booster to a model file.
_ = model.booster_.save_model("models/baseline_lgbm.txt")

In [None]:
# Load the saved booster back from disk and check that its predictions on the
# validation rows match the in-memory model's probabilities.
model_ = lgbm.Booster(model_file="./models/baseline_lgbm.txt")
reloaded_pred = model_.predict(X_val)
assert np.allclose(y_pred, reloaded_pred)