<a href="https://www.kaggle.com/code/mmellinger66/ps3e4-eda-gbdts?scriptVersionId=118985412" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="background-color:rgba(255,114,118, 0.9);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Playground Season 3: Episode 4 - EDA</h1>
</div>

This kernel will focus on Exploratory Data Analysis.


## Problem Type

Binary Classification

## Evaluation Metric

[AUC](https://www.analyticsvidhya.com/blog/2020/06/auc-roc-curve-machine-learning/)

## Resources

### Discussions

### Notebooks

- https://www.kaggle.com/code/parulpandey/10-simple-hacks-to-speed-up-your-data-analysis



<div style="background-color:rgba(255,114,118, 0.9);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

In [1]:
from typing import List, Set, Dict, Tuple, Optional

import os
import time
from pathlib import Path
import glob
import gc

import pandas as pd
import numpy as np

from sklearn import impute
from sklearn import metrics
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import svm
from sklearn import cluster
from sklearn import model_selection
from sklearn import ensemble
from sklearn import decomposition
from sklearn import datasets


import xgboost as xgb
import catboost as cb
import lightgbm as lgb


from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Visualization Libraries
import matplotlib as mpl
import matplotlib.pylab as plt
import seaborn as sns
import missingno as msno
from folium import Map
from folium.plugins import HeatMap
from IPython.display import display_html, display_markdown, display_latex
from colorama import Fore, Style

import warnings
warnings.filterwarnings('ignore')

import pandas_profiling


<div style="background-color:rgba(255,114,118, 0.9);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
# Name of the label column; used for fold stratification, plots and correlations below.
TARGET="Class"

In [3]:
class Config:
    """Notebook-wide settings, read as class attributes (e.g. ``Config.seed``)."""

    path:str = "../input/playground-series-s3e4/"  # directory handed to read_data()
    gpu:bool = False  # when True, the gpu_ify_* helpers are applied to the parameter dicts
    optimize:bool = False  # when True, the XGBoost parameters are replaced by tuned ones
    n_optuna_trials:int = 10  # not referenced in the visible part of the notebook
    fast_render:bool = False  # when True, the slow pair-plot cell is skipped
    calc_probability:bool = True  # True: average test predictions; False: take the mode
    debug:bool = False  # not referenced in the visible part of the notebook
    seed:int = 42  # random seed forwarded to the LightGBM and CatBoost parameter dicts
    N_ESTIMATORS:int = 100  # 100, 300, 1000, 2000, 5000, 15_000, 20_000 GBDT
    GPU_N_ESTIMATORS:int = 1000 # Want models to run fast during dev
    N_FOLDS:int = 5  # number of cross-validation folds

<div class="alert alert-block alert-info">
<b>Tip:</b> Attempting to move as much configuration as possible into the Config class.
</div>

<div style="background-color:rgba(255,114,118, 0.9);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Theme</h1>
</div>

In [4]:
# Global plotting theme for the notebook.
mpl.rcParams["font.size"] = 16

theme_colors = ["#44A7C4", "#20BEC7", "#38D3BB", "#73E4A3", "#B3F186", "#F9F871"]
# sns.set_palette() returns None, so build the palette object first and keep a
# reference to it; a later cell passes theme_palette to seaborn.
theme_palette = sns.color_palette(theme_colors)
sns.set_palette(theme_palette)

# Preview the palette as a strip of swatches without tick labels.
sns.palplot(theme_palette, size=0.8)
plt.tick_params(axis="both", labelsize=0, length=0)

plt.style.use('ggplot') # 'fivethirtyeight'

# Show floats with thousands separators and two decimals in DataFrame output.
pd.options.display.float_format = '{:,.2f}'.format

In [5]:
class clr:
    """Console styling codes used to highlight printed headings."""
    S = Style.BRIGHT + Fore.LIGHTRED_EX  # start marker: bright, light-red text
    E = Style.RESET_ALL  # end marker: reset all styling

<div style="background-color:rgba(255,114,118, 0.9);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

Creating a few functions that will be reused in each project.

I need to be better with [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) on Kaggle.

In [6]:
def read_data(path: str, analyze:bool=True) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """Load train.csv, test.csv and sample_submission.csv from ``path``.

    Args:
        path: Directory that contains the three CSV files.
        analyze: When True, print the shapes, show the first rows and the
            column names of the training data, and run ``eval_features`` and
            ``check_skew`` on it.

    Returns:
        The tuple ``(train, test, submission_df)``.
    """
    root = Path(path)
    train, test, submission_df = (
        pd.read_csv(root / filename)
        for filename in ("train.csv", "test.csv", "sample_submission.csv")
    )

    # Nothing to report: hand the frames straight back.
    if not analyze:
        return train, test, submission_df

    train_rows, train_cols = train.shape
    test_rows, test_cols = test.shape
    print(clr.S + "=== Shape of Data ==="+clr.E)
    print(f" train data: Rows={train_rows}, Columns={train_cols}")
    print(f" test data : Rows={test_rows}, Columns={test_cols}")

    print(clr.S + "\n=== Train Data: First 5 Rows ===\n"+clr.E)
    display(train.head())

    print(f"\n{clr.S}=== Train Column Names ==={clr.E}\n")
    display(train.columns)

    print(f"\n{clr.S}=== Features/Explanatory Variables ==={clr.E}\n")
    eval_features(train)

    print(f"\n{clr.S}=== Skewness ==={clr.E}\n")
    check_skew(train)

    return train, test, submission_df

def create_submission(model_name: str, target, preds, seed:int=42, nfolds:int=5, submission_df:Optional[pd.DataFrame]=None) -> pd.DataFrame:
    """Write predictions into a submission frame and save it as CSV.

    Args:
        model_name: Used in the output file name; when empty the file is
            simply ``submission.csv``.
        target: Name of the column that receives ``preds``.
        preds: Predictions, one per row of the submission frame.
        seed: Seed recorded in the file name.
        nfolds: Fold count recorded in the file name.
        submission_df: Frame to fill in. When omitted, the notebook-level
            ``sample_submission`` is used, as before.

    Returns:
        The submission frame (modified in place).
    """
    if submission_df is None:
        # Backward-compatible default: the frame loaded at notebook level.
        submission_df = sample_submission

    submission_df[target] = preds

    if model_name:
        fname = f"submission_{model_name}_k{nfolds}_s{seed}.csv"
    else:
        fname = "submission.csv"

    submission_df.to_csv(fname, index=False)

    return submission_df

def show_classification_scores(ground_truth:List[int], yhat:List[int]) -> None:
    """Print accuracy, precision, recall, ROC AUC and F1 for a set of predictions.

    Args:
        ground_truth: True labels.
        yhat: Predicted labels.
    """
    scorers = (
        ("Accuracy", metrics.accuracy_score),
        ("Precision", metrics.precision_score),
        ("Recall", metrics.recall_score),
        ("ROC", metrics.roc_auc_score),
        ("f1", metrics.f1_score),
    )
    # Compute every metric before printing, so a failing metric prints nothing.
    results = [(label, scorer(ground_truth, yhat)) for label, scorer in scorers]

    for label, value in results:
        print(f"{label}: {value:.4f}")
    

def label_encoder(train:pd.DataFrame, test:pd.DataFrame, columns:List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Replace string categories with integer codes shared by train and test.

    The codes are derived from the sorted union of the values seen in both
    frames, so a given category maps to the same integer in ``train`` and in
    ``test``. (Fitting a separate encoder per frame, as was done before, can
    assign different codes to the same category.)

    Args:
        train: Training frame; the listed columns are overwritten in place.
        test: Test frame; the listed columns are overwritten in place.
        columns: Names of the columns to encode.

    Returns:
        The tuple ``(train, test)``.
    """
    for col in columns:
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)
        # One shared, sorted vocabulary for both frames.
        categories = sorted(set(train_values) | set(test_values))
        train[col] = pd.Categorical(train_values, categories=categories).codes.astype("int64")
        test[col] = pd.Categorical(test_values, categories=categories).codes.astype("int64")
    return train, test

def create_strat_folds(df:pd.DataFrame, TARGET, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    """Add a ``fold`` column assigning each row to a stratified CV fold.

    Args:
        df: Frame to annotate; modified in place.
        TARGET: Name of the column to stratify on.
        n_folds: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        ``df`` with an integer ``fold`` column in ``0..n_folds-1``.
    """
    print(f"TARGET={TARGET}, n_folds={n_folds}, seed={seed}")
    df["fold"] = -1
    fold_position = df.columns.get_loc("fold")

    kf = model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
    # kf = GroupKFold(n_splits=Config.N_FOLDS)
    for fold, (_, valid_idx) in enumerate(kf.split(df, df[TARGET])):
        # split() yields positional indices, so write with iloc; label-based
        # .loc would only be correct for a default 0..n-1 index.
        df.iloc[valid_idx, fold_position] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df


def create_folds(df:pd.DataFrame, n_folds:int=5, seed:int=42) -> pd.DataFrame:
    """Add a ``fold`` column assigning each row to a shuffled K-fold split.

    Args:
        df: Frame to annotate; modified in place.
        n_folds: Number of folds.
        seed: Random state for the shuffled split.

    Returns:
        ``df`` with an integer ``fold`` column in ``0..n_folds-1``.
    """
    print(f"n_folds={n_folds}, seed={seed}")
    df["fold"] = -1
    fold_position = df.columns.get_loc("fold")

    kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=seed)

    for fold, (_, valid_idx) in enumerate(kf.split(df)):
        # split() yields positional indices, so write with iloc; label-based
        # .loc would only be correct for a default 0..n-1 index.
        df.iloc[valid_idx, fold_position] = fold

    # df.to_csv(f"train_fold{num_folds}.csv", index=False)
    return df

def show_fold_scores(scores: List[float]) -> (float, float):
    """Print and return the mean and standard deviation of per-fold scores.

    The printed "Adjusted" value is the mean minus one standard deviation.

    Args:
        scores: One score per fold.

    Returns:
        The tuple ``(mean, standard deviation)``.
    """
    mean_score = np.mean(scores)  # Used in filename
    spread = np.std(scores)
    adjusted = mean_score - spread
    print(
        f"Scores -> Adjusted: {adjusted:.8f} , mean: {mean_score:.8f}, std: {spread:.8f}"
    )
    return mean_score, spread


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    """Split the columns of ``df`` into continuous and categorical by dtype.

    Columns of dtype int64, float64 or uint8 count as continuous; object and
    bool columns count as categorical.

    Note: a function with the same name is defined again further down in this
    cell, and that later definition is the one in effect afterwards.

    Args:
        df: Frame to inspect.
        display: When True, print both lists.

    Returns:
        The tuple ``(continuous_features, categorical_features)``.
    """
    numeric_dtypes = ['int64', 'float64', 'uint8']
    label_dtypes = ['object', 'bool']
    continuous_features = df.select_dtypes(include=numeric_dtypes).columns.tolist()
    categorical_features = df.select_dtypes(include=label_dtypes).columns.tolist()
    if display:
        print(f"{clr.S}Continuous Features={continuous_features}{clr.E}\n")
        print(f"{clr.S}Categorical Features={categorical_features}{clr.E}")
    return continuous_features, categorical_features

def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    """Print the number of distinct values in each of the given columns.

    Args:
        df: Frame to inspect.
        features: Column names to report on.
    """
    print("=== Cardinality ===")
    unique_counts = df[features].nunique()
    print(unique_counts)

## === Model Support ===    

from scipy.stats import mode


def merge_test_predictions(final_test_predictions:List[float], calc_probability:bool=True) -> List[float]:
    """Combine per-fold test predictions into one prediction per row.

    Args:
        final_test_predictions: Sequence of equally long 1-D prediction
            arrays, one per fold/model.
        calc_probability: When True the predictions are averaged row-wise;
            when False the row-wise mode (majority vote) is returned. The
            argument is honoured directly rather than being overridden by
            ``Config.calc_probability``.

    Returns:
        A 1-D array with one merged prediction per row.
    """
    # Shape (n_rows, n_predictors): each column holds one fold's predictions.
    stacked = np.column_stack(final_test_predictions)

    if calc_probability:
        print("Mean")
        return np.mean(stacked, axis=1)

    print("Mode")
    # ravel() copes with both the keepdims and non-keepdims result shapes.
    return mode(stacked, axis=1)[0].ravel()

def summary_statistics(X:pd.DataFrame, enhanced=True) -> None:
    """Display ``X.describe()`` transposed, with a coolwarm background gradient.

    Args:
        X: Frame to summarise.
        enhanced: When True, add variance, skewness and kurtosis rows
            (computed over numeric columns) before displaying.
    """
    stats = X.describe()
    if enhanced:
        extra_rows = (("var", X.var), ("skew", X.skew), ("kurt", X.kurtosis))
        for row_name, reducer in extra_rows:
            stats.loc[row_name] = reducer(numeric_only=True).tolist()

    with pd.option_context("display.precision", 2):
        # One row per feature, one column per statistic.
        styled = stats.transpose().style.background_gradient(cmap="coolwarm")  # .set_precision(4)
    display(styled)
    
def show_missing_features(df:pd.DataFrame) -> None:
    """Print the missing-value count of every column that has at least one, largest first."""
    na_counts = df.isna().sum()
    na_counts = na_counts.sort_values(ascending=False)
    has_missing = na_counts > 0
    print(na_counts[has_missing])


def show_duplicate_records(df:pd.DataFrame) -> None:
    """Print how many rows of ``df`` are duplicates of an earlier row."""
    n_duplicates = df.duplicated().sum()
    print(n_duplicates)


def eval_features(df:pd.DataFrame) -> (List[str], List[str], List[str]):
    """Report the categorical and numeric columns of ``df``.

    Columns of dtype category/object are treated as categorical, numeric
    dtypes as continuous. For every categorical column the number of distinct
    values is printed; the distinct values themselves are listed too when
    there are fewer than ten.

    Returns:
        The tuple ``(all_features, categorical_features, continuous_features)``
        where ``all_features`` lists the categorical columns first.
    """
    ## Separate Categorical and Numerical Features
    categorical_features = df.select_dtypes(include=["category", "object"]).columns.tolist()
    continuous_features = df.select_dtypes(include=["number"]).columns.tolist()

    print(f"{clr.S}Continuous features:{clr.E} {continuous_features}")
    print(f"{clr.S}Categorical features:{clr.E} {categorical_features}")
    print("\n --- Cardinality of Categorical Features ---\n")

    for feature in categorical_features:
        cardinality = df[feature].nunique()
        if cardinality >= 10:
            print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}")
            continue
        # Few enough values to list them all.
        print(f"{clr.S}{feature}{clr.E}: cardinality={cardinality}, {df[feature].unique()}")

    return categorical_features + continuous_features, categorical_features, continuous_features


def show_feature_importance(feature_importance_lst:List[pd.DataFrame]) -> None:
    """Plot feature importances side by side as a horizontal bar chart.

    Args:
        feature_importance_lst: pandas objects that are concatenated
            column-wise; the result must contain a column named
            ``"0_importance"``, which is used for sorting.
            # presumably one frame per fold, with columns named "<fold>_importance" - confirm against the caller
    """
    fis_df = pd.concat(feature_importance_lst, axis=1)

    # Sort ascending by the "0_importance" column and plot the first 40 rows.
    fis_df.sort_values("0_importance", ascending=True).head(40).plot(
        kind="barh", figsize=(12, 12), title="Feature Importance Across Folds"
    )
    plt.show()


def show_feature_target_crosstab(df:pd.DataFrame, feature_lst:List[str], target:str) -> None:
    """Display a feature-vs-target contingency table (with totals) per feature.

    Args:
        df: Frame holding the features and the target.
        feature_lst: Columns to cross-tabulate against ``target``.
        target: Name of the target column.
    """
    for column in feature_lst:
        print(f"\n=== {column} vs {target} ===\n")
        table = pd.crosstab(df[column], df[target], margins=True)
        display(table)  # display keeps bold formatting


def show_cardinality(df:pd.DataFrame, features:List[str]) -> None:
    """Print a highlighted heading followed by the distinct-value count of each given column."""
    print(f"{clr.S}=== Cardinality ==={clr.E}")
    unique_counts = df[features].nunique()
    print(unique_counts)


def show_unique_features(df:pd.DataFrame, features:List[str]) -> None:
    """Print, for each given column, its name and its sorted distinct non-null values."""
    for name in features:
        distinct = df[name].dropna().unique()
        print(name, sorted(distinct))


def feature_distribution_types(df:pd.DataFrame, display:bool=True) -> (List[str], List[str]):
    """Split the columns of ``df`` into continuous and categorical by dtype.

    Columns of dtype int64, float64 or uint8 count as continuous; object and
    bool columns count as categorical.

    Args:
        df: Frame to inspect.
        display: When True, print both lists.

    Returns:
        The tuple ``(continuous_features, categorical_features)``.
    """
    numeric_dtypes = ["int64", "float64", "uint8"]
    label_dtypes = ["object", "bool"]
    continuous_features = df.select_dtypes(include=numeric_dtypes).columns.tolist()
    categorical_features = df.select_dtypes(include=label_dtypes).columns.tolist()
    if display:
        print(f"{clr.S}Continuous Features={clr.E}{continuous_features}\n")
        print(f"{clr.S}Categorical Features={clr.E}{categorical_features}")
    return continuous_features, categorical_features


def describe(X:pd.DataFrame) -> None:
    """Deprecated: Use summary_statistics().

    Kept for backward compatibility. Displays the same styled table as
    ``summary_statistics(X, enhanced=True)`` and now simply delegates to it
    instead of duplicating its body.
    """
    summary_statistics(X, enhanced=True)
  

def check_skew(df:pd.DataFrame) -> None:
    """Print the skewness of every numeric column, most positive first."""
    skewness = df.skew(skipna=True, numeric_only=True)
    print(skewness.sort_values(ascending=False))
    
def gpu_ify_lgbm(lgbm_dict):
    """Set the GPU-related keys on a LightGBM parameter dict, in place.

    Sets ``device`` to ``"gpu"``, ``boosting_type`` to ``"gbdt"`` and both
    ``gpu_platform_id`` and ``gpu_device_id`` to 0.

    Returns:
        The same dict object that was passed in.
    """
    lgbm_dict.update(
        device="gpu",
        boosting_type="gbdt",
        gpu_platform_id=0,
        gpu_device_id=0,
    )
    return lgbm_dict

def gpu_ify_cb(params):
    """Set ``task_type`` to ``"GPU"`` on a CatBoost parameter dict, in place, and return it."""
    params.update(task_type="GPU")
    return params


<div style="background-color:rgba(255,114,118, 0.9);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data and Analyze</h1>
</div>

## Load the following files

 - train.csv - Data used to build our machine learning model
 - test.csv - Data used to generate the predictions we submit. Does not contain the target variable
 - sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
%%time
# Load the three competition files and print the overview report (analyze=True).
train, test, sample_submission = read_data(Config.path, analyze=True)

[1m[91m=== Shape of Data ===[0m
 train data: Rows=219129, Columns=32
 test data : Rows=146087, Columns=31
[1m[91m
=== Train Data: First 5 Rows ===
[0m


Unnamed: 0,id,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,0.0,2.07,-0.13,-1.14,0.41,-0.19,-1.21,0.11,-0.26,...,-0.33,-0.89,0.34,-0.11,-0.29,0.21,-0.08,-0.06,1.98,0
1,1,0.0,2.0,-1.25,-0.52,-0.89,-1.12,-0.27,-1.03,0.05,...,0.05,-0.04,0.13,-0.46,-0.47,-0.46,-0.01,-0.04,84.0,0
2,2,0.0,0.09,1.0,-0.22,-0.44,0.67,-0.99,0.95,-0.08,...,-0.33,-0.8,0.15,0.95,-0.51,0.09,0.22,0.09,2.69,0
3,3,0.0,1.98,-0.18,-1.06,0.12,-0.22,-0.65,-0.09,-0.04,...,-0.1,-0.08,0.17,-0.04,0.0,-0.1,-0.06,-0.07,1.0,0
4,4,0.0,1.03,-0.17,1.2,1.24,-0.64,1.1,-0.94,0.57,...,0.1,0.61,0.03,-0.26,0.26,-0.25,0.11,0.02,1.0,0



[1m[91m=== Train Column Names ===[0m



Index(['id', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19',
       'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


[1m[91m=== Features/Explanatory Variables ===[0m

[1m[91mContinuous features:[0m ['id', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']
[1m[91mCategorical features:[0m []

 --- Cardinality of Categorical Features ---


[1m[91m=== Skewness ===[0m

Class    21.55
V28      12.83
Amount    9.05
V21       5.94
V20       1.78
V6        1.44
V10       1.30
V26       0.77
V5        0.55
V9        0.45
V23       0.32
V4        0.22
V13       0.11
V11       0.07
id        0.00
V19      -0.08
Time     -0.10
V18      -0.13
V22      -0.20
V7       -0.24
V24      -0.37
V25      -0.45
V16      -0.47
V15      -0.48
V17      -0.62
V14      -0.84
V12      -1.16
V3       -1.29
V1       -1.67
V27      -2.06
V2       -2.17
V8       -6.77
dtype: float64
CPU times: user 4.02 s, sys: 562 ms, total: 4.58 s
Wall time: 6.81 s


In [8]:
# Load an additional credit-card fraud dataset and flag row provenance:
# is_generated is 1 for the competition train/test rows and 0 for the rows of `og`.
og = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
train['is_generated'] = 1
test['is_generated'] = 1
og['is_generated'] = 0

In [9]:
# Append the extra rows to the training data with a fresh 0..n-1 index.
# The appended rows have no id (see the non-null count of `id` in train.info() below).
train = pd.concat([train, og],axis=0, ignore_index=True)

In [10]:
# %%time

# train = add_features(train)    
# test = add_features(test)

In [11]:
train.head()

Unnamed: 0,id,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V22,V23,V24,V25,V26,V27,V28,Amount,Class,is_generated
0,0.0,0.0,2.07,-0.13,-1.14,0.41,-0.19,-1.21,0.11,-0.26,...,-0.89,0.34,-0.11,-0.29,0.21,-0.08,-0.06,1.98,0,1
1,1.0,0.0,2.0,-1.25,-0.52,-0.89,-1.12,-0.27,-1.03,0.05,...,-0.04,0.13,-0.46,-0.47,-0.46,-0.01,-0.04,84.0,0,1
2,2.0,0.0,0.09,1.0,-0.22,-0.44,0.67,-0.99,0.95,-0.08,...,-0.8,0.15,0.95,-0.51,0.09,0.22,0.09,2.69,0,1
3,3.0,0.0,1.98,-0.18,-1.06,0.12,-0.22,-0.65,-0.09,-0.04,...,-0.08,0.17,-0.04,0.0,-0.1,-0.06,-0.07,1.0,0,1
4,4.0,0.0,1.03,-0.17,1.2,1.24,-0.64,1.1,-0.94,0.57,...,0.61,0.03,-0.26,0.26,-0.25,0.11,0.02,1.0,0,1


### Pandas Profiling

In [12]:
# train.profile_report()

## Categorical/Numerical Variables

In [13]:
# Split the columns by dtype and report the cardinality of the categorical ones.
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

# The target and the row id are not model inputs.
cont_features.remove(TARGET)
cont_features.remove("id")
FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class', 'is_generated']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'is_generated']

### Identify Numerical/Continuous that are Potentially Categorical

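Note: the original remark here ("StandardHours, Over18, and EmployeeCount have only 1 value. Should remove them.") was carried over from a different dataset; none of those columns exist in this one. The cell below lists the columns with fewer than 25 unique values (the final entry, `is_generated`, is sliced off).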

In [14]:
# Columns with fewer than 25 distinct values; the last matching entry is sliced off.
unique_counts = train.nunique()
sub25 = unique_counts[unique_counts < 25][:-1]
sub25

Class    2
dtype: int64

In [15]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503936 entries, 0 to 503935
Data columns (total 33 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            219129 non-null  float64
 1   Time          503936 non-null  float64
 2   V1            503936 non-null  float64
 3   V2            503936 non-null  float64
 4   V3            503936 non-null  float64
 5   V4            503936 non-null  float64
 6   V5            503936 non-null  float64
 7   V6            503936 non-null  float64
 8   V7            503936 non-null  float64
 9   V8            503936 non-null  float64
 10  V9            503936 non-null  float64
 11  V10           503936 non-null  float64
 12  V11           503936 non-null  float64
 13  V12           503936 non-null  float64
 14  V13           503936 non-null  float64
 15  V14           503936 non-null  float64
 16  V15           503936 non-null  float64
 17  V16           503936 non-null  float64
 18  V17 

In [16]:
# Summary table for every column except the row id.
summary_statistics(train.drop(columns=["id"], axis=1), enhanced=True) # var,skew,kurtosis

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,var,skew,kurt
Time,503936.0,80709.36,42643.93,0.0,50820.0,72003.0,117290.0,172792.0,1818504609.77,0.44,-0.7
V1,503936.0,0.04,1.74,-56.41,-0.88,0.07,1.24,2.45,3.02,-3.06,32.11
V2,503936.0,0.02,1.46,-72.72,-0.59,0.06,0.81,22.06,2.13,-4.29,97.45
V3,503936.0,0.26,1.39,-48.33,-0.48,0.44,1.17,9.38,1.94,-2.09,23.7
V4,503936.0,0.03,1.35,-5.68,-0.82,0.02,0.84,16.88,1.82,0.51,2.04
V5,503936.0,-0.07,1.26,-113.74,-0.77,-0.13,0.5,34.8,1.58,-1.62,174.76
V6,503936.0,0.06,1.28,-26.16,-0.71,-0.2,0.44,73.3,1.64,1.67,29.9
V7,503936.0,-0.06,1.08,-43.56,-0.6,-0.03,0.49,120.59,1.16,2.21,408.81
V8,503936.0,0.07,1.02,-73.22,-0.16,0.07,0.36,20.01,1.03,-8.92,258.31
V9,503936.0,-0.02,1.08,-13.43,-0.67,-0.09,0.59,15.59,1.17,0.51,2.77


### TARGET Balance

In [17]:
# Class balance of the target: pie chart (left) and count plot (right).
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
# Wedge labels are blanked out; only the percentage is shown on each wedge.
train[TARGET].value_counts().plot(kind="pie",
                                           fontsize=16,
                                           labels=["", ""],
                                           ylabel="",
                                           autopct='%1.1f%%');

plt.subplot(1, 2, 2)
sns.countplot(x=TARGET, data=train, palette="viridis")
plt.show()

### Continuous Distributions

In [18]:
%%time

def plot_continuous_features(train:pd.DataFrame, test:pd.DataFrame, continuous_features:List[str], nrows:int, ncols:int):
    """Overlay train and test histograms (with KDE) for each continuous feature.

    The grid is built from ``nrows`` x ``ncols`` (instead of a hard-coded
    8 x 2 grid that the arguments then overrode), every plot is drawn on its
    own axes explicitly, and grid cells without a feature are hidden.

    Args:
        train: Training frame.
        test: Test frame; must contain every listed feature.
        continuous_features: Columns to plot, one per grid cell.
        nrows: Number of grid rows.
        ncols: Number of grid columns.

    Raises:
        ValueError: If the grid has fewer cells than there are features.
    """
    n_features = len(continuous_features)
    if n_features > nrows * ncols:
        raise ValueError(
            f"{n_features} features do not fit in a {nrows}x{ncols} grid"
        )

    # squeeze=False keeps `axes` two-dimensional even for a single row/column.
    fig, axes = plt.subplots(nrows, ncols, figsize=(20, 52), squeeze=False)
    grid_cells = axes.ravel()
    print(f"{n_features} Continous Features")

    for ax, feature in zip(grid_cells, continuous_features):
        sns.histplot(
            x=feature, color="blue", kde=True, bins=10, label="train_" + feature, data=train, ax=ax
        )
        sns.histplot(
            x=feature, color="olive", kde=True, bins=10, label="test_" + feature, data=test, ax=ax
        )
        ax.set_xlabel(feature, fontsize=9)
        ax.legend()

    # Hide the grid cells that have no feature to show.
    for ax in grid_cells[n_features:]:
        ax.set_visible(False)

    plt.show()
    
plot_continuous_features(train,test,cont_features,17,2)

31 Continous Features
CPU times: user 1min 32s, sys: 14.9 s, total: 1min 47s
Wall time: 1min 22s


### Categorical Distributions

In [19]:
# Count plot per categorical feature on a fixed 4x2 grid.
# NOTE(review): plt.figure() opens an extra figure before plt.subplots() creates
# the one that is drawn on, and the grid has room for at most 8 features.
plt.figure()
fig, ax = plt.subplots(4, 2, figsize=(15, 15))

n_features = len(cat_features)
print(f"{n_features} Categorical Features")

categorical_features = cat_features # 
for i, feature in enumerate(categorical_features):
    print(f"Feature: {feature}")
    plt.subplot(4, 2, i + 1)
    sns.countplot(
        x=feature, label="train_" + feature, data=train
    )
#     sns.countplot(
#         x=feature, color="olive", label="test_" + feature, data=test
#     )

#     sns.histplot(
#         test[feature], color="olive", kde=True, bins=10, label="test_" + feature
#     )
#     plt.xlabel(feature, fontsize=9)
#     plt.legend()
plt.show()

0 Categorical Features


### Look for Outliers

In [20]:
def plot_hist_bloxplot(df:pd.DataFrame, feature:str) -> None:
    """Show a density histogram with KDE curve next to a box plot of one feature.

    Args:
        df: Frame holding the data.
        feature: Name of the numeric column to plot.
    """
    # 11.7 x 4.27 inches is the size the figure previously ended up with after
    # several successive resizes; create it at that size directly.
    fig, (hist_ax, box_ax) = plt.subplots(nrows=1, ncols=2, figsize=(11.7, 4.27))

    values = df[feature]
    values.hist(
        bins=10,
        density=True,
        stacked=True,
        color="blue",
        alpha=0.6,
        ax=hist_ax,
    )
    values.plot(
        kind="density",
        color="red",
        title=f"{feature} Distribution",
        ax=hist_ax,
    )

    # Clip the histogram axes to the observed range. The limits are set on the
    # axes object itself, because plt.xlim() acts on whichever axes is current.
    # Local names avoid shadowing the builtins min/max.
    lower, upper = values.min(), values.max()
    hist_ax.set_xlim(lower, upper)

    sns.boxplot(data=df,
                x=feature,
                color=theme_colors[4],
                ax=box_ax,
               )
    box_ax.set_title(f'{feature} Boxplot')
    plt.show()

# NOTE(review): "EmployeeCount" and "Over18" are not among this dataset's columns,
# so this filter leaves cont_features unchanged here.
cont_features2 = [feature for feature in cont_features if feature not in ["EmployeeCount", "Over18"]]

# One histogram/box-plot pair per continuous feature.
for feature in cont_features2:
    print(f"--- {feature} ---")
    plot_hist_bloxplot(train, feature)   

--- Time ---
--- V1 ---
--- V2 ---
--- V3 ---
--- V4 ---
--- V5 ---
--- V6 ---
--- V7 ---
--- V8 ---
--- V9 ---
--- V10 ---
--- V11 ---
--- V12 ---
--- V13 ---
--- V14 ---
--- V15 ---
--- V16 ---
--- V17 ---
--- V18 ---
--- V19 ---
--- V20 ---
--- V21 ---
--- V22 ---
--- V23 ---
--- V24 ---
--- V25 ---
--- V26 ---
--- V27 ---
--- V28 ---
--- Amount ---
--- is_generated ---


In [21]:
# Violin plot of V1 with quartile lines drawn inside.
sns.violinplot(x='V1', inner='quartile', data=train);

In [22]:
#sns.violin_plots(train, TARGET, cat_features)

In [23]:
# Correlation of every column with the target, sorted from most positive to
# most negative, laid out as a single row.
corr_matrix = train.corr()[[TARGET]].sort_values(by = [TARGET], ascending = False).drop([TARGET]).T
# with pd.option_context('display.precision', 2):
#     style = corr_matrix.transpose().style.background_gradient(cmap='coolwarm')
# display(style)
# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# Styler.format(precision=...) is its replacement.
corr_matrix.style.background_gradient(cmap = 'coolwarm').format(precision=4)

Unnamed: 0,V11,V4,V2,V21,V20,V19,V28,V27,id,Amount,is_generated,V25,V22,V8,V15,V13,V26,V23,V24,Time,V6,V9,V18,V5,V1,V16,V7,V3,V10,V12,V17,V14
Class,0.0924,0.0875,0.0602,0.0279,0.0186,0.0153,0.0118,0.0104,0.0086,0.0075,0.0047,0.0023,0.0009,0.0006,0.0005,-0.0005,-0.0045,-0.0051,-0.0064,-0.0072,-0.026,-0.0468,-0.0524,-0.0524,-0.0759,-0.1128,-0.1246,-0.1321,-0.1358,-0.1497,-0.1847,-0.1901


### Correlation Heatmap

In [24]:
%%time

# Correlation heatmap of the continuous features.
corr = train[cont_features].corr()
# Upper-triangle mask, derived from the matrix already computed rather than
# recomputing the correlations. Only used if `mask=mask` is re-enabled below.
mask = np.triu(corr)

sns.set(font_scale=1.1)
# plt.figure(figsize=(8, 8), dpi=240)
f,ax = plt.subplots(figsize=(20,20))
sns.heatmap(corr, 
            annot=True, 
            fmt='.2f', 
            cmap='PuBuGn', # ('PuBuGn', 'coolwarm', 'seismic')
            square=True, 
#             mask=mask, 
            linewidths=4,
            cbar=False,  # Show color bar at right
            cbar_kws={"shrink": .7},
            vmin = -1,
            vmax = 1,
            ax=ax
           )
plt.show()

CPU times: user 3.59 s, sys: 315 ms, total: 3.91 s
Wall time: 3.67 s


In [25]:
%%time
# Masked (lower-triangle) correlation heatmap of all non-object, non-bool columns except id.
num_feats = list(train.select_dtypes(exclude=["object","bool"]))
num_feats.remove("id")

corr = train[num_feats].corr()
# Hide the upper triangle; reuse the matrix instead of computing it a second time.
mask = np.triu(corr)

sns.set(font_scale=1.1)
plt.figure(figsize=(20, 20), dpi=140)
sns.heatmap(corr, annot=True, fmt='.1f', 
            cmap='coolwarm', 
            square=True, 
            mask=mask, 
            linewidths=1,
            cbar=False)
plt.show()

CPU times: user 4.25 s, sys: 395 ms, total: 4.65 s
Wall time: 4.4 s


### Pair Plot

In [26]:
cont_features

['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'is_generated']

In [27]:
# %%time
# plt.figure(figsize=(15,8))

# sns.pairplot(
#     train,
#     vars=[
#         "WorkLifeBalance",
#         "Education",
#         "Age",
#         "YearsAtCompany",
#     ],
#     hue=TARGET,
# )
# plt.show()


## PCA

- https://www.kaggle.com/code/samuelcortinhas/ps-s3e3-hill-climbing-like-a-gm
- https://www.kaggle.com/competitions/playground-series-s3e3/discussion/380313

In [28]:
def show_pca(df:pd.DataFrame, target:str) -> None:
    """Standardise the features, run PCA and scatter the first two components.

    The cumulative explained-variance ratio is displayed first; the scatter
    plot is coloured by the target.

    Args:
        df: Feature frame. If it contains the ``target`` column, that column
            supplies the colours and is left out of the PCA input.
        target: Name of the label column. When ``df`` does not contain it, the
            labels are looked up in the notebook-level ``train`` frame by the
            index of ``df`` (the previous behaviour, but row-aligned).
    """
    if target in df.columns:
        labels = df[target]
        features = df.drop(columns=[target])
    else:
        # Fallback for feature-only frames: align the labels on df's index.
        labels = train.loc[df.index, target]
        features = df

    X = preprocessing.StandardScaler().fit_transform(features)
    pca = decomposition.PCA()
    pca_df = pd.DataFrame(data=pca.fit_transform(X))

    display(np.cumsum(pca.explained_variance_ratio_))

    plt.figure(figsize=(12,8))
    plt.scatter(pca_df.iloc[:,0], pca_df.iloc[:,1], c=labels, cmap="brg", s=40)
    plt.title('PCA plot in 2D')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
    
show_pca(train[cont_features], TARGET)    

array([0.06294301, 0.12310437, 0.16446396, 0.20503374, 0.24508789,
       0.28355784, 0.31966119, 0.35539087, 0.39021967, 0.42454305,
       0.45827442, 0.49142285, 0.52439865, 0.55659   , 0.58815796,
       0.61961821, 0.65073013, 0.68151862, 0.71206111, 0.74246527,
       0.7724089 , 0.80202383, 0.83109698, 0.85954165, 0.88734923,
       0.91438384, 0.9408404 , 0.96450934, 0.98782954, 0.99858054,
       1.        ])

In [29]:
%%time
# Pair plots are slow; skipped when Config.fast_render is set.
if not Config.fast_render:
    # Dropped an unused full copy of the feature frame and a no-op
    # .drop(columns=[]) that were here before.
    # First grid: the continuous features only.
    sns.pairplot(data=train[cont_features], height=3, diag_kind="hist", palette=theme_palette)
    # Second grid: every column of train.
    sns.pairplot(train, height=3, diag_kind="hist")
    plt.show()

CPU times: user 16min 56s, sys: 19.9 s, total: 17min 16s
Wall time: 17min 14s


## Feature Engineering

In [30]:
def add_features(df:pd.DataFrame) -> pd.DataFrame:
    """Add ``hour`` and ``day`` columns derived from the ``Time`` column.

    ``hour`` is ``Time`` modulo one day, floor-divided by 3600; ``day`` is
    ``Time`` floor-divided by one day, modulo 7. The frame is modified in
    place and returned.
    """
    seconds_per_hour = 3600
    seconds_per_day = 24 * seconds_per_hour
    timestamps = df['Time']

    df['hour'] = (timestamps % seconds_per_day) // seconds_per_hour
    df['day'] = (timestamps // seconds_per_day) % 7

    return df

# https://www.kaggle.com/code/alexandershumilin/ps-s3-e4-ensemble-model

def across_col_feat(df:pd.DataFrame) -> pd.DataFrame:
    '''
    Calculates row-wise aggregate features across the V-columns.

    Only columns named "V" followed by digits (V1, V2, ...) are aggregated.
    Matching on the substring 'V' would also pick up the derived V_* columns
    created here, so a second call on the same frame gave different results.

    Adds V_Sum, V_Min, V_Max, V_Avg, V_Std, V_Pos (count of values > 0),
    V_Neg (count of values < 0) and V_Range (|V_Min - V_Max|). The frame is
    modified in place and returned.
    '''
    features = [
        feat for feat in df.columns
        if isinstance(feat, str) and feat.startswith('V') and feat[1:].isdigit()
    ]
    # Take the source columns once, before any derived column is added.
    v_block = df[features]

    df['V_Sum'] = v_block.sum(axis = 1)
    df['V_Min'] = v_block.min(axis = 1)
    df['V_Max'] = v_block.max(axis = 1)
    df['V_Avg'] = v_block.mean(axis = 1)
    df['V_Std'] = v_block.std(axis = 1)
    df['V_Pos'] = v_block.gt(0).sum(axis = 1)
    df['V_Neg'] = v_block.lt(0).sum(axis = 1)
    df['V_Range'] = abs(df['V_Min'] - df['V_Max'])

    return df

In [31]:
%%time

# Add the hour/day columns to both frames.
train = add_features(train)    
test = add_features(test)

CPU times: user 46.6 ms, sys: 985 µs, total: 47.6 ms
Wall time: 47 ms


In [32]:
%%time
# Add the row-wise V_* aggregate columns to both frames.
train = across_col_feat(train)
test = across_col_feat(test)

CPU times: user 1.5 s, sys: 1.21 s, total: 2.72 s
Wall time: 2.72 s


### Encode Categorical Features

In [33]:
# Columns that must never be used as model inputs.
excluded_features = [TARGET, "id", "fold"]

In [34]:
cat_features

[]

In [35]:
# cat_features is empty for this dataset (see the output above), so this call changes nothing here.
train, test = label_encoder(train, test, cat_features)
# X_test = pd.get_dummies(test[FEATURES], drop_first=True)

train.head()

Unnamed: 0,id,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,hour,day,V_Sum,V_Min,V_Max,V_Avg,V_Std,V_Pos,V_Neg,V_Range
0,0.0,0.0,2.07,-0.13,-1.14,0.41,-0.19,-1.21,0.11,-0.26,...,0.0,0.0,-3.19,-1.21,2.07,-0.11,0.66,11,17,3.28
1,1.0,0.0,2.0,-1.25,-0.52,-0.89,-1.12,-0.27,-1.03,0.05,...,0.0,0.0,-1.69,-1.25,2.0,-0.06,0.75,9,19,3.25
2,2.0,0.0,0.09,1.0,-0.22,-0.44,0.67,-0.99,0.95,-0.08,...,0.0,0.0,-2.19,-0.99,1.0,-0.08,0.55,11,17,1.99
3,3.0,0.0,1.98,-0.18,-1.06,0.12,-0.22,-0.65,-0.09,-0.04,...,0.0,0.0,-0.63,-1.5,1.98,-0.02,0.65,9,19,3.48
4,4.0,0.0,1.03,-0.17,1.2,1.24,-0.64,1.1,-0.94,0.57,...,0.0,0.0,5.61,-0.94,1.39,0.2,0.65,16,12,2.33


In [36]:
# Recompute the feature lists now that the engineered columns exist.
cont_features, cat_features = feature_distribution_types(train, display=True)
show_cardinality(train, cat_features)

# Filter out the target, the id and the fold column.
cont_features = [feature for feature in cont_features if feature not in excluded_features]
cat_features = [feature for feature in cat_features if feature not in excluded_features]

FEATURES = cont_features + cat_features
FEATURES

[1m[91mContinuous Features=[0m['id', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class', 'is_generated', 'hour', 'day', 'V_Sum', 'V_Min', 'V_Max', 'V_Avg', 'V_Std', 'V_Pos', 'V_Neg', 'V_Range']

[1m[91mCategorical Features=[0m[]
[1m[91m=== Cardinality ===[0m
Series([], dtype: float64)


['Time',
 'V1',
 'V2',
 'V3',
 'V4',
 'V5',
 'V6',
 'V7',
 'V8',
 'V9',
 'V10',
 'V11',
 'V12',
 'V13',
 'V14',
 'V15',
 'V16',
 'V17',
 'V18',
 'V19',
 'V20',
 'V21',
 'V22',
 'V23',
 'V24',
 'V25',
 'V26',
 'V27',
 'V28',
 'Amount',
 'is_generated',
 'hour',
 'day',
 'V_Sum',
 'V_Min',
 'V_Max',
 'V_Avg',
 'V_Std',
 'V_Pos',
 'V_Neg',
 'V_Range']

In [37]:
# Assign each training row to a cross-validation fold, stratified on the target.
# train = create_folds(train, Config.N_FOLDS)
train = create_strat_folds(train, TARGET, Config.N_FOLDS)

TARGET=Class, n_folds=5, seed=42


In [38]:
# LightGBM parameter set used by the "lgbm2" classifier in model_clf_dict.
# Unlike lgbm_params, this dict sets no seed.
lgbm_params01 = {
    'objective': 'binary',
     'metric': 'auc',
     'feature_pre_filter': False,
     'lambda_l1': 1.9488299167684667e-07,
     'lambda_l2': 9.456184670156514,
     'num_leaves': 6,
     'feature_fraction': 0.8,
     'bagging_fraction': 0.8065,
     'bagging_freq': 4,
     'min_child_samples': 10,
     'num_iterations': 400,
     'learning_rate':0.05
}

# Switch the parameter set to GPU training when requested in Config.
if Config.gpu:
    lgbm_params01 = gpu_ify_lgbm(lgbm_params01)


In [39]:
# LightGBM parameter set used by the "lgbm1" models; tree count and seed come from Config.
lgbm_params = {
    'objective': 'binary', # regression, auc
    'metric': 'auc',
    "n_estimators": Config.N_ESTIMATORS, # N_ESTIMATORS, GPU_N_ESTIMATORS
    'max_depth': 9,
    'learning_rate': 0.01,
    'min_data_in_leaf': 36, 
    'num_leaves': 100, 
    'feature_fraction': 0.8, 
    'bagging_fraction': 0.89, 
    'bagging_freq': 5, 
    'lambda_l2': 10,
    
    'seed': Config.seed,
#     'boosting_type': 'gbdt',
#     'device': 'gpu', 
#     'gpu_platform_id': 0,
#     'gpu_device_id': 0,
    'n_jobs': -1,
#    'metric': 'rmse',
    'verbose': -1
}

# Switch the parameter set to GPU training when requested in Config.
if Config.gpu:
    lgbm_params = gpu_ify_lgbm(lgbm_params)

In [40]:
# CatBoost parameter set used by the "cat1" classifier; iterations and seed come from Config.
cb_params = {
#     "objective": "binary",
    "eval_metric": "AUC",
    "learning_rate": 0.05,
    "l2_leaf_reg": 3.1572972266001518,
    "bagging_temperature": 0.6799604234141348,
    "random_strength": 1.99590400593318,
    "depth": 9,
    "min_data_in_leaf": 93,
    "iterations": Config.N_ESTIMATORS, #Config.N_ESTIMATORS,GPU_N_ESTIMATORS
    "use_best_model": True,
    #     "task_type": "GPU",
    "random_seed": Config.seed,
}

# Switch the parameter set to GPU training when requested in Config.
if Config.gpu:
    cb_params = gpu_ify_cb(cb_params)
#     cb_params["task_type"] = "GPU"

In [41]:
# XGBoost parameter set used by the "xgb1" classifier. No seed is set here.
xgb_params = {'n_estimators': Config.N_ESTIMATORS,
                 'max_depth': 3,
                 'learning_rate': 0.1,
                 'min_child_weight': 4,
                 'subsample': 0.7,
                 'colsample_bytree': 0.3
             }

# NOTE(review): best_lgbm_params is not defined anywhere in the visible notebook,
# and by its name it holds LightGBM (not XGBoost) parameters. This branch would
# raise NameError if Config.optimize were True - confirm the intended variable.
if Config.optimize:
    xgb_params = best_lgbm_params


In [42]:
# Regression models keyed by a short name.
# NOTE(review): lgbm_params sets objective='binary' but is passed to a regressor here - confirm this is intended.
model_reg_dict = {
    "lgbm1": lgb.LGBMRegressor(**lgbm_params),


}

In [43]:
# Candidate classifiers keyed by a short model name.
# The scikit-learn estimators below are created without a random_state.
model_clf_dict = {
    "xgb1": xgb.XGBClassifier(**xgb_params),
    "lgbm1": lgb.LGBMClassifier(**lgbm_params),
    "lgbm2": lgb.LGBMClassifier(**lgbm_params01),
    "cat1": cb.CatBoostClassifier(**cb_params),
    "log_reg": linear_model.LogisticRegression(),
    "log_reg2": linear_model.LogisticRegression(
        max_iter=1000, C=0.0001, penalty="l2", solver="newton-cg"
    ),
    "log_reg3": linear_model.LogisticRegression(
        max_iter=1000, C=0.2, penalty="l1", solver = 'saga'
    ),
    "svc": svm.SVC(C = 100, gamma = 1, kernel = 'rbf', probability = True),
    "rfc": ensemble.RandomForestClassifier(max_depth = 7, min_samples_leaf = 5, min_samples_split = 2, n_estimators = 300)

}

In [44]:
# Empty, typed results table: one row per evaluated model.
score_columns = {"Model": "str", "Score": "float", "StdDev": "float", "RunTime": "float"}
all_cv_scores = pd.DataFrame(
    {name: pd.Series(dtype=kind) for name, kind in score_columns.items()}
)

# Out-of-fold frame: target and fold per training row, indexed by id.
oof = train[["id", TARGET, "fold"]].reset_index(drop=True).set_index("id")
oof.head()

Unnamed: 0_level_0,Class,fold
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0,1
1.0,0,2
2.0,0,3
3.0,0,3
4.0,0,3


In [45]:
def show_tree_model_fi(model, features:List[str]) -> None:
    """Print each feature's share of the model's total importance, largest first.

    Args:
        model: Fitted estimator exposing a ``feature_importances_`` array.
        features: Feature names, positionally aligned with
            ``model.feature_importances_``.
    """
    print("\n=== Model Feature Importance ===")
    # Read the attribute and compute the total once instead of repeating both
    # on every loop iteration.
    importances = model.feature_importances_
    shares = importances / importances.sum()
    for i in importances.argsort()[::-1]:
        print(features[i], shares[i])

def save_oof_predictions(model_name:str, final_valid_predictions, oof:pd.DataFrame) -> pd.DataFrame:
    """Attach one model's validation predictions to the out-of-fold frame.

    The predictions are converted with ``process_valid_predictions`` (using
    "id" as the index name), previewed with ``display``, and stored in ``oof``
    under the column ``pred_<model_name>``.

    Args:
        model_name: Short model name used to build the column name.
        final_valid_predictions: Mapping of row id to validation prediction.
        oof: Out-of-fold frame; it is modified in place and also returned.

    Returns:
        The same ``oof`` frame with the new prediction column.
    """
    pred_col = f"pred_{model_name}"
    valid_df = process_valid_predictions(final_valid_predictions, "id", model_name)
    display(valid_df.head())
    # Column assignment aligns the predictions to oof on the index.
    oof[pred_col] = valid_df[pred_col]
    return oof

def save_test_predictions(model_name:str, final_test_predictions, submission_df:pd.DataFrame, result_field:str=TARGET) -> None:
    """Merge per-fold test predictions and write a per-model submission file.

    The fold predictions are combined by ``merge_test_predictions`` and stored
    in ``submission_df`` as ``target_<model_name>`` (the frame is modified in
    place). A two-column copy (``id`` plus the prediction renamed to
    ``result_field``) is written to ``submission_<model_name>.csv``.

    Args:
        model_name: Short model name used in the column and file names.
        final_test_predictions: List with one test-prediction array per fold.
        submission_df: Submission frame containing an ``id`` column.
        result_field: Name of the prediction column in the written CSV.
    """
    target_col = f"target_{model_name}"
    submission_df[target_col] = merge_test_predictions(
        final_test_predictions, Config.calc_probability
    )

    # Per-model file that can be submitted on its own.
    ss = (
        submission_df[["id", target_col]]
        .reset_index(drop=True)
        .rename(columns={target_col: result_field})
    )
    ss.to_csv(f"submission_{model_name}.csv", index=False)

def process_valid_predictions(final_valid_predictions, train_id, model_name:str) -> pd.DataFrame:
    """Turn an id -> prediction mapping into a sorted, id-indexed frame.

    The frame has a single column ``pred_<model_name>``, is indexed by
    ``train_id`` in ascending order, and is also written to
    ``train_pred_<model_name>.csv`` in the working directory.

    Args:
        final_valid_predictions: Mapping of row id to prediction.
        train_id: Name to give the id index.
        model_name: Short model name used in the column and file names.

    Returns:
        The id-indexed prediction frame.
    """
    pred_col = f"pred_{model_name}"

    frame = pd.DataFrame.from_dict(final_valid_predictions, orient="index")
    frame = frame.reset_index()
    frame.columns = [train_id, pred_col]
    frame = frame.set_index(train_id).sort_index()

    frame.to_csv(f"train_pred_{model_name}.csv", index=True)
    return frame

def add_score(score_df:pd.DataFrame, model_name:str, score:float, std:float) -> pd.DataFrame:
    """Return a copy of ``score_df`` with one more result row appended.

    Args:
        score_df: Results table with ``Model``, ``Score`` and ``StdDev`` columns.
        model_name: Value for the ``Model`` column.
        score: Value for the ``Score`` column.
        std: Value for the ``StdDev`` column.

    Returns:
        A new frame; ``score_df`` itself is not modified.
    """
    # Use the function's own parameters (the previous body read undefined
    # names), and pd.concat because DataFrame.append was removed in pandas 2.0.
    new_row = pd.DataFrame([{"Model": model_name, "Score": score, "StdDev": std}])
    return pd.concat([score_df, new_row], ignore_index=True)

In [46]:
def train_xgb_model(
    df:pd.DataFrame,
    test:pd.DataFrame,
    get_model_fn,
    FEATURES:List[str],
    TARGET:str,
    calc_probability:bool,
    rowid:str,
    params,
    n_folds:int=5,
    seed:int=42,
):
    """Cross-validate one estimator over the pre-assigned folds in ``df``.

    For every fold value in ``range(n_folds)`` the estimator is fitted on the
    rows whose ``fold`` differs, evaluated on the rows whose ``fold`` matches
    (ROC AUC via ``metrics.roc_auc_score``), and used to predict the test set.

    Args:
        df: Training frame with the feature columns, ``TARGET``, a ``fold``
            column and the ``rowid`` column.
        test: Test frame containing at least the ``FEATURES`` columns.
        get_model_fn: Despite the name, an estimator *instance* (it is not
            called). The same object is refitted on every fold.
        FEATURES: Feature column names.
        TARGET: Target column name.
        calc_probability: If true, score and test predictions use
            ``predict_proba(...)[:, 1]``; otherwise ``predict``.
        rowid: Name of the id column used to key the validation predictions.
        params: Only printed; it is not passed to the estimator.
        n_folds: Number of folds to iterate over.
        seed: Accepted for interface compatibility; not used in this function.

    Returns:
        Tuple ``(model, feature_importance_lst, fold_scores,
        final_valid_predictions, final_test_predictions)`` where ``model`` is
        the estimator as fitted on the last fold, ``feature_importance_lst``
        holds one single-column importance frame per fold,
        ``final_valid_predictions`` maps row id to the validation prediction
        from ``predict``, and ``final_test_predictions`` holds one test
        prediction array per fold.
    """
    print(params)
    final_test_predictions = []
    final_valid_predictions = {}
    fold_scores = []  # Scores of the validation sets
    feature_importance_lst = []

    # The test features do not change between folds, so slice/copy them once.
    xtest = test[FEATURES].copy()

    # Bound before the loop so the return value is defined even for n_folds == 0.
    model = get_model_fn

    for fold in range(n_folds):
        print(10 * "=", f"Fold {fold+1}/{n_folds}", 10 * "=")

        start_time = time.time()

        train_part = df[df.fold != fold].reset_index(drop=True)  # everything outside the validation fold
        valid_part = df[df.fold == fold].reset_index(drop=True)

        # Honour the rowid argument instead of a hard-coded "id" column.
        valid_ids = valid_part[rowid].values.tolist()

        ytrain = train_part[TARGET]
        yvalid = valid_part[TARGET]

        xtrain = train_part[FEATURES]
        xvalid = valid_part[FEATURES]

        # No early stopping is configured; the validation fold is only passed
        # as the evaluation set.
        model.fit(
            xtrain,
            ytrain,
            eval_set=[(xvalid, yvalid)],
            verbose=False,
        )

        if calc_probability:
            preds_valid = model.predict_proba(xvalid)[:, 1]
            test_preds = model.predict_proba(xtest)[:, 1]
            preds_valid_class = model.predict(xvalid)
        else:
            preds_valid = model.predict(xvalid)
            test_preds = model.predict(xtest)
            # Same call on the same data: reuse instead of predicting twice.
            preds_valid_class = preds_valid

        final_test_predictions.append(test_preds)
        if Config.debug:
            print(f"GT Type: {type(yvalid.values)}")
            print(f"Preds Type: {type(preds_valid_class)}")
            print(f"         GT:{yvalid.values[:20]}")
            print(f"Preds Class:{preds_valid_class[:20]}")
            print(f"Preds Prob:{preds_valid[:20]}")

        # NOTE(review): the out-of-fold store receives the `predict` output,
        # not the probabilities used for the fold score -- confirm that is the
        # value wanted downstream.
        final_valid_predictions.update(dict(zip(valid_ids, preds_valid_class)))

        fold_score = metrics.roc_auc_score(yvalid.values, preds_valid)  # validation set score
        show_classification_scores(yvalid.values, preds_valid_class)
        fold_scores.append(fold_score)

        # Per-fold feature importance, one column named after the fold.
        fi = pd.DataFrame(
            index=FEATURES,
            data=model.feature_importances_,
            columns=[f"{fold}_importance"],
        )
        feature_importance_lst.append(fi)

        run_time = time.time() - start_time

        print(f"fold: {fold+1}, Score: {fold_score}, Run Time: {run_time:.2f}")

    return (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    )

In [47]:
def run_tree_model(model_dict, model_name:str, features:List[str], params, oof:pd.DataFrame) -> Tuple[float, float, pd.DataFrame]:
    """Cross-validate one registered model and persist its predictions.

    Trains ``model_dict[model_name]`` with ``train_xgb_model`` on the notebook
    globals ``train`` / ``test``, reports fold scores and feature importances,
    adds the validation predictions to ``oof`` and writes the test predictions
    through ``save_test_predictions`` (using the global ``sample_submission``).

    Args:
        model_dict: Registry mapping model names to estimator instances.
        model_name: Key of the model to run.
        features: Feature column names.
        params: Forwarded to ``train_xgb_model``.
        oof: Out-of-fold frame to extend with this model's predictions.

    Returns:
        ``(cv_score, std_dev, oof)`` where the first two values are those
        returned by ``show_fold_scores``.
    """
    (
        model,
        feature_importance_lst,
        fold_scores,
        final_valid_predictions,
        final_test_predictions,
    ) = train_xgb_model(
        train,
        test,
        model_dict[model_name],
        features,
        TARGET,
        Config.calc_probability,
        "id",
        params,
        Config.N_FOLDS,
        Config.seed,
    )

    cv_score, std_dev = show_fold_scores(fold_scores)
    show_tree_model_fi(model, features)
    show_feature_importance(feature_importance_lst)

    oof = save_oof_predictions(model_name, final_valid_predictions, oof)
    save_test_predictions(model_name, final_test_predictions, sample_submission, TARGET)

    return cv_score, std_dev, oof

In [48]:
%%time

def run_models4features(model_dict, model_lst:List[str], target:str, feature_lst:List[str], all_cv_scores:pd.DataFrame, linear_models:bool=True) -> pd.DataFrame:
    """Run every model named in ``model_lst`` and collect its CV results.

    Args:
        model_dict: Registry mapping model names to estimator instances.
        model_lst: Names of the models to run, in order.
        target: Target column name (used to seed the out-of-fold frame).
        feature_lst: Feature column names.
        all_cv_scores: Results table to extend.
        linear_models: If true each model goes through ``run_linear_model``,
            otherwise through ``run_tree_model``.

    Returns:
        A new results table with one added row per model (``Model``,
        ``Score``, ``StdDev``, ``RunTime``).

    Note:
        The out-of-fold frame is built and updated locally but is not
        returned; only the score table leaves this function.
    """
    oof = train[["id", target, "fold"]].copy().reset_index(drop=True).copy()
    oof.set_index("id", inplace=True)

    for model in model_lst:
        start_time = time.time()

        print(f"Model={model}")

        params = {}
        if linear_models:
            cv_score, std_dev, oof = run_linear_model(model_dict, model, feature_lst, oof)
        else:
            cv_score, std_dev, oof = run_tree_model(model_dict, model, feature_lst, params, oof)

        run_time = time.time() - start_time

        score_dict = {"Model": model, "Score": cv_score, "StdDev": std_dev, "RunTime": run_time}
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        all_cv_scores = pd.concat(
            [all_cv_scores, pd.DataFrame([score_dict])], ignore_index=True
        )
        print(f"Model Run Time: {run_time:.2f}")

    return all_cv_scores




CPU times: user 18 µs, sys: 0 ns, total: 18 µs
Wall time: 22.9 µs


## Tree Models

In [49]:
%%time
# Tree-based classifiers to cross-validate, in run order.
model_lst = ["xgb1", "lgbm2", "lgbm1", "cat1"]

all_cv_scores = run_models4features(
    model_clf_dict,
    model_lst,
    TARGET,
    FEATURES,
    all_cv_scores,
    linear_models=False,
)

Model=xgb1
{}
Accuracy: 0.9987
Precision: 0.8875
Recall: 0.3679
ROC: 0.6839
f1: 0.5201
fold: 1, Score: 0.9326893069140171, Run Time: 203.80
Accuracy: 0.9988
Precision: 0.8953
Recall: 0.4010
ROC: 0.7005
f1: 0.5540
fold: 2, Score: 0.9283840316781815, Run Time: 470.46
Accuracy: 0.9988
Precision: 0.8817
Recall: 0.4271
ROC: 0.7135
f1: 0.5754
fold: 3, Score: 0.9235444159335289, Run Time: 912.93
Accuracy: 0.9987
Precision: 0.8718
Recall: 0.3542
ROC: 0.6770
f1: 0.5037
fold: 4, Score: 0.9246251988170385, Run Time: 1197.13
Accuracy: 0.9987
Precision: 0.8438
Recall: 0.4219
ROC: 0.7109
f1: 0.5625
fold: 5, Score: 0.9239018982885167, Run Time: 1401.56
Scores -> Adjusted: 0.92314258 , mean: 0.92662897, std: 0.00348639

=== Model Feature Importance ===
V14 0.08808211
V12 0.08552677
V17 0.06293762
V10 0.043838613
V16 0.040337473
is_generated 0.04023394
V9 0.038054902
V_Std 0.033895403
V_Sum 0.03097106
V4 0.029745346
V_Min 0.029566295
V_Range 0.029420273
V11 0.026261574
V_Avg 0.026235338
V6 0.02490579
V

Unnamed: 0_level_0,pred_xgb1
id,Unnamed: 1_level_1
0.0,0
1.0,0
2.0,0
3.0,0
4.0,0


Mean
Model Run Time: 4187.72
Model=lgbm2
{}
Accuracy: 0.9987
Precision: 0.8795
Recall: 0.3782
ROC: 0.6891
f1: 0.5290
fold: 1, Score: 0.9310012678449238, Run Time: 183.30
Accuracy: 0.9988
Precision: 0.9080
Recall: 0.4115
ROC: 0.7057
f1: 0.5663
fold: 2, Score: 0.939697446029458, Run Time: 388.36
Accuracy: 0.9987
Precision: 0.8000
Recall: 0.4375
ROC: 0.7186
f1: 0.5657
fold: 3, Score: 0.9300445163775535, Run Time: 649.65
Accuracy: 0.9987
Precision: 0.8537
Recall: 0.3646
ROC: 0.6822
f1: 0.5109
fold: 4, Score: 0.9317823015557434, Run Time: 1174.89
Accuracy: 0.9988
Precision: 0.8400
Recall: 0.4375
ROC: 0.7187
f1: 0.5753
fold: 5, Score: 0.932840691634773, Run Time: 1973.15
Scores -> Adjusted: 0.92963618 , mean: 0.93307324, std: 0.00343706

=== Model Feature Importance ===
is_generated 0.0595
V4 0.05
V26 0.0485
V14 0.0455
V6 0.04
Amount 0.0395
Time 0.036
V12 0.034
V3 0.0315
V19 0.0315
V22 0.0315
V8 0.031
V16 0.031
V24 0.031
V_Min 0.03
V7 0.0295
V18 0.0295
V10 0.029
V_Std 0.0255
V1 0.0235
V21 0.

Unnamed: 0_level_0,pred_lgbm2
id,Unnamed: 1_level_1
0.0,0
1.0,0
2.0,0
3.0,0
4.0,0


Mean
Model Run Time: 4371.18
Model=lgbm1
{}
Accuracy: 0.9981
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 1, Score: 0.9193185005177742, Run Time: 174.30
Accuracy: 0.9981
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 2, Score: 0.9306246841708501, Run Time: 407.77
Accuracy: 0.9981
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 3, Score: 0.92031467456136, Run Time: 672.14
Accuracy: 0.9981
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 4, Score: 0.9239903304504864, Run Time: 987.51
Accuracy: 0.9981
Precision: 0.0000
Recall: 0.0000
ROC: 0.5000
f1: 0.0000
fold: 5, Score: 0.9210710594877147, Run Time: 1086.05
Scores -> Adjusted: 0.91897530 , mean: 0.92306385, std: 0.00408855

=== Model Feature Importance ===
V4 0.048763106993790085
V26 0.048254097526213985
V3 0.044182021785605215
V14 0.04072075740608775
V12 0.0367504835589942
Amount 0.03664868166547898
Time 0.035732464623842004
V24 0.03532525704978112
V15 0.03400183243408327
V7 0.

Unnamed: 0_level_0,pred_lgbm1
id,Unnamed: 1_level_1
0.0,0
1.0,0
2.0,0
3.0,0
4.0,0


Mean
Model Run Time: 3329.46
Model=cat1
{}
Accuracy: 0.9988
Precision: 0.9595
Recall: 0.3679
ROC: 0.6839
f1: 0.5318
fold: 1, Score: 0.9327163995985543, Run Time: 170.32
Accuracy: 0.9988
Precision: 0.9750
Recall: 0.4062
ROC: 0.7031
f1: 0.5735
fold: 2, Score: 0.938950432427059, Run Time: 343.79
Accuracy: 0.9988
Precision: 0.9318
Recall: 0.4271
ROC: 0.7135
f1: 0.5857
fold: 3, Score: 0.9297628071309045, Run Time: 534.43
Accuracy: 0.9987
Precision: 0.9583
Recall: 0.3594
ROC: 0.6797
f1: 0.5227
fold: 4, Score: 0.9265486501151481, Run Time: 937.36
Accuracy: 0.9989
Precision: 0.9529
Recall: 0.4219
ROC: 0.7109
f1: 0.5848
fold: 5, Score: 0.9303876828702553, Run Time: 1214.63
Scores -> Adjusted: 0.92753533 , mean: 0.93167319, std: 0.00413786

=== Model Feature Importance ===
is_generated 0.25916679370089013
V4 0.04753010886207308
V_Std 0.04441843470331207
V14 0.0438454338601277
Amount 0.04188130569042274
V26 0.0370107548049223
V_Range 0.03253600111090919
V8 0.031760408092350166
Time 0.029525542271

Unnamed: 0_level_0,pred_cat1
id,Unnamed: 1_level_1
0.0,0
1.0,0
2.0,0
3.0,0
4.0,0


Mean
Model Run Time: 3202.27
CPU times: user 4h 30min 41s, sys: 44.8 s, total: 4h 31min 26s
Wall time: 4h 11min 30s


In [50]:
# Leaderboard view: best cross-validation score first.
all_cv_scores.sort_values("Score", ascending=False)

Unnamed: 0,Model,Score,StdDev,RunTime
1,lgbm2,0.93,0.0,4371.18
3,cat1,0.93,0.0,3202.27
0,xgb1,0.93,0.0,4187.72
2,lgbm1,0.92,0.0,3329.46
