In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn import model_selection, preprocessing, metrics
import gc
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv


In [2]:
def memory_usage_mb(df, *args, **kwargs):
    """Dataframe memory usage in MB. """
    return df.memory_usage(*args, **kwargs).sum() / 1024**2

def reduce_memory_usage(df, deep=True, verbose=True):
    # All types that we want to change for "lighter" ones.
    # int8 and float16 are not include because we cannot reduce
    # those data types.
    # float32 is not include because float16 has too low precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = 0
    if verbose:
        start_mem = memory_usage_mb(df, deep=deep)

    for col, col_type in df.dtypes.iteritems():
        best_type = None
        if col_type in numeric2reduce:
            downcast = "integer" if "int" in str(col_type) else "float"
            df[col] = pd.to_numeric(df[col], downcast=downcast)
            best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type is not None and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")
    
    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        percent_mem = 100 * diff_mem / start_mem
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")
        
    return df

In [3]:
#############    LOAD TRAINING DATA     ###################

df_train_id = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')         # -- (144233, 41)
df_train_txn = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')     # -- (590540, 394)
df_train_id['IsTrain'] = True
df_train_txn['IsTrain'] = True

#############    LOAD TEST DATA     ###################

df_test_id = pd.read_csv('../input/ieee-fraud-detection/test_identity.csv')           # -- (141907, 41)
df_test_txn = pd.read_csv('../input/ieee-fraud-detection/test_transaction.csv')       # -- (506691, 393)
df_test_id['IsTrain'] = False
df_test_txn['IsTrain'] = False
df_test_txn['isFraud'] = False
print("Data load complete...")

Data load complete...


In [4]:
df_id = pd.concat([df_train_id, df_test_id], sort=True)                # -- (286140, 42)
df_txn = pd.concat([df_train_txn, df_test_txn], sort=True)

del df_train_id, df_train_txn
del df_test_id, df_test_txn
gc.collect()

print(df_id.shape)
print(df_txn.shape)

(286140, 42)
(1097231, 395)


In [5]:
###################    ID DATA   #########################

df_id.loc[df_id['id_30'].str.contains('Mac', na=False, case=False), 'id_30'] = 'mac'
df_id.loc[df_id['id_30'].str.contains('iOS', na=False, case=False), 'id_30'] = 'iOS'
df_id.loc[df_id['id_30'].str.contains('Android', na=False, case=False), 'id_30'] = 'android'
df_id.loc[df_id['id_30'].str.contains('Windows', na=False, case=False), 'id_30'] = 'Windows'
df_id.loc[df_id['id_30'].str.contains('Linux', na=False, case=False), 'id_30'] = 'Linux'

print(round(len(df_id.loc[df_id['id_30'].isna()])/len(df_id) *100,2), "% Missing values")
df_id.id_30.unique() 

48.2 % Missing values


array(['android', 'iOS', nan, 'mac', 'Windows', 'Linux', 'func', 'other'],
      dtype=object)

In [6]:
df_id['id_31'] = df_id['id_31'].str.replace('\d+', '').str.replace('\.', '').str.strip()
df_id.loc[df_id['id_31'].str.contains('Samsung', na=False, case=False), 'id_31'] = 'Samsung'
df_id.loc[df_id['id_31'].str.contains('safari', na=False, case=False), 'id_31'] = 'Safari'
df_id.loc[df_id['id_31'].str.contains('chrome', na=False, case=False), 'id_31'] = 'Chrome'
df_id.loc[df_id['id_31'].str.contains('google', na=False, case=False), 'id_31'] = 'Google'
df_id.loc[df_id['id_31'].str.contains('Firefox', na=False, case=False), 'id_31'] = 'Firefox'
df_id.loc[df_id['id_31'].str.contains('ie', na=False, case=False), 'id_31'] = 'IE'
df_id.loc[df_id['id_31'].str.contains('Android', na=False, case=False), 'id_31'] = 'Android'


print(round(len(df_id.loc[df_id['id_31'].isna()])/len(df_id) *100, 2), "% Missing values")
df_id['id_31'] = df_id['id_31'].replace({ np.nan:'missing'})

df_id.loc[df_id['id_31'].apply(lambda x:x not in ['missing','Samsung','Safari','Chrome','Google','Firefox','IE','Android' ]),'id_31'] = 'Other'
df_id.id_31.unique()

3.23 % Missing values


array(['Samsung', 'Safari', 'Chrome', 'missing', 'Other', 'Firefox', 'IE',
       'Android', 'Google'], dtype=object)

In [7]:
df_id['device_name'] = df_id['DeviceInfo'].str.split('/', expand=True)[0]

df_id.loc[df_id['device_name'].str.contains('SM', na=False, case=False), 'device_name'] = 'Samsung'
df_id.loc[df_id['device_name'].str.contains('SAMSUNG', na=False, case=False), 'device_name'] = 'Samsung'
df_id.loc[df_id['device_name'].str.contains('GT-', na=False, case=False), 'device_name'] = 'Samsung'
df_id.loc[df_id['device_name'].str.contains('Moto G', na=False, case=False), 'device_name'] = 'Motorola'
df_id.loc[df_id['device_name'].str.contains('Moto', na=False, case=False), 'device_name'] = 'Motorola'
df_id.loc[df_id['device_name'].str.contains('moto', na=False, case=False), 'device_name'] = 'Motorola'
df_id.loc[df_id['device_name'].str.contains('LG-', na=False, case=False), 'device_name'] = 'LG'
df_id.loc[df_id['device_name'].str.contains('rv:', na=False, case=False), 'device_name'] = 'RV'
df_id.loc[df_id['device_name'].str.contains('HUAWEI', na=False, case=False), 'device_name'] = 'Huawei'
df_id.loc[df_id['device_name'].str.contains('ALE-', na=False, case=False), 'device_name'] = 'Huawei'
df_id.loc[df_id['device_name'].str.contains('-L', na=False, case=False), 'device_name'] = 'Huawei'
df_id.loc[df_id['device_name'].str.contains('Blade', na=False, case=False), 'device_name'] = 'ZTE'
df_id.loc[df_id['device_name'].str.contains('BLADE', na=False, case=False), 'device_name'] = 'ZTE'
df_id.loc[df_id['device_name'].str.contains('Linux', na=False, case=False), 'device_name'] = 'Linux'
df_id.loc[df_id['device_name'].str.contains('XT', na=False, case=False), 'device_name'] = 'Sony'
df_id.loc[df_id['device_name'].str.contains('HTC', na=False, case=False), 'device_name'] = 'HTC'
df_id.loc[df_id['device_name'].str.contains('ASUS', na=False, case=False), 'device_name'] = 'Asus'

df_id.loc[df_id.device_name.isin(df_id.device_name.value_counts()[df_id.device_name.value_counts() < 200].index), 'device_name'] = "Others"

print(round(len(df_id.loc[df_id['device_name'].isna()])/len(df_id) *100, 2), "% Missing values")
df_id.device_name.unique() 

18.32 % Missing values


array(['Samsung', 'iOS Device', 'Windows', nan, 'MacOS', 'ZTE', 'Sony',
       'Others', 'RV', 'LG', 'Trident', 'Huawei', 'Motorola', 'HTC',
       'Linux'], dtype=object)

In [8]:
#drop columns which have more than 50% values missing in Train Id. 
print(df_id.isna().sum()/len(df_id) *100)
drop_idcols = [col for col in df_id.columns if df_id[col].isna().sum()/len(df_id) * 100 >= 50]
drop_idcols.append('DeviceInfo')
df_id.drop(drop_idcols,axis=1, inplace=True)
df_id.shape     

DeviceInfo       18.318655
DeviceType        2.935276
IsTrain           0.000000
TransactionID     0.000000
id_01             0.000000
id_02             2.897882
id_03            53.587405
id_04            53.587405
id_05             5.076186
id_06             5.076186
id_07            96.430419
id_08            96.430419
id_09            47.835325
id_10            47.835325
id_11             2.930034
id_12             0.000000
id_13             9.972042
id_14            47.088488
id_15             2.858042
id_16            10.852380
id_17             3.776124
id_18            66.454183
id_19             3.814916
id_20             3.930244
id_21            96.429021
id_22            96.424478
id_23            96.424478
id_24            96.684490
id_25            96.445446
id_26            96.431817
id_27            96.424478
id_28             2.930034
id_29             2.930034
id_30            48.198784
id_31             0.000000
id_32            48.187251
id_33            49.688963
i

(286140, 30)

In [9]:
# Replace missing values :   numeric with mean    categorical with 'missing'

num_cols = df_id.select_dtypes(exclude = 'object').columns
cat_cols = df_id.select_dtypes(include = 'object').columns
print("Numeric Columns : ",num_cols)
print("Categorical Columns : ", cat_cols)

print(len(num_cols))
print(len(cat_cols))

df_id[cat_cols] = df_id[cat_cols].replace({ np.nan:'missing'})
df_id[num_cols] = df_id[num_cols].replace({ np.nan:-1})

Numeric Columns :  Index(['IsTrain', 'TransactionID', 'id_01', 'id_02', 'id_05', 'id_06', 'id_09',
       'id_10', 'id_11', 'id_13', 'id_14', 'id_17', 'id_19', 'id_20', 'id_32'],
      dtype='object')
Categorical Columns :  Index(['DeviceType', 'id_12', 'id_15', 'id_16', 'id_28', 'id_29', 'id_30',
       'id_31', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'device_name'],
      dtype='object')
15
15


In [10]:
df_id.head(20)

Unnamed: 0,DeviceType,IsTrain,TransactionID,id_01,id_02,id_05,id_06,id_09,id_10,id_11,...,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,device_name
0,mobile,True,2987004,0.0,70787.0,-1.0,-1.0,-1.0,-1.0,100.0,...,android,Samsung,32.0,2220x1080,match_status:2,T,F,T,T,Samsung
1,mobile,True,2987008,-5.0,98945.0,0.0,-5.0,-1.0,-1.0,100.0,...,iOS,Safari,32.0,1334x750,match_status:1,T,F,F,T,iOS Device
2,desktop,True,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,100.0,...,missing,Chrome,-1.0,missing,missing,F,F,T,T,Windows
3,desktop,True,2987011,-5.0,221832.0,0.0,-6.0,-1.0,-1.0,100.0,...,missing,Chrome,-1.0,missing,missing,F,F,T,T,missing
4,desktop,True,2987016,0.0,7460.0,1.0,0.0,0.0,0.0,100.0,...,mac,Chrome,24.0,1280x800,match_status:2,T,F,T,T,MacOS
5,desktop,True,2987017,-5.0,61141.0,3.0,0.0,3.0,0.0,100.0,...,Windows,Chrome,24.0,1366x768,match_status:2,T,F,T,T,Windows
6,missing,True,2987022,-15.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,missing,missing,-1.0,missing,missing,missing,missing,missing,missing,missing
7,mobile,True,2987038,0.0,31964.0,0.0,-10.0,0.0,0.0,100.0,...,android,Chrome,32.0,1920x1080,match_status:2,T,F,T,T,missing
8,desktop,True,2987040,-10.0,116098.0,0.0,0.0,0.0,0.0,100.0,...,missing,Chrome,-1.0,missing,missing,F,F,T,T,Windows
9,desktop,True,2987048,-5.0,257037.0,0.0,0.0,-1.0,-1.0,100.0,...,missing,Chrome,-1.0,missing,missing,F,F,T,T,Windows


In [11]:
############    MERGE TRANSACTION AND ID DATA    #############
full_df = pd.merge(df_txn, df_id, on=['TransactionID','IsTrain'], how = 'left')
del df_txn, df_id
gc.collect()

18

In [12]:
#################        NEW FEATURES        #####################

full_df['TransactionAmt_decimal'] = ((full_df['TransactionAmt'] - full_df['TransactionAmt'].astype(int)) * 1000).astype(int)

full_df['uid'] = full_df['card1'].astype(str)+'_'+full_df['card2'].astype(str)
full_df['uid2'] = full_df['uid'].astype(str)+'_'+full_df['card3'].astype(str)+'_'+full_df['card5'].astype(str)
full_df['uid3'] = full_df['uid2'].astype(str)+'_'+full_df['addr1'].astype(str)+'_'+full_df['addr2'].astype(str)
print("Added TransactionAmt_decimal, uid, uid2, ui3")

Added TransactionAmt_decimal, uid, uid2, ui3


In [13]:
i_cols = ['card1','card2','card3','card5','uid','uid2','uid3']

for col in i_cols:
    for agg_type in ['mean','std']:
        new_col_name = col+'_TransactionAmt_'+agg_type
        temp_df = full_df[[col, 'TransactionAmt']]
        temp_df = temp_df.groupby([col])['TransactionAmt'].agg([agg_type]).reset_index().rename(
                                                columns={agg_type: new_col_name})

        temp_df.index = list(temp_df[col])
        temp_df = temp_df[new_col_name].to_dict()   

        full_df[new_col_name] = full_df[col].map(temp_df)
print("Added Mean, SD...")  

Added Mean, SD...


In [14]:
gc.collect()
reduce_memory_usage(full_df)

Column 'C1' converted from float64 to float32
Column 'C10' converted from float64 to float32
Column 'C11' converted from float64 to float32
Column 'C12' converted from float64 to float32
Column 'C13' converted from float64 to float32
Column 'C14' converted from float64 to float32
Column 'C2' converted from float64 to float32
Column 'C3' converted from float64 to float32
Column 'C4' converted from float64 to float32
Column 'C5' converted from float64 to float32
Column 'C6' converted from float64 to float32
Column 'C7' converted from float64 to float32
Column 'C8' converted from float64 to float32
Column 'C9' converted from float64 to float32
Column 'D1' converted from float64 to float32
Column 'D10' converted from float64 to float32
Column 'D11' converted from float64 to float32
Column 'D12' converted from float64 to float32
Column 'D13' converted from float64 to float32
Column 'D14' converted from float64 to float32
Column 'D15' converted from float64 to float32
Column 'D2' converted f

Unnamed: 0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,card3_TransactionAmt_mean,card3_TransactionAmt_std,card5_TransactionAmt_mean,card5_TransactionAmt_std,uid_TransactionAmt_mean,uid_TransactionAmt_std,uid2_TransactionAmt_mean,uid2_TransactionAmt_std,uid3_TransactionAmt_mean,uid3_TransactionAmt_std
0,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,147.653458,255.330368,185.236343,322.134460,257.916656,210.732864,257.916656,210.732864,193.000000,176.069595
1,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,147.653458,255.330368,212.793701,396.390228,213.629639,392.797211,213.629639,392.797211,239.981613,503.767456
2,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,147.653458,255.330368,98.774963,141.059906,104.827827,130.363129,104.827827,130.363129,65.685654,55.170208
3,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,147.653458,255.330368,124.389511,191.880905,120.967278,196.723221,120.967278,196.723221,113.086685,211.762115
4,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,147.653458,255.330368,212.793701,396.390228,99.811668,69.829735,99.811668,69.829735,50.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097226,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,40.112911,39.203312,109.730560,196.517548,36.284958,36.157349,36.284958,36.157349,40.029114,30.977364
1097227,1.0,2.0,1.0,1.0,3.0,1.0,3.0,0.0,1.0,0.0,...,40.112911,39.203312,109.730560,196.517548,34.741592,32.731827,34.741592,32.731827,34.096550,32.070923
1097228,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,147.653458,255.330368,141.865997,242.457291,137.141876,254.353577,137.141876,254.353577,128.512878,264.518860
1097229,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,147.653458,255.330368,109.730560,196.517548,87.250000,76.500000,87.250000,76.500000,202.000000,


In [15]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
num_cols = full_df.select_dtypes(include=numerics).columns        
cat_cols = [c for c in full_df.columns if (c not in num_cols)]    
print("Numeric columns : ", len(num_cols))
print("Categorical columns : ", len(cat_cols))

Numeric columns :  407
Categorical columns :  34


In [16]:
for f in cat_cols:
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(full_df[f].values))
    full_df[f] = lbl.transform(list(full_df[f].values))
print("Label Encoding complete...")

Label Encoding complete...


In [17]:
full_df['TransactionAmt'] = np.log1p(full_df['TransactionAmt'])

In [18]:
#########      SPLIT TRAIN AND TEST        ##################

df_train = full_df[full_df.IsTrain == True]
df_test = full_df[full_df.IsTrain == False]

del df_train['IsTrain']
del df_test['IsTrain']
del df_test['isFraud']
del full_df
print(df_train.shape)
print(df_test.shape)
gc.collect()

(590540, 440)
(506691, 439)


0

In [19]:
pd.options.display.max_columns = 550
pd.options.display.max_rows = 550
print(df_train.isna().sum()/len(df_train) *100)

#Add to the drop_cols list, the columns which have more than 90% values missing. 
drop_cols = [col for col in df_train.columns if df_train[col].isna().sum()/len(df_train) * 100 >= 90]
drop_cols.append('TransactionDT')
print(drop_cols)

#df_train.drop(drop_cols,axis=1, inplace=True)
#df_test.drop(drop_cols,axis=1, inplace=True)
df_train = df_train.drop(drop_cols, axis=1)
df_test = df_test.drop(drop_cols, axis=1)

C1                            0.000000
C10                           0.000000
C11                           0.000000
C12                           0.000000
C13                           0.000000
C14                           0.000000
C2                            0.000000
C3                            0.000000
C4                            0.000000
C5                            0.000000
C6                            0.000000
C7                            0.000000
C8                            0.000000
C9                            0.000000
D1                            0.214888
D10                          12.873302
D11                          47.293494
D12                          89.041047
D13                          89.509263
D14                          89.469469
D15                          15.090087
D2                           47.549192
D3                           44.514851
D4                           28.604667
D5                           52.467403
D6                       

In [20]:
num_cols = df_train.select_dtypes(exclude = 'object').columns
cat_cols = df_train.select_dtypes(include = 'object').columns

exc_cols = ['device_name', 'TransactionAmt_decimal', 'uid', 'uid2', 'uid3', 'card1_TransactionAmt_mean',
 'card1_TransactionAmt_std', 'card2_TransactionAmt_mean', 'card2_TransactionAmt_std', 'card3_TransactionAmt_mean',
 'card3_TransactionAmt_std', 'card5_TransactionAmt_mean', 'card5_TransactionAmt_std',
 'uid_TransactionAmt_mean', 'uid_TransactionAmt_std', 'uid2_TransactionAmt_mean',
 'uid2_TransactionAmt_std', 'uid3_TransactionAmt_mean', 'uid3_TransactionAmt_std']

incl_cols = [x for x in num_cols if x not in exc_cols]

correlationMatrix = df_train.loc[:, incl_cols].corr().abs()
#plt.figure(figsize=(20,20))
#heat = sns.heatmap(data=correlationMatrix)
#plt.title('Heatmap of Correlation')
na_vals = np.sum(df_train.loc[:,incl_cols]==-1)/df_train.shape[0]
goodNumericVars = []
for i_var in incl_cols:    
    if na_vals[i_var] < 1:        
        goodNumericVars.append(i_var)
corrThresh = 0.9
# Select upper triangle of correlation matrix
upper = correlationMatrix.where(np.triu(np.ones(correlationMatrix.shape), k=1).astype(np.bool))
to_drop = [column for column in upper.columns if any(upper[column] > corrThresh)]

In [21]:
df_train = df_train.drop(to_drop, axis=1)
df_test = df_test.drop(to_drop, axis=1)

In [22]:
num_cols = df_train.select_dtypes(exclude = 'object').columns
cat_cols = df_train.select_dtypes(include = 'object').columns
df_train[cat_cols] = df_train[cat_cols].replace({ np.nan:'missing'})
df_train[num_cols] = df_train[num_cols].replace({ np.nan:-1})

num_cols = df_test.select_dtypes(exclude = 'object').columns
cat_cols = df_test.select_dtypes(include = 'object').columns
df_test[cat_cols] = df_test[cat_cols].replace({ np.nan:'missing'})
df_test[num_cols] = df_test[num_cols].replace({ np.nan:-1})

In [23]:
train_y = df_train['isFraud'].values
del df_train['isFraud']
gc.collect()
print(df_train.shape)
print(df_test.shape)

(590540, 224)
(506691, 224)


In [24]:
############           MODEL      ##################

lgb_param = {'num_leaves': 100,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 10,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 17,
         "metric": 'auc',
         "nthreads": 4,
         "verbosity": -1}

In [25]:
%%time

NFOLDS = 5
folds = KFold(n_splits=NFOLDS)

columns = df_train.columns
splits = folds.split(df_train, train_y)
y_preds = np.zeros(df_test.shape[0])
y_oof = np.zeros(df_train.shape[0])
score = 0

feature_importances = pd.DataFrame()
feature_importances['feature'] = columns

for fold_n, (train_index, valid_index) in enumerate(splits):
    X_train, X_valid = df_train[columns].iloc[train_index], df_train[columns].iloc[valid_index]
    y_train, y_valid = train_y[train_index], train_y[valid_index]
    
    dtrain = lgb.Dataset(X_train, label=y_train)
    dvalid = lgb.Dataset(X_valid, label=y_valid)

    clf = lgb.train(lgb_param, dtrain, 10000, valid_sets = [dtrain, dvalid], verbose_eval=200, early_stopping_rounds=500)
    
    feature_importances[f'fold_{fold_n + 1}'] = clf.feature_importance()
    
    y_pred_valid = clf.predict(X_valid)
    y_oof[valid_index] = y_pred_valid
    print(f"Fold {fold_n + 1} | AUC: {roc_auc_score(y_valid, y_pred_valid)}")
    
    score += roc_auc_score(y_valid, y_pred_valid) / NFOLDS
    y_preds += clf.predict(df_test) / NFOLDS
    
    del X_train, X_valid, y_train, y_valid
    gc.collect()
    
print(f"\nMean AUC = {score}")
print(f"Out of folds AUC = {roc_auc_score(train_y, y_oof)}")

Training until validation scores don't improve for 500 rounds.
[200]	training's auc: 0.929208	valid_1's auc: 0.877469
[400]	training's auc: 0.951728	valid_1's auc: 0.893648
[600]	training's auc: 0.962737	valid_1's auc: 0.904166
[800]	training's auc: 0.970182	valid_1's auc: 0.910772
[1000]	training's auc: 0.975649	valid_1's auc: 0.914576
[1200]	training's auc: 0.979802	valid_1's auc: 0.916582
[1400]	training's auc: 0.982815	valid_1's auc: 0.918071
[1600]	training's auc: 0.985528	valid_1's auc: 0.919258
[1800]	training's auc: 0.988096	valid_1's auc: 0.920233
[2000]	training's auc: 0.990002	valid_1's auc: 0.92143
[2200]	training's auc: 0.991605	valid_1's auc: 0.921764
[2400]	training's auc: 0.992912	valid_1's auc: 0.922152
[2600]	training's auc: 0.993955	valid_1's auc: 0.922417
[2800]	training's auc: 0.994889	valid_1's auc: 0.922615
[3000]	training's auc: 0.99568	valid_1's auc: 0.922915
[3200]	training's auc: 0.996296	valid_1's auc: 0.922769
[3400]	training's auc: 0.996831	valid_1's auc: 

In [26]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')

In [27]:
sub['isFraud'] = y_preds

In [28]:
sub.head(100)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.001455
1,3663550,0.002278
2,3663551,0.001023
3,3663552,0.001176
4,3663553,0.002459
5,3663554,0.004841
6,3663555,0.009376
7,3663556,0.013725
8,3663557,0.000446
9,3663558,0.006985


In [29]:
sub.to_csv('lgbcv.csv', index=False)