### I only made minimal changes (4 lines of code added) to the original [AMEX catboost notebook](https://www.kaggle.com/code/huseyincot/amex-catboost-0-793) Please upvote the original one made by https://www.kaggle.com/huseyincot 

### The idea is simple. Kaggle Grandmasters [RADDAR](https://www.kaggle.com/code/raddar/the-data-has-random-uniform-noise-added/notebook) and [Chris](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327651) have observed that random noise has been added to data, whose magnitude is about `[0,0.01]`. Therefore we can simply round the data to the 2nd decimals to "reduce" this noise. It is helpful for tree models since they don't need to search for better splits within that noisy range `[0,0.01]` and thus reduces overfitting.

### CV score is improved to `0.7923` from `0.7905` and LB score is improved to `0.794` from `0.793`

# Import Libraries

In [1]:
import pandas as pd
import numpy as np
import cupy, cudf
import gc
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

Aggregated data are shared in dataset: https://www.kaggle.com/datasets/huseyincot/amex-agg-data-pickle
Data created with following code: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created

In [2]:
def read_file(path = '', usecols = None):
    # LOAD DATAFRAME
    if usecols is not None: df = pd.read_pickle(path)[usecols]
    else: df = pd.read_pickle(path)
    df = cudf.from_pandas(df)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    #df.S_2 = cudf.to_datetime( df.S_2 )
    # SORT BY CUSTOMER AND DATE (so agg('last') works correctly)
    #df = df.sort_values(['customer_ID','S_2'])
    #df = df.reset_index(drop=True)
    # FILL NAN
    df = df.fillna(-127) 
    print('shape of data:', df.shape)
    
    return df

In [3]:
features = pd.read_pickle('../input/amex-efv2/important_feature_60_2.pkl')
print(len(features))
features = [column for column in features if 'B_29' not in column]
print(len(features))

860
855


In [4]:
TRAIN_PATH = '../input/amex-bruteforce-features/train_v3_loaded.pkl'
train = read_file(path = TRAIN_PATH, usecols=features+['customer_ID','target'])
TEST_PATH = '../input/amex-bruteforce-features/test_v3_loaded.pkl'
test = read_file(path = TEST_PATH,usecols=features+['customer_ID'])

shape of data: (458913, 857)
shape of data: (924621, 856)


### The only change I made is the cell below.

In [5]:
for col in test.columns:
    if test[col].dtype=='float16':
        train[col] = train[col].astype('float32').round(decimals=2).astype('float16')
        test[col] = test[col].astype('float32').round(decimals=2).astype('float16')

# Competition Metric

In [6]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x == 0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x == 0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [7]:
train_y = pd.DataFrame(train["target"].to_pandas())
train_x = train[features].to_pandas()

# Model Training

In [8]:
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
]
cat_features = [f"{cf}_last" for cf in cat_features]
cat_features = cat_features.remove('D_68_last')

In [9]:
N_FOLDS = 5
oof = []
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=22)
y_oof = np.zeros(train_x.shape[0])
y_test = np.zeros(test.shape[0])
ix = 0
for train_ind, val_ind in skf.split(train_x, train_y):
    print(f"******* Fold {ix} ******* ")
    tr_x, val_x = (
        train_x.iloc[train_ind].reset_index(drop=True),
        train_x.iloc[val_ind].reset_index(drop=True),
    )
    tr_y, val_y = (
        train_y.iloc[train_ind].reset_index(drop=True),
        train_y.iloc[val_ind].reset_index(drop=True),
    )

    clf = CatBoostClassifier(iterations = 9999, random_state=22, early_stopping_rounds=200, task_type = 'GPU')
    clf.fit(tr_x, tr_y, eval_set=[(val_x, val_y)], cat_features=cat_features,  verbose=100)
    preds = clf.predict_proba(val_x)[:, 1]
    #y_oof[val_ind] = y_oof[val_ind] + preds
    clf.save_model(f'cat_boost_fold{ix}.cbm')
    #preds_test = clf.predict_proba(test[features].to_pandas())[:, 1]
    #y_test = y_test + preds_test / N_FOLDS
    ix = ix + 1
    
    del tr_x, val_x, tr_y, val_y, clf
    _ = gc.collect()
    
    df = train.loc[val_ind, ['customer_ID','target'] ].copy()
    df['prediction'] = preds
    oof.append( df )
    
    del preds, df
    _ = gc.collect()
    
oof = cudf.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
#val_score = amex_metric(train_y, oof)
#print(f"Amex metric: {val_score}")

******* Fold 0 ******* 




Learning rate set to 0.016616
0:	learn: 0.6723346	test: 0.6723945	best: 0.6723945 (0)	total: 39.1ms	remaining: 6m 31s
100:	learn: 0.2571320	test: 0.2592921	best: 0.2592921 (100)	total: 2.43s	remaining: 3m 58s
200:	learn: 0.2358722	test: 0.2388252	best: 0.2388252 (200)	total: 4.83s	remaining: 3m 55s
300:	learn: 0.2296410	test: 0.2331811	best: 0.2331811 (300)	total: 7.39s	remaining: 3m 58s
400:	learn: 0.2262711	test: 0.2302773	best: 0.2302773 (400)	total: 9.67s	remaining: 3m 51s
500:	learn: 0.2239257	test: 0.2284083	best: 0.2284083 (500)	total: 12s	remaining: 3m 46s
600:	learn: 0.2221616	test: 0.2270845	best: 0.2270845 (600)	total: 14.2s	remaining: 3m 42s
700:	learn: 0.2207466	test: 0.2260942	best: 0.2260942 (700)	total: 16.8s	remaining: 3m 43s
800:	learn: 0.2195188	test: 0.2252808	best: 0.2252808 (800)	total: 19.1s	remaining: 3m 39s
900:	learn: 0.2184766	test: 0.2246527	best: 0.2246527 (900)	total: 21.3s	remaining: 3m 35s
1000:	learn: 0.2175599	test: 0.2241616	best: 0.2241616 (1000)	tot



Learning rate set to 0.016616
0:	learn: 0.6723003	test: 0.6722925	best: 0.6722925 (0)	total: 25.5ms	remaining: 4m 15s
100:	learn: 0.2579035	test: 0.2568929	best: 0.2568929 (100)	total: 2.57s	remaining: 4m 12s
200:	learn: 0.2368730	test: 0.2362659	best: 0.2362659 (200)	total: 5.06s	remaining: 4m 6s
300:	learn: 0.2305542	test: 0.2303648	best: 0.2303648 (300)	total: 7.34s	remaining: 3m 56s
400:	learn: 0.2270700	test: 0.2272343	best: 0.2272343 (400)	total: 9.59s	remaining: 3m 49s
500:	learn: 0.2247595	test: 0.2252921	best: 0.2252921 (500)	total: 11.8s	remaining: 3m 44s
600:	learn: 0.2230414	test: 0.2239491	best: 0.2239491 (600)	total: 14.4s	remaining: 3m 45s
700:	learn: 0.2216244	test: 0.2229138	best: 0.2229138 (700)	total: 16.6s	remaining: 3m 40s
800:	learn: 0.2204110	test: 0.2221094	best: 0.2221094 (800)	total: 18.8s	remaining: 3m 36s
900:	learn: 0.2193387	test: 0.2214253	best: 0.2214253 (900)	total: 21s	remaining: 3m 32s
1000:	learn: 0.2184323	test: 0.2209182	best: 0.2209182 (1000)	tota



Learning rate set to 0.016616
0:	learn: 0.6723199	test: 0.6723011	best: 0.6723011 (0)	total: 25.5ms	remaining: 4m 14s
100:	learn: 0.2574919	test: 0.2588925	best: 0.2588925 (100)	total: 2.75s	remaining: 4m 29s
200:	learn: 0.2362985	test: 0.2383707	best: 0.2383707 (200)	total: 5.08s	remaining: 4m 7s
300:	learn: 0.2299593	test: 0.2326001	best: 0.2326001 (300)	total: 7.35s	remaining: 3m 56s
400:	learn: 0.2264917	test: 0.2296629	best: 0.2296629 (400)	total: 9.64s	remaining: 3m 50s
500:	learn: 0.2241372	test: 0.2277686	best: 0.2277686 (500)	total: 11.9s	remaining: 3m 46s
600:	learn: 0.2223772	test: 0.2264369	best: 0.2264369 (600)	total: 14.5s	remaining: 3m 46s
700:	learn: 0.2209605	test: 0.2254813	best: 0.2254813 (700)	total: 16.8s	remaining: 3m 42s
800:	learn: 0.2197428	test: 0.2246840	best: 0.2246840 (800)	total: 19s	remaining: 3m 38s
900:	learn: 0.2186639	test: 0.2240280	best: 0.2240280 (900)	total: 21.3s	remaining: 3m 35s
1000:	learn: 0.2176979	test: 0.2234974	best: 0.2234974 (1000)	tota



Learning rate set to 0.016616
0:	learn: 0.6718860	test: 0.6718654	best: 0.6718654 (0)	total: 26.9ms	remaining: 4m 29s
100:	learn: 0.2576900	test: 0.2588482	best: 0.2588482 (100)	total: 2.41s	remaining: 3m 55s
200:	learn: 0.2362336	test: 0.2379157	best: 0.2379157 (200)	total: 4.71s	remaining: 3m 49s
300:	learn: 0.2300865	test: 0.2320398	best: 0.2320398 (300)	total: 6.99s	remaining: 3m 45s
400:	learn: 0.2266600	test: 0.2289008	best: 0.2289008 (400)	total: 9.57s	remaining: 3m 49s
500:	learn: 0.2243332	test: 0.2269121	best: 0.2269121 (500)	total: 11.9s	remaining: 3m 45s
600:	learn: 0.2225633	test: 0.2255061	best: 0.2255061 (600)	total: 14.1s	remaining: 3m 41s
700:	learn: 0.2211756	test: 0.2244754	best: 0.2244754 (700)	total: 16.4s	remaining: 3m 37s
800:	learn: 0.2199497	test: 0.2236598	best: 0.2236598 (800)	total: 19s	remaining: 3m 37s
900:	learn: 0.2188883	test: 0.2229972	best: 0.2229972 (900)	total: 21.2s	remaining: 3m 34s
1000:	learn: 0.2179717	test: 0.2224868	best: 0.2224868 (1000)	tot



Learning rate set to 0.016616
0:	learn: 0.6722563	test: 0.6722496	best: 0.6722496 (0)	total: 25.9ms	remaining: 4m 18s
100:	learn: 0.2582535	test: 0.2576921	best: 0.2576921 (100)	total: 2.42s	remaining: 3m 56s
200:	learn: 0.2367914	test: 0.2363306	best: 0.2363306 (200)	total: 5.04s	remaining: 4m 5s
300:	learn: 0.2305092	test: 0.2303296	best: 0.2303296 (300)	total: 7.35s	remaining: 3m 56s
400:	learn: 0.2270112	test: 0.2272204	best: 0.2272204 (400)	total: 9.62s	remaining: 3m 50s
500:	learn: 0.2246597	test: 0.2252942	best: 0.2252942 (500)	total: 11.9s	remaining: 3m 45s
600:	learn: 0.2229248	test: 0.2239520	best: 0.2239520 (600)	total: 14.3s	remaining: 3m 43s
700:	learn: 0.2214586	test: 0.2228890	best: 0.2228890 (700)	total: 16.9s	remaining: 3m 43s
800:	learn: 0.2202492	test: 0.2221051	best: 0.2221051 (800)	total: 19.1s	remaining: 3m 39s
900:	learn: 0.2191711	test: 0.2214416	best: 0.2214416 (900)	total: 21.3s	remaining: 3m 35s
1000:	learn: 0.2182562	test: 0.2209431	best: 0.2209431 (1000)	to

In [10]:
oof_cat = pd.read_parquet('../input/amex-team-mizo/train_feature_preprocessed.parquet', columns=['customer_ID']).drop_duplicates()
oof_cat['customer_ID_hash'] = oof_cat['customer_ID'].apply(lambda x: int(x[-16:],16) ).astype('int64')
oof_cat = oof_cat.set_index('customer_ID_hash')
oof_cat = cudf.from_pandas(oof_cat)
oof_cat = oof_cat.merge(oof, left_index=True, right_index=True)
oof_cat = oof_cat.sort_index().reset_index(drop=True)
oof_cat.to_csv(f'oof_cat.csv',index=False)
oof_cat.head()

Unnamed: 0,customer_ID,target,prediction
0,20eac26171c3d251c55fc78204e59fab1c15fc2bc96d0c...,1,0.763292
1,aea50fdf9b974ccec95fa177c3225a0f913483b457de6e...,0,0.000724
2,32cd2d41aef737b69089882754395925c96eaee1f4a859...,0,0.001136
3,8daa6d5dc2655a8a437531e6b8b96829113cdfe9bf6cae...,0,0.02155
4,0ceba351a3851202542feb49d7385bcef32f6037fc57c7...,1,0.883265


# Submission

sub = pd.read_pickle(TEST_PATH)[['customer_ID']]
sub['prediction'] = y_test
# DISPLAY PREDICTIONS
sub.to_csv(f'submission_cat.csv',index=False)
print('Submission file shape is', sub.shape )
sub.head()