In [None]:
# Library import
import numpy as np
import pandas as pd
import os
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score, accuracy_score, matthews_corrcoef
from sklearn.utils.class_weight import compute_class_weight
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Load the five competition parquet files from the Kaggle input directory.
path = '/kaggle/input/amex-data/'
train, test, events, transactions, offers = (
    pd.read_parquet(source, engine='pyarrow')
    for source in (
        f"{path}train_data.parquet",
        f"{path}test_data.parquet",
        f"{path}add_event.parquet",
        f"{path}add_trans.parquet",
        f"{path}offer_metadata.parquet",
    )
)

In [3]:
# Cast the join keys to consistent dtypes; unparseable values become <NA>/NaT
# (errors='coerce') instead of raising.
for frame in (train, test, events):
    frame['id3'] = pd.to_numeric(frame['id3'], errors='coerce').astype('Int64')
for frame in (train, test):
    frame['id4'] = pd.to_datetime(frame['id4'], errors='coerce')

In [None]:
# Collapse the one-hot block f226..f309 into a single integer column
# ('compressed_info' = position of the largest value within the block),
# then drop the 84 source columns.
one_hot_cols = [f'f{i}' for i in range(226,310)]

for frame in (train, test):
    # Missing flags are treated as 0 before taking the argmax.
    frame[one_hot_cols] = frame[one_hot_cols].fillna(0)
    # NOTE(review): a row whose flags are all 0 also yields 0, i.e. it is
    # indistinguishable from a row where f226 is the active flag - confirm
    # that this is acceptable.
    frame['compressed_info'] = frame[one_hot_cols].to_numpy().argmax(axis=1)
    frame.drop(columns=one_hot_cols, inplace=True)

In [None]:
# Feature 1: median delay in seconds between id4 and id7 per id3, computed
# from event rows that have a non-null id7 and a non-negative delay.
for ts_col in ('id4', 'id7'):
    events[ts_col] = pd.to_datetime(events[ts_col], errors='coerce')

clicked = events.dropna(subset=['id7']).copy()
clicked['click_delay'] = (clicked['id7'] - clicked['id4']).dt.total_seconds()
valid_delay = clicked['click_delay'].notnull() & (clicked['click_delay'] >= 0)
clicked = clicked[valid_delay]

# The column is called "avg_..." but the statistic is the median.
avg_click_delay = (
    clicked.groupby('id3')['click_delay']
    .median()
    .rename('avg_click_delay_per_offer')
    .reset_index()
)
train = train.merge(avg_click_delay, on='id3', how='left')
test = test.merge(avg_click_delay, on='id3', how='left')

In [None]:
encoders = {}

# Number of most frequent id8 values that keep their own label; every other
# id8 value is bucketed under ID8_OTHER_LABEL.
ID8_TOP_N = 4
ID8_OTHER_LABEL = '99999999'


def _add_offer_category_feature(col):
    """Label-encode ``offers[col]`` and attach its per-id3 mean code to train/test.

    Only non-null values of ``offers[col]`` are encoded; the codes are written
    to ``offers[f'{col}_enc']`` (null rows stay NaN). The mean code per ``id3``
    is mapped onto ``train`` and ``test`` as ``f'{col}_enc_mean'``. The fitted
    encoder is stored in ``encoders[col]``.

    Returns:
        (encoder, mean_by_id3): the fitted LabelEncoder and the per-id3 mean Series.
    """
    present = offers[col].notnull()
    encoder = LabelEncoder()
    offers.loc[present, f'{col}_enc'] = encoder.fit_transform(offers.loc[present, col])
    encoders[col] = encoder
    mean_by_id3 = offers.groupby('id3')[f'{col}_enc'].mean()
    # NOTE(review): train/test 'id3' were cast to Int64 earlier but offers['id3']
    # is used as loaded - confirm the dtypes match, otherwise every lookup
    # below silently returns NaN.
    train[f'{col}_enc_mean'] = train['id3'].map(mean_by_id3)
    test[f'{col}_enc_mean'] = test['id3'].map(mean_by_id3)
    return encoder, mean_by_id3


# Features 2-5: f378, f375, f376, f374 (same recipe for each column).
le_378, f378_mean_by_id3 = _add_offer_category_feature('f378')
le_375, f375_mean_by_id3 = _add_offer_category_feature('f375')
le_376, f376_mean_by_id3 = _add_offer_category_feature('f376')
le_374, f374_mean_by_id3 = _add_offer_category_feature('f374')

# Feature 6: id8 (with top N filtering)
# astype(str) turns missing values into a literal string, so they take part in
# the frequency ranking like any other value.
offers['id8'] = offers['id8'].astype(str)
top_id8_values = offers['id8'].value_counts().nlargest(ID8_TOP_N).index
offers['id8_mod'] = offers['id8'].where(offers['id8'].isin(top_id8_values), ID8_OTHER_LABEL)
le_id8 = LabelEncoder()
offers['id8_encoded'] = le_id8.fit_transform(offers['id8_mod'])
encoders['id8'] = le_id8
id8_avg_by_id3 = offers.groupby('id3')['id8_encoded'].mean()
train['id8_encoded_avg'] = train['id3'].map(id8_avg_by_id3)
test['id8_encoded_avg'] = test['id3'].map(id8_avg_by_id3)

In [None]:
# Feature 7: bring the id2 recorded in events onto train/test, matched on
# (id3, id4). Only the first event row per (id3, id4) pair is kept so the
# left merge cannot multiply rows. Because train/test already have an 'id2'
# column, the events copy arrives as 'id2_from_events'.
events_subset = events.loc[:, ['id2', 'id3', 'id4']].drop_duplicates(subset=['id3', 'id4'])
event_join = dict(on=['id3', 'id4'], how='left', suffixes=('', '_from_events'))
train = train.merge(events_subset, **event_join)
test = test.merge(events_subset, **event_join)

In [8]:
def safe_mode(series):
    """Return the first entry of ``series.mode()``, or NaN when the mode is empty.

    ``Series.mode()`` ignores missing values, so an empty or all-NaN series
    yields NaN here.
    """
    candidates = series.mode()
    if candidates.empty:
        return np.nan
    return candidates.iloc[0]

# Per-id2 transaction summary: mean / sum / count of f367 and the number of
# distinct f368 and f369 values. The key is renamed to 'id2_transactions' so
# it cannot collide with the 'id2' column already present in train/test.
agg_transactions = (
    transactions.groupby('id2')
    .agg(
        f367_mean=('f367', 'mean'),
        f367_sum=('f367', 'sum'),
        num_transactions=('f367', 'count'),
        num_products=('f368', 'nunique'),
        num_debit_credit_types=('f369', 'nunique'),
    )
    .reset_index()
    .rename(columns={'id2': 'id2_transactions'})
)

# Attach the summary through the id2 recovered from events, then discard the
# helper key again.
transaction_join = dict(left_on='id2_from_events', right_on='id2_transactions', how='left')
train = train.merge(agg_transactions, **transaction_join).drop(columns='id2_transactions')
test = test.merge(agg_transactions, **transaction_join).drop(columns='id2_transactions')

In [None]:
# Keep only columns whose share of missing values in train is at most 80%;
# the same column set (minus the target 'y', if present) is applied to test.
threshold = 0.8
missing_ratio = train.isna().mean()
columns_to_keep = missing_ratio.loc[missing_ratio.le(threshold)].index
train = train[columns_to_keep]
test = test[columns_to_keep.drop('y', errors='ignore')]

In [10]:
# Columns that must not be used as model inputs: the target, identifiers/keys,
# and two engineered columns that are deliberately left out of the feature set.
NON_FEATURE_COLS = ['y', 'id1', 'id2', 'id3', 'id4', 'id5', 'id8_encoded_avg', 'id2_from_events']


def _to_numeric_or_keep(column):
    """Return ``column`` converted with ``pd.to_numeric``, or unchanged if it cannot be parsed.

    Explicit replacement for the deprecated ``pd.to_numeric(..., errors='ignore')``.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


y = train['y'].astype(int)
# errors='ignore': the engineered columns may already have been removed by the
# missing-value filter, in which case there is nothing to drop.
X = train.drop(columns=NON_FEATURE_COLS, errors='ignore')
X = X.apply(_to_numeric_or_keep)

In [None]:
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.difference(cat_cols).tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)

encoders = {}


def _encode_with_codes(frame, col, code_of):
    """Replace ``frame[col]`` by its integer code as float; unknown or missing values become NaN.

    ``Series.map`` with a dict returns NaN for keys that are absent, which covers
    both categories unseen during fitting and values that were already missing.
    """
    frame[col] = frame[col].map(code_of).astype(float)


for col in cat_cols:
    # Fit on the training split only so validation/test categories cannot leak in.
    le = LabelEncoder()
    le.fit(X_train.loc[X_train[col].notnull(), col])
    encoders[col] = le

    # One dict lookup per value instead of calling le.transform() row by row
    # (which rescans le.classes_ for every single element).
    code_of = {label: code for code, label in enumerate(le.classes_)}
    for frame in (X_train, X_test, test):
        _encode_with_codes(frame, col, code_of)

In [None]:
# Mean-impute the numeric columns; the means are learned from X_train only
# and then applied unchanged to X_test and test.
imputer = SimpleImputer(strategy='mean').fit(X_train[numerical_cols])


def _imputed_frame(frame):
    """Return the imputed numeric columns of ``frame`` as a DataFrame with the same index."""
    return pd.DataFrame(imputer.transform(frame[numerical_cols]), columns=numerical_cols, index=frame.index)


X_train_imputed = _imputed_frame(X_train)
X_test_imputed = _imputed_frame(X_test)
X_train[numerical_cols] = X_train_imputed
X_test[numerical_cols] = X_test_imputed
test[numerical_cols] = imputer.transform(test[numerical_cols])

In [None]:
# Fill missing categorical codes with the most frequent code seen in X_train.
cat_imputer = SimpleImputer(strategy='most_frequent')
# Guard: fitting an imputer on a frame with zero columns raises, so skip the
# step entirely when there are no categorical features.
if cat_cols:
    X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
    X_test[cat_cols] = cat_imputer.transform(X_test[cat_cols])
    test[cat_cols] = cat_imputer.transform(test[cat_cols])

In [None]:
# Min-max scale the numeric columns (despite the variable name this is a
# MinMaxScaler, not a StandardScaler). The range is learned from X_train only.
scaler_standard = MinMaxScaler().fit(X_train[numerical_cols])
for frame in (X_train, X_test, test):
    frame[numerical_cols] = scaler_standard.transform(frame[numerical_cols])

In [16]:
# Shape of the training feature matrix after preprocessing (before PCA).
X_train.shape

(577623, 255)

In [None]:
from sklearn.decomposition import PCA

# Remember which columns the PCA is fitted on: X_train/X_test are turned into
# plain arrays below, and `test` still carries identifier and other non-feature
# columns that must not be passed to the transformer.
feature_cols = list(X_train.columns)

# Keep as many components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)
# Select exactly the fitted feature columns, in the fitted order; a missing
# column raises a KeyError here instead of a feature-mismatch error inside PCA.
test = pca.transform(test[feature_cols])

In [18]:
# Shape of the training matrix after the PCA projection.
X_train.shape

(577623, 22)

In [19]:
# 'balanced' class weights computed from the training labels, keyed by class label.
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}

# Arguments shared by the scikit-learn style estimators.
weighted = {"class_weight": class_weight_dict, "random_state": 42}
# XGBoost takes a single ratio (weight of class 1 over weight of class 0)
# instead of a per-class weight mapping.
xgb_pos_weight = class_weight_dict[1] / class_weight_dict[0]

# Candidate models, all seeded with 42; insertion order is the training order.
models = {}
models["CatBoost"] = CatBoostClassifier(verbose=0, random_state=42, class_weights=class_weight_dict)
models["LightGBM"] = LGBMClassifier(**weighted)
models["XGBoost"] = XGBClassifier(use_label_encoder=False, scale_pos_weight=xgb_pos_weight, random_state=42)
models["Decision Tree"] = DecisionTreeClassifier(**weighted)
models["Random Forest"] = RandomForestClassifier(n_estimators=200, **weighted)
models["Extra Trees"] = ExtraTreesClassifier(n_estimators=200, **weighted)

In [20]:
# Models whose hold-out precision is below this value get zero ensemble weight.
ENSEMBLE_MIN_PRECISION = 0.9
# Probability cut-off used to turn the blended probability into a class label.
ENSEMBLE_DECISION_THRESHOLD = 0.5


def _metrics_row(name, y_true, y_hat, y_score):
    """Build one row of the comparison table for a model.

    Args:
        name: label shown in the "Model" column.
        y_true: ground-truth labels.
        y_hat: hard class predictions.
        y_score: predicted probability of class 1.

    Returns:
        dict with accuracy, class-1 F1/precision/recall, ROC AUC, PR AUC and MCC.
    """
    report = classification_report(y_true, y_hat, output_dict=True)
    return {
        "Model": name,
        "Accuracy": accuracy_score(y_true, y_hat),
        "F1-Score": report["1"]["f1-score"],
        "Precision": report["1"]["precision"],
        "Recall": report["1"]["recall"],
        "ROC AUC": roc_auc_score(y_true, y_score),
        "PR AUC": average_precision_score(y_true, y_score),
        "MCC": matthews_corrcoef(y_true, y_hat),
    }


results = []
probas = {}

for name, model in models.items():
    print('='*40, name, ' model start', '='*40, sep='')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    # Keep the hold-out probabilities so the ensemble does not have to score
    # every model a second time.
    probas[name] = y_proba
    results.append(_metrics_row(name, y_test, y_pred, y_proba))
    print('='*40, name, ' model end', '='*40, sep='')

results_initial = pd.DataFrame(results).sort_values(by="Precision", ascending=False).reset_index(drop=True)
print('='*40,'Ensemble model start', '='*40, sep='')

# Precision-weighted blend restricted to models above the precision cut.
# NOTE(review): the weights are chosen on the same hold-out split that the
# ensemble is then scored on, so the ensemble row is optimistically biased.
results_initial['custom_weight'] = results_initial['Precision'].where(
    results_initial['Precision'] >= ENSEMBLE_MIN_PRECISION, 0)
if results_initial['custom_weight'].sum() == 0:
    # No model cleared the cut: normalising would divide 0 by 0 and produce NaN
    # weights, so fall back to weighting every model by its precision.
    results_initial['custom_weight'] = results_initial['Precision']
if results_initial['custom_weight'].sum() == 0:
    # Every precision is 0 as well: use a plain average.
    results_initial['custom_weight'] = 1.0
weights = (results_initial.set_index('Model')['custom_weight'] / results_initial['custom_weight'].sum()).to_dict()

ensemble_proba = sum(weights[name] * probas[name] for name in models)
ensemble_pred = (ensemble_proba >= ENSEMBLE_DECISION_THRESHOLD).astype(int)

results.append(_metrics_row("Ensemble", y_test, ensemble_pred, ensemble_proba))
print('='*40,'Ensemble model end', '='*40, sep='')
results_df = pd.DataFrame(results).sort_values(by="Precision", ascending=False).reset_index(drop=True)

[LightGBM] [Info] Number of positive: 27788, number of negative: 549835
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.081171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5610
[LightGBM] [Info] Number of data points in the train set: 577623, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


In [21]:
# Model comparison table, sorted by Precision (highest first).
results_df

Unnamed: 0,Model,Accuracy,F1-Score,Precision,Recall,ROC AUC,PR AUC,MCC
0,Ensemble,0.980617,0.750934,0.983395,0.607363,0.992392,0.920302,0.764822
1,Extra Trees,0.980352,0.746872,0.982225,0.602505,0.991021,0.91422,0.761178
2,Random Forest,0.980311,0.746811,0.979159,0.603584,0.991853,0.91183,0.760604
3,Decision Tree,0.974873,0.72957,0.756462,0.704523,0.84653,0.547161,0.71691
4,CatBoost,0.971731,0.764444,0.637966,0.953471,0.995648,0.943204,0.767204
5,XGBoost,0.953023,0.663317,0.506192,0.961891,0.993518,0.916792,0.679007
6,LightGBM,0.919456,0.538232,0.371613,0.97571,0.988784,0.857148,0.574809


In [22]:
# Normalised ensemble weight per model.
weights

{'Extra Trees': 0.5007813868633098,
 'Random Forest': 0.49921861313669025,
 'Decision Tree': 0.0,
 'CatBoost': 0.0,
 'XGBoost': 0.0,
 'LightGBM': 0.0}

In [27]:
def _positive_class_proba(model, features):
    """Return a 1-D probability vector from ``model.predict_proba(features)``.

    For a 2-D result the second column (class 1) is used; if the model exposes
    only one column, that column is returned. A 1-D result is passed through.
    """
    proba = np.asarray(model.predict_proba(features))
    if proba.ndim == 2:
        return proba[:, 1] if proba.shape[1] > 1 else proba[:, 0]
    return proba


# The top row of results_df is the entry with the highest hold-out precision.
best_model_name = results_df.loc[0, 'Model']
print(f"Best model based on Precision: {best_model_name}")

if best_model_name == 'Ensemble':
    # Only score models that actually contribute; a zero weight adds nothing
    # to the blend. (`!= 0` keeps NaN weights visible instead of hiding them.)
    test_probas = {
        name: _positive_class_proba(model, test)
        for name, model in models.items()
        if weights.get(name, 0) != 0
    }
    preds_proba = np.zeros(len(test))
    for name, model_proba in test_probas.items():
        preds_proba += weights[name] * model_proba
else:
    best_model = models[best_model_name]
    preds_proba = _positive_class_proba(best_model, test)

if len(preds_proba) != len(test):
    raise ValueError(f"Prediction array length ({len(preds_proba)}) doesn't match test data length ({len(test)})")

preds_proba = np.asarray(preds_proba).flatten()
# `test` may be a DataFrame or the array returned by an earlier transform.
output_df = test.copy() if isinstance(test, pd.DataFrame) else pd.DataFrame(test)
output_df['Predicted Probability'] = preds_proba

# Re-read the raw test file to recover the identifier columns. Predictions are
# attached by position, which relies on the feature pipeline having preserved
# the original row order and row count.
test_df = pd.read_parquet(f"{path}test_data.parquet")
test_df['pred'] = output_df['Predicted Probability'].values
test_df = test_df[['id1', 'id2', 'id3', 'id5', 'pred']]

Best model based on Precision: Ensemble


In [28]:
# Preview the first rows of the submission frame (ids plus predicted probability).
test_df.head()

Unnamed: 0,id1,id2,id3,id5,pred
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04,0.074961
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04,0.157512
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05,0.382691
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04,0.095008
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05,0.225023


In [29]:
# Write the submission CSV (without the DataFrame index) to the Kaggle working directory.
test_df.to_csv("/kaggle/working/r2_submission_PredictPulse.csv", index=False)