In [1]:
!pip install xgboost
!pip install pandas
!pip install numpy
!pip install lightgbm
!pip install catboost
!pip install tensorflow



In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor as cat
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.metrics import root_mean_squared_error

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [3]:
df_train = pd.read_csv('data/train_data.csv', low_memory=False)
df_train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,first_active,year_of_first_active,quarter_of_first_active,month_of_first_active,...,purchase_on_weekend_mode,purchase_hour_mean,purchase_hour_min,purchase_hour_max,purchase_hour_nunique,purchase_hour_mode,purchase_time_of_day_nunique,purchase_time_of_day_mode,purchase_during_holiday_nunique,purchase_during_holiday_mode
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283,2017-06-01,2017,2,6,...,False,13.176871,0,23,23,14,4,Afternoon,1,False
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913,2017-01-01,2017,1,1,...,False,14.757412,0,23,24,12,4,Afternoon,1,False
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056,2016-08-01,2016,3,8,...,False,17.955556,8,23,14,19,4,Evening,1,False
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495,2017-09-01,2017,3,9,...,False,14.572917,0,23,17,15,4,Afternoon,1,False
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749,2017-11-01,2017,4,11,...,False,13.530726,0,23,22,11,4,Afternoon,1,False


In [4]:
for feature in df_train.columns.tolist():
    if df_train[feature].dtype == 'int64':
        df_train[feature] = df_train[feature].astype('int16')
    elif df_train[feature].dtype == 'float64':
        df_train[feature] = df_train[feature].astype('float32')
    else:
        df_train[feature] = df_train[feature].astype('category')

df_train.columns.tolist()

['first_active_month',
 'card_id',
 'feature_1',
 'feature_2',
 'feature_3',
 'target',
 'first_active',
 'year_of_first_active',
 'quarter_of_first_active',
 'month_of_first_active',
 'active_days',
 'authorized_flag_nunique_x',
 'authorized_flag_mode_x',
 'card_id_nunique_x',
 'installments_sum_x',
 'installments_mean_x',
 'installments_min_x',
 'installments_max_x',
 'installments_var_x',
 'installments_std_x',
 'installments_nunique_x',
 'installments_mode_x',
 'category_3_nunique_x',
 'category_3_mode_x',
 'merchant_id_nunique_x',
 'merchant_id_mode_x',
 'month_lag_sum_x',
 'month_lag_mean_x',
 'month_lag_min_x',
 'month_lag_max_x',
 'month_lag_var_x',
 'month_lag_std_x',
 'month_lag_nunique_x',
 'month_lag_mode_x',
 'purchase_amount_sum_x',
 'purchase_amount_mean_x',
 'purchase_amount_min_x',
 'purchase_amount_max_x',
 'purchase_amount_var_x',
 'purchase_amount_std_x',
 'purchase_amount_nunique_x',
 'purchase_amount_mode_x',
 'merchant_group_id_nunique_x',
 'merchant_group_id_mod

In [5]:
X = df_train.drop(columns=['card_id','target'], axis=1)
y = df_train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

### XGBoost

In [6]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', 
                         enable_categorical=True,
                         eval_metric='rmse',
                         n_estimators=100,
                         early_stopping_rounds=10,
                         random_state=42)

xgb_model.fit(X_train, y_train, verbose=False, eval_set=[(X_val, y_val)])
xgb_pred = xgb_model.predict(X_val)
xgb_rmse = root_mean_squared_error(y_val, xgb_pred)

print(f'RMSE on Val: {xgb_rmse}')

booster = xgb_model.get_booster()
importance_type = "gain"
feature_importance_dict = booster.get_score(importance_type=importance_type)

# Convert to a DataFrame for plotting
importance_df = pd.DataFrame(
    feature_importance_dict.items(), columns=["Feature", "Importance"]
).sort_values(by="Importance", ascending=False)

importance_df

RMSE on Val: 3.8932440280914307


Unnamed: 0,Feature,Importance
41,days_since_purchase_max_y,17783.251953
45,merchant_id_mode,13740.938477
39,avg_sales_lag3_var_y,12953.607422
35,merchant_id_mode_y,10286.734375
37,purchase_amount_max_y,7786.691406
7,merchant_id_mode_x,5602.385254
25,days_since_purchase_min_x,5509.007324
52,purchase_week_of_year_nunique,5246.651367
50,days_since_purchase_min,5140.391602
36,month_lag_min_y,4531.645508


In [7]:
xgb_model2 = xgb.XGBRegressor(objective='reg:squarederror', 
                         enable_categorical=True,
                         eval_metric='rmse',
                         n_estimators=100,
                         early_stopping_rounds=10,
                         random_state=42)

# Greedy Search for best feature
include = [f for f in X.columns.tolist()]
best_features = []
best_rmse = 10
best_model = None
best_pred = None

while include:
    best_feature = ''
    
    for feat in include:
        xgb_model2.fit(X_train[best_features+[feat]], y_train, verbose=False, eval_set=[(X_val[best_features+[feat]], y_val)])
        y_pred = xgb_model2.predict(X_val[best_features+[feat]])
        
        rmse = root_mean_squared_error(y_val, y_pred)
    
        if rmse < best_rmse:
            best_rmse = rmse
            best_feature = feat
            best_model = xgb_model2
            best_pred = y_pred

    if best_feature == '':
        print('No better features found')
        break
        
    best_features.append(best_feature)
    include.remove(best_feature)
    
    print(f'Best selected: {best_feature}: {best_rmse}')

print(f'Features selected: {best_features}')

Best selected: days_since_purchase_max_y: 3.8279342651367188
Best selected: month_lag_nunique: 3.8091516494750977
Best selected: days_since_purchase_min: 3.7853448390960693
Best selected: installments_sum: 3.751058578491211
Best selected: purchase_quarter_min: 3.7439029216766357
Best selected: purchase_amount_max_y: 3.741180181503296
Best selected: month_lag_max: 3.7319743633270264
Best selected: purchase_amount_mean: 3.7232236862182617
Best selected: avg_sales_lag12_sum: 3.720961809158325
Best selected: state_id_nunique_y: 3.7168679237365723
Best selected: purchase_quarter_max: 3.7133147716522217
Best selected: active_months_lag3_max_y: 3.7133145332336426
No better features found
Features selected: ['days_since_purchase_max_y', 'month_lag_nunique', 'days_since_purchase_min', 'installments_sum', 'purchase_quarter_min', 'purchase_amount_max_y', 'month_lag_max', 'purchase_amount_mean', 'avg_sales_lag12_sum', 'state_id_nunique_y', 'purchase_quarter_max', 'active_months_lag3_max_y']


### LGBM

In [8]:
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'max_depth': -1,
    'verbose': 0,
    'early_stopping_round': 10
}

lgb_model = lgb.train(params, train_data, valid_sets=[val_data])
lgb_pred = lgb_model.predict(X_val)
lgb_rmse = root_mean_squared_error(y_val, lgb_pred)

print(f'\nRMSE on Val Set: {lgb_rmse}')

feature_importances = lgb_model.feature_importance(importance_type='split')  # 'split' or 'gain'
feature_names = lgb_model.feature_name()

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values(by='importance', ascending=False)

importance_df


RMSE on Val Set: 3.7343982052660802


Unnamed: 0,feature,importance
23,merchant_id_mode_x,125
211,merchant_id_mode_y,105
342,days_since_purchase_min_y,70
399,merchant_id_mode,67
0,first_active_month,60
...,...,...
387,card_id_nunique,0
173,purchase_month_max_x,0
391,installments_max,0
393,installments_std,0


### CatBoost

In [9]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
import pandas as pd

# Specify categorical features
categorical_features = X_train.select_dtypes(include='category').columns.tolist()
print(categorical_features)

# Prepare data
train_data = Pool(data=X_train, label=y_train, cat_features=categorical_features)
val_data = Pool(data=X_val, label=y_val, cat_features=categorical_features)

# Define parameters
params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 8,
    'loss_function': 'RMSE',
    'verbose': 100,
    'early_stopping_rounds': 100
}

# Train CatBoost model
cat_model = CatBoostRegressor(**params)
cat_model.fit(train_data, eval_set=val_data, use_best_model=True)

# Make predictions
cat_pred = cat_model.predict(X_val)
cat_rmse = mean_squared_error(y_val, cat_pred, squared=False)  # RMSE

print(f'\nRMSE on Val Set: {cat_rmse}')

# Feature importance
feature_importances = cat_model.get_feature_importance(type='FeatureImportance')
feature_names = X_train.columns  # Replace with your feature names if not a DataFrame

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importances
}).sort_values(by='importance', ascending=False)

print(importance_df)



['first_active_month', 'first_active', 'authorized_flag_mode_x', 'category_3_mode_x', 'merchant_id_mode_x', 'category_1_mode_x', 'most_recent_sales_range_mode_x', 'most_recent_purchases_range_mode_x', 'category_4_mode_x', 'purchase_on_weekend_mode_x', 'purchase_time_of_day_mode_x', 'purchase_during_holiday_mode_x', 'authorized_flag_mode_y', 'category_3_mode_y', 'merchant_id_mode_y', 'category_1_mode_y', 'most_recent_sales_range_mode_y', 'most_recent_purchases_range_mode_y', 'category_4_mode_y', 'purchase_on_weekend_mode_y', 'purchase_time_of_day_mode_y', 'purchase_during_holiday_mode_y', 'authorized_flag_mode', 'category_3_mode', 'merchant_id_mode', 'category_1_mode', 'most_recent_sales_range_mode', 'most_recent_purchases_range_mode', 'category_4_mode', 'purchase_on_weekend_mode', 'purchase_time_of_day_mode', 'purchase_during_holiday_mode']
0:	learn: 3.8323282	test: 3.8652803	best: 3.8652803 (0)	total: 420ms	remaining: 7m
100:	learn: 3.6149981	test: 3.7201936	best: 3.7201936 (100)	tota



In [10]:
pred = np.mean([best_pred, lgb_pred, cat_pred], axis=0)
rmse = root_mean_squared_error(y_val, pred)
print(f'RMSE on Val: {rmse}')

X_val['xgb'] = best_pred
X_val['lgb'] = lgb_pred
X_val['pred_mean'] = pred
X_val['cat'] = cat_pred
X_val.head()

RMSE on Val: 3.698377451684167


  X_val['xgb'] = best_pred
  X_val['lgb'] = lgb_pred
  X_val['pred_mean'] = pred
  X_val['cat'] = cat_pred


Unnamed: 0,first_active_month,feature_1,feature_2,feature_3,first_active,year_of_first_active,quarter_of_first_active,month_of_first_active,active_days,authorized_flag_nunique_x,...,purchase_hour_nunique,purchase_hour_mode,purchase_time_of_day_nunique,purchase_time_of_day_mode,purchase_during_holiday_nunique,purchase_during_holiday_mode,xgb,lgb,pred_mean,cat
165789,2015-08,3,1,1,2015-08-01,2015,3,8,915,2,...,12,11,3,Afternoon,1,False,-0.40852,0.008261,0.004431,0.413553
43663,2017-12,2,1,0,2017-12-01,2017,4,12,62,1,...,15,15,4,Afternoon,1,False,-0.132362,-0.030541,0.004384,0.176055
201089,2015-01,3,3,1,2015-01-01,2015,1,1,1127,2,...,17,12,4,Afternoon,1,False,-0.093767,-0.123224,-0.152097,-0.2393
107580,2017-08,1,1,0,2017-08-01,2017,3,8,184,2,...,15,11,4,Evening,1,False,-0.09952,-0.327525,-0.119178,0.069511
163887,2016-05,5,1,1,2016-05-01,2016,2,5,641,2,...,20,12,4,Afternoon,1,False,-0.424292,-0.358561,-0.345512,-0.253681


In [11]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_val[['xgb','lgb','cat','pred_mean']], y_val, test_size=0.2, random_state=42)

In [12]:
"""# Define the model
input_layer = Input(shape=(4,), name="InputLayer")
x = Dense(64, activation="relu")(input_layer)
x = Dense(32, activation="relu")(x)
output_layer = Dense(1, activation="linear")(x)  # Single output for regression

# Create the model
meta_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
meta_model.compile(optimizer="adam", loss="mse")

# Train the model
meta_model.fit(X_train, y_train, epochs=30, batch_size=32)

# Predict the output on the test set
y_pred = meta_model.predict(X_test)

# Calculate the RMSE (Root Mean Squared Error)
rmse = root_mean_squared_error(y_test, y_pred)

print(f"RMSE on the val set: {rmse}")"""

'# Define the model\ninput_layer = Input(shape=(4,), name="InputLayer")\nx = Dense(64, activation="relu")(input_layer)\nx = Dense(32, activation="relu")(x)\noutput_layer = Dense(1, activation="linear")(x)  # Single output for regression\n\n# Create the model\nmeta_model = Model(inputs=input_layer, outputs=output_layer)\n\n# Compile the model\nmeta_model.compile(optimizer="adam", loss="mse")\n\n# Train the model\nmeta_model.fit(X_train, y_train, epochs=30, batch_size=32)\n\n# Predict the output on the test set\ny_pred = meta_model.predict(X_test)\n\n# Calculate the RMSE (Root Mean Squared Error)\nrmse = root_mean_squared_error(y_test, y_pred)\n\nprint(f"RMSE on the val set: {rmse}")'

In [13]:
"""from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Attention, Concatenate, Reshape, Flatten
from sklearn.metrics import mean_squared_error
import numpy as np

# Custom RMSE Metric
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Input Layer
input_layer = Input(shape=(4,), name="InputLayer")

# Reshape input to 3D (batch_size, timesteps, features) for Attention
reshaped_input = Reshape((1, 4))(input_layer)  # 1 timestep, 4 features

# Attention Mechanism
query = Dense(64, activation="relu", name="QueryLayer")(reshaped_input)
key = Dense(64, activation="relu", name="KeyLayer")(reshaped_input)
value = Dense(64, activation="relu", name="ValueLayer")(reshaped_input)
attention_output = Attention(name="AttentionLayer")([query, key, value])

# Concatenate Attention Output with Original Input
concatenated = Concatenate(name="ConcatLayer")([attention_output, reshaped_input])

# Flatten the concatenated output
flattened = Flatten()(concatenated)

# Add Dense Layers with Dropout and Batch Normalization
x = Dense(128, activation="relu")(flattened)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)  # Dropout with 30% rate
x = Dense(64, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.3)(x)
x = Dense(32, activation="relu")(x)

# Output Layer
output_layer = Dense(1, activation="linear")(x)  # Single output for regression

# Create the Model
meta_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the Model
meta_model.compile(optimizer="adam", loss="mse")

# Model Summary
meta_model.summary()

# Train the Model
meta_model.fit(X_train, y_train, epochs=30, batch_size=32)

# Predict the Output on the Test Set
y_pred = meta_model.predict(X_test)

# Calculate the RMSE (Root Mean Squared Error)
rmse = root_mean_squared_error(y_test, y_pred)

print(f"RMSE on the Test Set: {rmse}")"""



'from tensorflow.keras.models import Model\nfrom tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Attention, Concatenate, Reshape, Flatten\nfrom sklearn.metrics import mean_squared_error\nimport numpy as np\n\n# Custom RMSE Metric\ndef root_mean_squared_error(y_true, y_pred):\n    return np.sqrt(mean_squared_error(y_true, y_pred))\n\n# Input Layer\ninput_layer = Input(shape=(4,), name="InputLayer")\n\n# Reshape input to 3D (batch_size, timesteps, features) for Attention\nreshaped_input = Reshape((1, 4))(input_layer)  # 1 timestep, 4 features\n\n# Attention Mechanism\nquery = Dense(64, activation="relu", name="QueryLayer")(reshaped_input)\nkey = Dense(64, activation="relu", name="KeyLayer")(reshaped_input)\nvalue = Dense(64, activation="relu", name="ValueLayer")(reshaped_input)\nattention_output = Attention(name="AttentionLayer")([query, key, value])\n\n# Concatenate Attention Output with Original Input\nconcatenated = Concatenate(name="ConcatLayer")([attention_

In [79]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Dense, Dropout, BatchNormalization, MultiHeadAttention, Concatenate, Reshape, Flatten
)
from sklearn.metrics import mean_squared_error
import numpy as np

# Custom RMSE Metric
def root_mean_squared_error(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Input Layer
input_layer = Input(shape=(4,), name="InputLayer")

# Reshape input to 3D (batch_size, timesteps, features) for Multi-Head Attention
reshaped_input = Reshape((1, 4))(input_layer)  # 1 timestep, 4 features

# Multi-Head Attention Mechanism
# Key, Query, Value are derived from reshaped_input
multi_head_attention = MultiHeadAttention(num_heads=4, key_dim=64, name="MultiHeadAttention")
attention_output = multi_head_attention(query=reshaped_input, value=reshaped_input, key=reshaped_input)

# Concatenate Attention Output with Original Input
concatenated = Concatenate(name="ConcatLayer")([attention_output, reshaped_input])

# Flatten the concatenated output
flattened = Flatten()(concatenated)

# Add Dense Layers with Dropout and Batch Normalization
x = Dense(128, activation="relu")(flattened)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)  # Dropout with 20% rate
x = Dense(64, activation="relu")(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(32, activation="relu")(x)

# Additional Layers
x = Dense(64, activation="relu")(x)  # Additional Dense Layer
x = BatchNormalization()(x)  # Batch Normalization for added layer
x = Dropout(0.2)(x)  # Dropout with 20% rate

x = Dense(16, activation="relu")(x)  # Another additional Dense Layer
x = BatchNormalization()(x)
x = Dropout(0.2)(x)

# Output Layer
output_layer = Dense(1, activation="linear")(x)  # Single output for regression

# Create the Model
meta_model = Model(inputs=input_layer, outputs=output_layer)

# Compile the Model
meta_model.compile(optimizer="adam", loss="mse")

# Model Summary
meta_model.summary()

# Train the Model
meta_model.fit(X_train, y_train, epochs=50, batch_size=64)

# Predict the Output on the Test Set
y_pred = meta_model.predict(X_test)

# Calculate the RMSE (Root Mean Squared Error)
rmse = root_mean_squared_error(y_test, y_pred)

print(f"RMSE on the Test Set: {rmse}")


Epoch 1/50




[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 14.7874
Epoch 2/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13.9241
Epoch 3/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14.5465
Epoch 4/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13.5334
Epoch 5/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13.9421
Epoch 6/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 14.3703
Epoch 7/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13.8971
Epoch 8/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 15.2928
Epoch 9/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 13.8861
Epoch 10/50
[1m505/505[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss



[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
RMSE on the Test Set: 3.7029199600219727




### Test Set

In [80]:
df_test = pd.read_csv('data/test_data.csv', low_memory=False)
df_test.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,first_active,year_of_first_active,quarter_of_first_active,month_of_first_active,active_days,...,purchase_on_weekend_mode,purchase_hour_mean,purchase_hour_min,purchase_hour_max,purchase_hour_nunique,purchase_hour_mode,purchase_time_of_day_nunique,purchase_time_of_day_mode,purchase_during_holiday_nunique,purchase_during_holiday_mode
0,2017-04,C_ID_0ab67a22ab,3,3,1,2017-04-01,2017,2,4,306,...,False,14.337838,0,23,17,22,4,Night,1,False
1,2017-01,C_ID_130fd0cbdd,2,3,0,2017-01-01,2017,1,1,396,...,False,14.568182,0,23,18,11,4,Afternoon,1,False
2,2017-08,C_ID_b709037bc5,5,1,1,2017-08-01,2017,3,8,184,...,False,16.533333,13,20,7,14,2,Evening,1,False
3,2017-12,C_ID_d27d835a9f,2,1,0,2017-12-01,2017,4,12,62,...,False,18.0,8,22,11,21,4,Evening,1,False
4,2015-12,C_ID_2b5e3df5c2,5,1,1,2015-12-01,2015,4,12,793,...,False,14.210084,0,23,22,15,4,Afternoon,1,False


In [81]:
for feature in df_test.columns.tolist():
    if df_test[feature].dtype == 'int64':
        df_test[feature] = df_test[feature].astype('int16')
    elif df_test[feature].dtype == 'float64':
        df_test[feature] = df_test[feature].astype('float32')
    else:
        df_test[feature] = df_test[feature].astype('category')

df_test.columns

Index(['first_active_month', 'card_id', 'feature_1', 'feature_2', 'feature_3',
       'first_active', 'year_of_first_active', 'quarter_of_first_active',
       'month_of_first_active', 'active_days',
       ...
       'purchase_on_weekend_mode', 'purchase_hour_mean', 'purchase_hour_min',
       'purchase_hour_max', 'purchase_hour_nunique', 'purchase_hour_mode',
       'purchase_time_of_day_nunique', 'purchase_time_of_day_mode',
       'purchase_during_holiday_nunique', 'purchase_during_holiday_mode'],
      dtype='object', length=574)

In [82]:
df_test.dtypes

first_active_month                 category
card_id                            category
feature_1                             int16
feature_2                             int16
feature_3                             int16
                                     ...   
purchase_hour_mode                    int16
purchase_time_of_day_nunique          int16
purchase_time_of_day_mode          category
purchase_during_holiday_nunique       int16
purchase_during_holiday_mode       category
Length: 574, dtype: object

In [83]:
xgb_data3 = df_test[best_features+['purchase_during_holiday_mode']]
xgb_pred3 = best_model.predict(xgb_data3)

In [84]:
lgb_data2 = df_test.drop(columns=['card_id'], axis=1)
lgb_pred2 = lgb_model.predict(lgb_data2)

In [85]:
cat_data2 = df_test.drop(columns=['card_id'], axis=1)
test_pool = Pool(data=cat_data2, cat_features=categorical_features)
cat_pred2 = cat_model.predict(test_pool)

In [87]:
pred = np.mean([xgb_pred3, lgb_pred2, cat_pred2], axis=0)

df = pd.DataFrame()
df['xgb'] = xgb_pred3
df['lgb'] = lgb_pred2
df['cat'] = cat_pred2
df['mean'] = pred
df

Unnamed: 0,xgb,lgb,cat,mean
0,-0.615482,-0.958475,-1.790599,-1.121519
1,-0.313667,-0.144092,-0.225891,-0.227883
2,-1.561265,-0.851351,-1.508112,-1.306909
3,0.000134,-0.076248,-0.103656,-0.059923
4,-1.235940,-0.893994,-1.253594,-1.127842
...,...,...,...,...
123618,0.262617,0.195063,0.775515,0.411065
123619,-0.500095,-0.320422,-0.594967,-0.471828
123620,0.018136,0.067259,0.620989,0.235461
123621,-3.443068,-3.719795,-2.367840,-3.176901


In [88]:
test_set = df_test[['card_id']]
test_set['target'] = meta_model.predict(df)
test_set.head()

[1m3864/3864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 691us/step


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['target'] = meta_model.predict(df)


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-1.415512
1,C_ID_130fd0cbdd,-0.148409
2,C_ID_b709037bc5,-1.661893
3,C_ID_d27d835a9f,-0.035243
4,C_ID_2b5e3df5c2,-1.401163


In [89]:
test_set.to_csv('data/test_submission.csv', index=False)