# XGBoost + Risk-based Weighted Predictions

**Trick:** Adjust predictions based on classifier certainty: take the trade only when the predicted probability of a positive `resp` exceeds 0.49.

In [1]:
# Load modules
import numpy as np
import pandas as pd
import xgboost as xgb
import janestreet

In [15]:
import gc # garbage collection
import joblib

## Load dataset

In [2]:
# Read the training CSV into a DataFrame and report its dimensions
train = pd.read_csv(filepath_or_buffer='../../input/train.csv')
print(f'Done loading data. Train shape is {train.shape}')

Done loading data. Train shape is (2390491, 138)


In [5]:
# Preview the first rows of the raw training frame (before any filtering)
train.head()

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,0.009916,0.014079,0.008773,0.00139,0.00627,1,-1.872746,-2.191242,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1
2,0,0.0,0.025134,0.027607,0.033406,0.03438,0.02397,-1,0.81278,-0.256156,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.0,-0.00473,-0.003273,-0.000461,-0.000476,-0.0032,-1,1.174378,0.34464,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4


In [6]:
# Keep only the rows whose weight is non-zero; these are the only rows used for training
train = train.loc[train['weight'].ne(0)]

## Define experiment settings

In [7]:
# Experiment settings
TARGET = 'resp'       # column that is binarised into the training label
NAN_VALUE = -999      # sentinel used to fill missing feature values

# Every column whose name contains the substring 'feature', in frame order
FEATURES = train.columns[train.columns.str.contains('feature', regex=False)].tolist()

# Largest sample weight among the rows kept for training
MAX_WEIGHT = train['weight'].max()

## Preprocessing 

In [8]:
# Feature matrix: the selected feature columns with missing values replaced by the sentinel
X = train[FEATURES].fillna(NAN_VALUE)

In [9]:
# Binary label: 1 where the target column is strictly positive, 0 otherwise
y = train[TARGET].gt(0).astype(int)

In [10]:
# Clear memory: X and y have already been derived from `train`, so drop the
# frame and run a garbage-collection pass (the cell displays gc.collect()'s
# return value).
del train
gc.collect()

100

## Train model

In [12]:
# Hyper-parameters taken from:
# https://www.kaggle.com/hamditarek/market-prediction-xgboost-with-gpu-fit-in-1min
xgb_params = {
    'n_estimators': 500,
    'max_depth': 11,
    'learning_rate': 0.05,
    'subsample': 0.9,
    'colsample_bytree': 0.7,
    'missing': NAN_VALUE,   # same sentinel that was used to fill NaNs in X
    'random_state': 2020,
    'tree_method': 'hist',
    'n_jobs': 10,
}

# Fit the binary classifier on the full (non-zero-weight) training set
model = xgb.XGBClassifier(**xgb_params)
model.fit(X, y)
print('Finished training model')

Finished training model


In [16]:
# Persist the fitted classifier to disk (the cell displays joblib.dump's return value)
joblib.dump(value=model, filename="../../models/xgboost_v0.joblib.dat")

['../../models/xgboost_v0.joblib.dat']

In [13]:
# Release the training matrices now that the model is fitted and saved,
# then run a garbage-collection pass
del X
del y
gc.collect()

87

## Predict


In [17]:
# Create submission using time-series API (from janestreet module)
env = janestreet.make_env()
iter_test = env.iter_test()

# Minimum predicted probability of the positive class (resp > 0) required to
# take the trade. Kept slightly below 0.5 on purpose, as in the original run.
ACTION_THRESHOLD = 0.49

for test_df, sample_prediction_df in iter_test:
    # Read the weight straight from its column; `test_df.iloc[0]` would first
    # materialise the whole row as a Series on every iteration.
    test_weight = test_df['weight'].iat[0]

    # Default is to pass. The model was trained only on rows with non-zero
    # weight, so zero-weight rows are never scored.
    action = 0
    if test_weight > 0:
        features = test_df.loc[:, FEATURES].fillna(NAN_VALUE)
        # Probability of the positive class for the first row of the batch
        proba = model.predict_proba(features)[0, 1]
        action = int(proba > ACTION_THRESHOLD)

    # Item assignment always writes the column; attribute assignment would
    # silently set a plain attribute if the 'action' column were missing.
    sample_prediction_df['action'] = action
    env.predict(sample_prediction_df)