# How to use pretrained models for prediction

## Checks

In [7]:
# Load modules
import numpy as np
import pandas as pd
import xgboost as xgb

# Test packages
print("Numpy version:",np.__version__)
print("Pandas verions:", pd.__version__)
print("XGBoost version:", xgb.__version__)


assert np.__version__ == '1.18.5'
assert pd.__version__ == '1.1.4'
assert xgb.__version__ == '1.2.1'

Numpy version: 1.18.5
Pandas verions: 1.1.4
XGBoost version: 1.2.1


In [8]:
from tqdm import tqdm
import janestreet

## Load saved model from file

In [10]:
import joblib
model = joblib.load("../models/xgboost_v0.joblib.dat")
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=11,
              min_child_weight=1, missing=-999, monotone_constraints='()',
              n_estimators=500, n_jobs=10, num_parallel_tree=1,
              random_state=2020, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.9, tree_method='hist', validate_parameters=1,
              verbosity=2)

In [27]:
# Define required parameters
NAN_VALUE = -999
FEATURES = ["feature_"+str(i) for i in range(0,129+1)]

## Submit using time-series API (from janestreet module)

In [13]:
env = janestreet.make_env()
iter_test = env.iter_test()

In [53]:
for (test_df, sample_prediction_df) in tqdm(iter_test):

    if test_df["weight"].item() >0:

        test_df.fillna(NAN_VALUE, inplace=True)
        preds = model.predict_proba(test_df[FEATURES])
        action = 1 if preds[0,1]>0.53 else 0        
    
    else:
        action = 0    
        
    assert type(action)==int
    assert action in [0,1]
    
    sample_prediction_df.action = action
    env.predict(sample_prediction_df)

15219it [03:01, 84.08it/s] 


In [54]:
nb_of_predictions, nb_of_passes

(23554, 6884)

## Sanity checks and Helpers

In [28]:
## Helpers
test = pd.read_csv("../input/jane-street-market-prediction/example_test.csv")
test.head()

Unnamed: 0,date,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id
0,0,0.0,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,...,,1.168391,8.313583,1.782433,14.018213,2.653056,12.600292,2.301488,11.445807,0
1,0,16.673515,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,...,,-1.17885,1.777472,-0.915458,2.831612,-1.41701,2.297459,-1.304614,1.898684,1
2,0,0.0,-1,0.81278,-0.256156,0.806463,0.400221,-0.614188,-0.3548,,...,,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2
3,0,0.0,-1,1.174378,0.34464,0.066872,0.009357,-1.006373,-0.676458,,...,,2.838853,0.499251,3.033732,1.513488,4.397532,1.266037,3.856384,1.013469,3
4,0,0.138531,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.14378,,...,,0.34485,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4


In [29]:
# Create an example instance of test_df (janestreet API)
test_df_temp = test.iloc[[150]].copy()
test_df_temp.drop("ts_id", axis=1, inplace = True)
test_df_temp

Unnamed: 0,date,weight,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_120,feature_121,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129
150,0,0.414218,1,-0.223407,-1.835836,-2.203604,-0.844399,-2.142812,-0.827847,,...,2.482895,-1.923482,1.196138,-2.499977,2.039261,-2.410264,2.54504,-1.955018,1.936977,-1.933912


### Check if test_df_temp has weight==0

In [44]:
test_df_temp["weight"].item() >0 

True

### Deal with NAs

In [34]:
print("Number of NAs before:", test_df_temp.isna().sum().sum())

test_df_temp.fillna(NAN_VALUE, inplace=True)
print("Number of NAs after:", test_df_temp.isna().sum().sum())

Number of NAs before: 14
Number of NAs after: 0


In [38]:
# Get predicion basd on trained ML model
preds = model.predict_proba(test_df_temp[FEATURES])
print("Predictions are:", preds)


[[0.40316224 0.59683776]]


In [39]:
# Generate action based on prediction
action = 1 if preds[0,1]>0.5 else 0

assert type(action)==int
assert action in [0,1]

print(action)

1


In [None]:
## Collect all the code here and test it for all dataset!

# Count the total number of predictions and passes
nb_of_predictions = 0
nb_of_passes = 0

for i in tqdm(range(len(test))):
    
    test_df_temp = test.iloc[[i]].copy()
    test_df_temp.drop("ts_id", axis=1, inplace = True)
    test_df_temp


    if test_df_temp["weight"].item() >0:

        test_df_temp.fillna(NAN_VALUE, inplace=True)
        preds = model.predict_proba(test_df_temp[FEATURES])
        action = 1 if preds[0,1]>0.5 else 0

        nb_of_predictions += 1
        
    else:
        action = 0
        
        nb_of_passes += 1
        
        
    assert type(action)==int
    assert action in [0,1]
    
    

In [52]:
# Test results
print((test["weight"]==0).value_counts())
nb_of_predictions, nb_of_passes

False    11777
True      3442
Name: weight, dtype: int64


(11777, 3442)