In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from tqdm import tqdm
import copy

import seaborn as sns
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import xgboost as xgb

In [2]:
z = ZipFile("../caltech-cs155-2020.zip")

dfs = {text_file.filename: pd.read_csv(z.open(text_file.filename))
       for text_file in z.infolist()
       if text_file.filename.endswith('.csv')}
dfs.keys()

dict_keys(['sample_submission.csv', 'test.csv', 'train.csv'])

In [3]:
df_train = dfs['train.csv']
df_test = dfs['test.csv']
df_train.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [4]:
# Remove all rows with NaN to have it cleaner
array_train = df_train.to_numpy()
array_no_NaN = array_train[~np.isnan(array_train).any(axis = 1)]
array_no_NaN

array([[1.00000e+00, 3.84280e+03, 3.84340e+03, ..., 1.00000e+00,
        1.30000e+01, 0.00000e+00],
       [2.00000e+00, 3.84400e+03, 3.84430e+03, ..., 4.00000e+00,
        9.00000e+00, 0.00000e+00],
       [3.00000e+00, 3.84380e+03, 3.84340e+03, ..., 2.00000e+00,
        1.10000e+01, 1.00000e+00],
       ...,
       [5.92374e+05, 4.10940e+03, 4.10980e+03, ..., 1.00000e+01,
        7.00000e+00, 1.00000e+00],
       [5.92375e+05, 4.11020e+03, 4.11030e+03, ..., 7.00000e+00,
        7.00000e+00, 1.00000e+00],
       [5.92376e+05, 4.10940e+03, 4.11050e+03, ..., 7.00000e+00,
        5.00000e+00, 0.00000e+00]])

In [5]:
# Normalize feature vectors
norm_train = np.empty_like(array_train)

for i in range(28):
    col_max = max(array_no_NaN[:, i])
    col_min = 0
    norm_train[:, i] = (array_train[:, i] - col_min) / (col_max - col_min)

In [6]:
array_train = df_train.to_numpy()
# Make a copy of the column, so we don't change the original
vector_3 = copy.deepcopy(norm_train[:,3])
vector_4 = copy.deepcopy(norm_train[:,4])

# Train the first imputer
vector_3 = np.reshape(vector_3, (-1,1))
imp_v3 = IterativeImputer(max_iter=100, random_state=0)
imp_v3.fit(vector_3)
vector_3_trafo = imp_v3.transform(vector_3)
vector_3_trafo = vector_3_trafo.flatten()

# Train the second imputer
vector_4 = np.reshape(vector_4, (-1,1))
imp_v4 = IterativeImputer(max_iter=100, random_state=0)
imp_v4.fit(vector_4)
vector_4_trafo = imp_v4.transform(vector_4)
vector_4_trafo = vector_4_trafo.flatten()

# Put back into the normalized array
transform_train = copy.deepcopy(norm_train)
transform_train[:,3] = vector_3_trafo
transform_train[:,4] = vector_4_trafo

In [7]:
def trafo_to_df(array, df):
    key_list = list(df.keys())
    key_list.pop()
    df_dict = {}
    
    for i in range(27):
        temp = {key_list[i] : array[:,i]}
        df_dict.update(temp)
        
    new_df = pd.DataFrame(data = df_dict)
    
    return new_df

In [8]:
df_train = trafo_to_df(transform_train[:,:-1], df_train)
df_test_dict = {'y' : transform_train[:,-1]}
df_train_y = pd.DataFrame(data = df_test_dict)
df_train.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid1vol,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol
0,0.0,0.933573,0.933576,0.017507,0.009265,0.407115,0.0,0.933573,0.933521,0.933609,...,0.072727,0.007194,0.044118,0.117647,0.05,0.046154,0.007576,0.007519,0.074627,0.014815
1,2e-06,0.93367,0.933771,0.075,0.231132,0.217391,-0.86,0.933719,0.933716,0.933755,...,0.063636,0.043165,0.080882,0.008403,0.05,0.007692,0.030303,0.030075,0.007463,0.096296
2,3e-06,0.933962,0.933989,0.0875,0.363208,0.332016,-1.38,0.933913,0.93391,0.933949,...,0.027273,0.007194,0.029412,0.176471,0.1,0.007692,0.121212,0.075188,0.029851,0.066667
3,5e-06,0.933913,0.933771,0.0375,0.160377,0.146245,-0.6,0.933719,0.933716,0.933755,...,0.090909,0.093525,0.088235,0.016807,0.033333,0.015385,0.05303,0.007519,0.014925,0.081481
4,7e-06,0.933767,0.933698,0.0375,0.179245,0.162055,-0.7,0.93367,0.933618,0.933657,...,0.127273,0.086331,0.014706,0.016807,0.033333,0.007692,0.022727,0.007519,0.08209,0.111111


In [9]:
#splitting into training and validation sets 
X_train, X_validate, y_train, y_validate = train_test_split(df_train, df_train_y, test_size=0.3, shuffle = False)

In [10]:
xgb3 = xgb.XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=2,
    min_child_weight=1,
    gamma=0.0,
    subsample=0.6,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    seed=28
)

eval_set = [(X_train, y_train), (X_validate, y_validate)]


In [11]:
xgb3.fit(X_train, y_train, eval_metric ='auc',eval_set=eval_set, early_stopping_rounds=10)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[0]	validation_0-auc:0.609272	validation_1-auc:0.600676
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.621907	validation_1-auc:0.612379
[2]	validation_0-auc:0.625179	validation_1-auc:0.61549
[3]	validation_0-auc:0.629316	validation_1-auc:0.619372
[4]	validation_0-auc:0.63185	validation_1-auc:0.622059
[5]	validation_0-auc:0.632793	validation_1-auc:0.6232
[6]	validation_0-auc:0.634091	validation_1-auc:0.624233
[7]	validation_0-auc:0.634067	validation_1-auc:0.623894
[8]	validation_0-auc:0.634121	validation_1-auc:0.624131
[9]	validation_0-auc:0.634283	validation_1-auc:0.624161
[10]	validation_0-auc:0.634837	validation_1-auc:0.62474
[11]	validation_0-auc:0.63519	validation_1-auc:0.625092
[12]	validation_0-auc:0.635547	validation_1-auc:0.625457
[13]	validation_0-auc:0.636354	validation_1-auc:0.626143
[14]	validation_0-auc:0.636908	validation_1-auc:0.626692
[15]

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0.0,
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=1000, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=28,
              silent=None, subsample=0.6, verbosity=1)

In [12]:
#predict on validation set
ypred_xgb3 = xgb3.predict_proba(X_validate)[:,1]

In [13]:
#calculate AUC on validation set
roc = sklearn.metrics.roc_auc_score(y_validate, ypred_xgb3)
print("AUC: %.4f%% " % (roc * 100))

AUC: 63.4726% 


In [14]:
#predict on test set
predictions = xgb3.predict_proba(df_test)

# Kaggle needs the submission to have a certain format;
submission = pd.DataFrame({ 'id': df_test.index,
                            'Predicted': predictions[:,1]})

In [15]:
#is the format correct?
submission.head()

Unnamed: 0,id,Predicted
0,0,0.505949
1,1,0.415898
2,2,0.415898
3,3,0.508599
4,4,0.287998


In [16]:
submission.to_csv("submission_vector.csv", index=False)