In [63]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, AdaBoostClassifier, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold
# Decision-tree and ensemble estimators are imported above.

# Seed NumPy's global RNG, which np.random.permutation draws from for the hold-out split.
np.random.seed(69);

In [64]:
# Data Preprocessing

In [65]:
# Read the training CSV into a DataFrame and preview its first five rows.
train_df = pd.read_csv('train.csv')
train_df.head()

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1,6,14,6,6,1,1,10,2,1
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,6,11,1,6,1,4,4,1,13,0
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,4,21,12,1,16,10,4,9,0
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,13,12,2,4,2,7,1,2,11,1
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,12,2,2,4,1,3,1,11,15,1


In [66]:
# Split the training frame into NumPy arrays:
#   X0 - every column except the last (features, including the id column)
#   Y0 - the last column (the label `y`)
X0 = train_df.values[:, :-1]
Y0 = train_df.values[:, -1]
N, d = X0.shape  # number of rows, number of raw feature columns
X0.shape, Y0.shape

((592380, 27), (592380,))

In [67]:
# Report which feature columns contain at least one NaN.
# In the training data only columns 3 and 4 (opened_position_qty and
# closed_position_qty) do; how to handle them is still an open decision.
print(f'Cols where train data is Nan: {np.where(np.isnan(X0).any(axis=0))[0]}')

Cols where train data is Nan: [3 4]


In [68]:
def get_pars_for_processing(X):
    '''
    Derive the preprocessing parameters from a feature matrix.

    A column is kept only if it contains no NaN; column 0 (the id) is
    always dropped. A StandardScaler is then fitted on the kept columns.

    Returns a (scaler, keep_cols) tuple, where keep_cols is a boolean
    mask over the columns of X.
    '''
    has_nan = np.isnan(X).any(axis=0)
    keep_cols = ~has_nan   # Drop every column containing a NaN
    keep_cols[0] = False   # Never keep the id column
    scaler = StandardScaler().fit(X[:, keep_cols])
    return scaler, keep_cols

def process_with_pars(X, params):
    '''
    Apply previously fitted preprocessing to training or test features.

    params is the (scaler, keep_cols) pair: the columns selected by the
    keep_cols mask are taken from X and passed through scaler.transform.
    '''
    fitted_scaler, column_mask = params
    selected = X[:, column_mask]
    return fitted_scaler.transform(selected)

# Do Cross Validation

In [7]:
# k-fold cross-validation. The preprocessing parameters are re-fitted on each
# training fold only and then applied to both that fold and its validation fold.
k = 5
accs = np.empty(k)
kf = KFold(n_splits=k, shuffle=True)

for i, (train_inds, val_inds) in enumerate(kf.split(X0)):
    # Raw features and labels for this fold.
    X, valX = X0[train_inds], X0[val_inds]
    Y, valY = Y0[train_inds], Y0[val_inds]

    pars = get_pars_for_processing(X)
    X = process_with_pars(X, pars)
    valX = process_with_pars(valX, pars)

    clf = RandomForestRegressor(n_estimators=10)
    clf.fit(X, Y)

    # Round the regression output to the nearest label and score the hit rate.
    accs[i] = np.mean(np.round(clf.predict(valX)) == valY)

acc = accs.mean()
acc

0.6388399338262603

In [69]:
# Single hold-out split: the last N // 10 entries of a random permutation of
# the row indices form the validation set, the rest the training set.
val_size = N // 10
inds = np.random.permutation(N)
fit_rows, held_rows = inds[:-val_size], inds[-val_size:]
X, Y = X0[fit_rows], Y0[fit_rows]
valX, valY = X0[held_rows], Y0[held_rows]

# Fit preprocessing on the training part only, then apply it to both parts.
pars = get_pars_for_processing(X)
X = process_with_pars(X, pars)
valX = process_with_pars(valX, pars)

# Alternative tried earlier: RandomForestRegressor(n_estimators=20, max_depth = 10)
clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 5), n_estimators = 100)
clf.fit(X, Y)

# NOTE(review): the recorded output of this cell shows all-NaN feature
# importances together with a division warning. That may mean boosting stopped
# before any stage received a non-zero weight - TODO confirm by inspecting
# clf.estimator_weights_ and len(clf.estimators_).
print(clf.feature_importances_)

# Accuracy of the rounded regression output on the held-out rows.
print(np.mean(np.round(clf.predict(valX)) == valY))

[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan]
0.6479793375873595


  / norm)


# Train final Model

In [26]:
# Fit the preprocessing parameters (column mask + scaler) on the full training set.
pars = get_pars_for_processing(X0)

def process(X, _params=pars):
    '''
    Preprocess X with the parameters fitted on the full training set.

    The parameters are bound as a default argument when this cell runs, so
    later cells that rebind the global `pars` (the cross-validation and
    hold-out cells both do) cannot silently change how the test set is
    preprocessed.
    '''
    return process_with_pars(X, _params)

X = process(X0)

# Final model, trained on every labelled row.
clf = AdaBoostRegressor(DecisionTreeRegressor(max_depth = 5), n_estimators = 100)
clf.fit(X, Y0);

# Save Final Model

In [27]:
# Pickle the fitted model to disk.
mName = 'test_model.pkl'
with open(mName, 'wb') as model_file:
    pickle.dump(clf, model_file)

In [28]:
# Reload the pickled model from disk into `clf`.
# pickle.load can execute arbitrary code: only use it on files you created.
mName = 'test_model.pkl'
with open(mName, 'rb') as model_file:
    clf = pickle.load(model_file)

# Get Predictions on Test Set

In [29]:
# Load the test set and run it through the same preprocessing as the training data.
test_df = pd.read_csv('test.csv')
tX = process(test_df.values)

# Raw regressor outputs (not rounded) are used as the submitted predictions.
output = clf.predict(tX)

# Write an id / Predicted table without the DataFrame index.
output_df = test_df[['id']].assign(Predicted=output)
output_df.to_csv('submission.csv', index=False)