In [1]:
import pandas as pd
import numpy as np
import os

headers = ['Timestamp', 'Open', 'High', 'Low', 'Close', 'Volume(BTC)', 'Volume(Currency)', 'WeightedPrice']
train_X = pd.DataFrame(columns=headers)
test_X = pd.DataFrame(columns=headers)

for filename in os.listdir('../datasets/bitcoin-5')[:20]:
    train_X = pd.concat([train_X, pd.read_csv(f'../datasets/bitcoin-5/{filename}', index_col=False)])
    print("train: ", filename)
    
for filename in os.listdir('../datasets/bitcoin-5')[20:30]:
    test_X = pd.concat([test_X, pd.read_csv(f'../datasets/bitcoin-5/{filename}', index_col=False)])
    print("test: ", filename)

train:  2020-06-01.csv
train:  2020-06-02.csv
train:  2020-06-03.csv
train:  2020-06-04.csv
train:  2020-06-05.csv
train:  2020-06-06.csv
train:  2020-06-07.csv
train:  2020-06-08.csv
train:  2020-06-09.csv
train:  2020-06-10.csv
train:  2020-06-11.csv
train:  2020-06-12.csv
train:  2020-06-13.csv
train:  2020-06-14.csv
train:  2020-06-15.csv
train:  2020-06-16.csv
train:  2020-06-17.csv
train:  2020-06-18.csv
train:  2020-06-19.csv
train:  2020-06-20.csv
test:  2020-06-21.csv
test:  2020-06-22.csv
test:  2020-06-23.csv
test:  2020-06-24.csv
test:  2020-06-25.csv
test:  2020-06-26.csv
test:  2020-06-27.csv
test:  2020-06-28.csv
test:  2020-06-29.csv
test:  2020-06-30.csv


In [5]:
from palantir.preprocessor import preprocess

X_train, y_train = preprocess(train_X)
X_test, y_test = preprocess(test_X)

In [8]:
def test_clf(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    print("training: ", clf.score(X_train, y_train))
    print("testing: ", clf.score(X_test, y_test))

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

test_clf(GradientBoostingClassifier(), X_train, y_train.classifier, X_test, y_test.classifier)

training:  0.8885504389894284
testing:  0.8621330360460795


In [101]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'bootstrap': [True, False],
 'max_depth': [10,50,100, None],
 'max_features': ['auto', 'sqrt'],
 'n_estimators': [200]}
]

grid_search = GridSearchCV(RandomForestClassifier(), 
                          param_grid,
                          cv=5,
                          )

In [102]:
grid_search.fit(X_train, y_train.classifier)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [103]:
grid_search.best_estimator_

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [104]:
cvres = grid_search.cv_results_
for score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(score, params)

0.8455503572998591 {'bootstrap': True, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 200}
0.8421456602586936 {'bootstrap': True, 'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 200}
0.8394565255757389 {'bootstrap': True, 'max_depth': 50, 'max_features': 'auto', 'n_estimators': 200}
0.8382030079289444 {'bootstrap': True, 'max_depth': 50, 'max_features': 'sqrt', 'n_estimators': 200}
0.84088989645203 {'bootstrap': True, 'max_depth': 100, 'max_features': 'auto', 'n_estimators': 200}
0.8392777954261768 {'bootstrap': True, 'max_depth': 100, 'max_features': 'sqrt', 'n_estimators': 200}
0.8419651652692343 {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'n_estimators': 200}
0.8399944808643223 {'bootstrap': True, 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}
0.8339017722201364 {'bootstrap': False, 'max_depth': 10, 'max_features': 'auto', 'n_estimators': 200}
0.8321099784047773 {'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'n_es

In [82]:
from sklearn.ensemble import VotingClassifier

vc = VotingClassifier(estimators=[('rf', RandomForestClassifier()), 
                                  ('xf', ExtraTreesClassifier()),
                                 ('af', AdaBoostClassifier())],
    voting='hard'
)

In [84]:
test_clf(vc, X_train, y_train.classifier, X_test, y_test.classifier)

training:  1.0
testing:  0.8636194723151245


In [96]:
from sklearn.linear_model import LinearRegression, BayesianRidge

test_clf(BayesianRidge(), X_train, y_train.regressor, X_test, y_test.regressor)

training:  0.9220493226195368
testing:  0.9217207648774112
