#### Classification Model to determine if a coin price is moving up or down

In [7]:
import datetime
import glob
import os
import pickle
import random

import numpy as np
import pandas as pd

import random_forest


# set seed for repeatable results
RANDOM_STATE = 4217
# Seed both the stdlib and the numpy global generators from the single constant,
# so the seed value lives in one place and numpy-based randomness is covered too.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# columns kept from each indicator CSV; 'signal' is the class attribute
RELEVANT_ATTRIBUTES = ['pc_sma15c_sma30c', 'pc_open_close', 'pc_high_low', 'pc_low_close', 'signal']
# directory that is globbed for the per-coin CSV files
DATA_PATH='../indicator-data/'

# These attributes are the simple moving average percent changes, percent change from open to close, percent change from high to low, percent change from low to close, and the class attribute "signal".

##### Load Training Data

In [8]:
def load_data(filepath, keep_cols):
    """Read one CSV file and return only the requested, fully populated rows.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.
    keep_cols : list of str
        Column names to keep, in the order they should appear in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``keep_cols``, with every row that has a
        missing value in any of those columns removed. The original row
        labels are preserved.

    Raises
    ------
    KeyError
        If any name in ``keep_cols`` is not a column of the file.
    """
    # read csv data
    raw = pd.read_csv(filepath)

    # select only keep columns
    selected = raw[keep_cols]

    # drop missing rows; return a new frame instead of mutating the selection
    # in place, which can raise SettingWithCopyWarning on a derived frame
    return selected.dropna()

In [9]:
# load every coin into one df
# Sort the paths: glob returns them in filesystem order, and any seeded
# positional sampling of the combined frame depends on the row order.
coin_files = sorted(glob.glob(DATA_PATH+'*.csv'))
if not coin_files:
    # fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError('no CSV files found in ' + repr(DATA_PATH))

dfs = [load_data(coin_file, RELEVANT_ATTRIBUTES) for coin_file in coin_files]
# ignore_index=True gives every row a unique label; without it each file
# contributes its own 0..n labels and label-based drops hit rows of every coin.
df = pd.concat(dfs, ignore_index=True)
dfs.clear()

In [10]:
# display the combined frame built from all coin files
df

Unnamed: 0,pc_sma15c_sma30c,pc_open_close,pc_high_low,pc_low_close,signal
0,0.000000,0.041250,-0.067713,0.070057,0.0
1,0.000000,-0.021306,-0.120048,0.109246,0.0
2,0.000000,-0.021579,-0.097496,0.066573,0.0
3,0.000000,0.010181,-0.045895,0.041655,0.0
4,0.000000,-0.035367,-0.071147,0.025612,0.0
...,...,...,...,...,...
899,0.062599,0.002742,-0.091626,0.093073,0.0
900,0.083004,-0.238846,-0.246877,0.003540,0.0
901,0.102323,-0.042736,-0.223404,0.153251,0.0
902,0.114053,0.109020,-0.156783,0.152517,0.0


In [11]:
# split into train, validate, test with an 80/10/10 split respectively
# Shuffle once and slice by position. Index labels may repeat across the
# concatenated coin files, so dropping rows by label would also remove the
# matching rows of every other coin and leave validate/test far too small.
# `df` itself is left untouched so this cell can be re-run safely.
shuffled_df = df.sample(frac=1.0, random_state=RANDOM_STATE)
n_train = int(round(0.8 * len(shuffled_df)))
n_valid = (len(shuffled_df) - n_train) // 2

# .copy() keeps each split an independent frame rather than a slice view
train_df = shuffled_df.iloc[:n_train].copy()
valid_df = shuffled_df.iloc[n_train:n_train + n_valid].copy()
test_df = shuffled_df.iloc[n_train + n_valid:].copy()

# every row must land in exactly one split
assert len(train_df) + len(valid_df) + len(test_df) == len(df)

In [12]:
# check which columns the test split carries
test_df.columns

Index(['pc_sma15c_sma30c', 'pc_open_close', 'pc_high_low', 'pc_low_close',
       'signal'],
      dtype='object')

In [4]:
# Build the random forest on the training split, with 'signal' as the class
# column, then train it.
clf = random_forest.Random_Forest(
    train_df,
    'signal',
    column_prop=0.75,
    n_trees=100,
    row_prop=0.01,
)
clf.fit()

645 rows
tree 0 of 100
645 rows
tree 1 of 100
645 rows
tree 2 of 100
645 rows
tree 3 of 100
645 rows
tree 4 of 100
645 rows
tree 5 of 100
645 rows
tree 6 of 100
645 rows
tree 7 of 100
645 rows
tree 8 of 100
645 rows
tree 9 of 100
645 rows
tree 10 of 100
645 rows
tree 11 of 100
645 rows
tree 12 of 100
645 rows
tree 13 of 100
645 rows
tree 14 of 100
645 rows
tree 15 of 100
645 rows
tree 16 of 100
645 rows
tree 17 of 100
645 rows
tree 18 of 100
645 rows
tree 19 of 100
645 rows
tree 20 of 100
645 rows
tree 21 of 100
645 rows
tree 22 of 100
645 rows
tree 23 of 100
645 rows
tree 24 of 100
645 rows
tree 25 of 100
645 rows
tree 26 of 100
645 rows
tree 27 of 100
645 rows
tree 28 of 100
645 rows
tree 29 of 100
645 rows
tree 30 of 100
645 rows
tree 31 of 100
645 rows
tree 32 of 100
645 rows
tree 33 of 100
645 rows
tree 34 of 100
645 rows
tree 35 of 100
645 rows
tree 36 of 100
645 rows
tree 37 of 100
645 rows
tree 38 of 100
645 rows
tree 39 of 100
645 rows
tree 40 of 100
645 rows
tree 41 of 100
64

In [None]:
# save model
# pickle.dump takes the object to serialize first and the open binary file
# second; the fitted classifier `clf` is the object being persisted.
with open('classification.sav', 'wb') as fp:
    pickle.dump(clf, fp)