# Introduction
Machine Learning Algorithm that takes an asset's high, low, close, open interest and volume information, trains and predicts if the future realized volatility will be higher or lower than a level.  Machine Learning clustering Decision Tree algorithm trains on a financial asset's daily market information.  Function allows user to set a range of volatilities to test after training.    

In [None]:
import pandas as pd
import numpy as np

In [2]:
from IPython.display import display # Allows the use of display() for DataFrames

# Pretty display for notebooks
%matplotlib inline

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
# Import the classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [6]:
from sklearn.metrics import precision_recall_fscore_support

In [7]:
og = pd.read_csv(r"C:\Users\Matt\Desktop\shop.csv")

In [8]:
len(og)

1143

In [9]:
def vol_convert(vol):
    rate = vol/1600
    return rate

In [10]:
start_vol = 4
end_vol = 10
step = .15
hist_period1 = 10
hist_period2 = 20
hist_period3 = 30
volume_period1 = 2
volume_period2 = 4
forward_vol_period = 20
vol_to_test = 6
rate = vol_convert(vol_to_test)

In [11]:
og.tail()

Unnamed: 0,Date Time,Open,High,Low,Close,Change,Volume
1138,11/27/2019,337.51,342.65,330.08,341.0,4.81,2415200
1139,11/29/2019,339.9,344.0,335.51,336.75,-4.25,1206700
1140,12/2/2019,336.0,336.0,311.532,330.84,-5.91,2990200
1141,12/3/2019,321.01,351.31,320.0,350.66,19.82,3278200
1142,12/4/2019,350.0,376.9099,349.51,372.0,21.34,5607400


In [12]:
data = og.copy()

In [13]:
#process dataset ready for machine learning, add custome columns, forward vol uses mean close to close and abs change 

def process(dataframe, hist1, hist2, hist3, volume1, volume2, forward_vol_period, question_vol, cushion):
    
    
    dataframe['abs_change'] = dataframe['Change'].abs() / dataframe['Close']
    
    dataframe['high_move'] = (((dataframe.High - dataframe.Close.shift(1))/dataframe['Close']).abs())
    dataframe['low_move'] = (((dataframe.Low - data.Close.shift(1))/dataframe['Close']).abs())
    dataframe['max_move'] = dataframe[['high_move', 'low_move']].max(axis=1)
    
    dataframe['hist_max_1'] = dataframe.max_move.rolling(window=hist1).mean()
    dataframe['hist_max_2'] = dataframe.max_move.rolling(window=hist2).mean()
    dataframe['hist_max_3'] = dataframe.max_move.rolling(window=hist3).mean()
    
    dataframe['hist_change_1'] = dataframe.abs_change.rolling(window=hist1).mean()
    dataframe['hist_change_2'] = dataframe.abs_change.rolling(window=hist2).mean()
    dataframe['hist_change_3'] = dataframe.abs_change.rolling(window=hist3).mean()
    
    dataframe['avg_max_close'] = dataframe[['max_move', 'abs_change']].mean(axis=1)
    
    dataframe['hist_avgmax_1'] = dataframe.avg_max_close.rolling(window=hist1).mean()
    dataframe['hist_avgmax_2'] = dataframe.avg_max_close.rolling(window=hist2).mean()
    dataframe['hist_avgmax_3'] = dataframe.avg_max_close.rolling(window=hist3).mean()
    
    dataframe['hist_volume_1'] = dataframe.Volume.rolling(window=volume1).mean()
    dataframe['hist_volume_2'] = dataframe.Volume.rolling(window=volume2).mean()
    
    dataframe['backtothefuture'] = dataframe.avg_max_close.rolling(window=forward_vol_period).mean()
    
    dataframe['forward_avg_close_max'] = dataframe['backtothefuture'].shift(-forward_vol_period)
    
    
    
    
    
    
    dataframe['volatile'] = (dataframe['forward_avg_close_max'] > question_vol + cushion)
    dataframe = dataframe.applymap(lambda x: 1 if x == True else x)
    dataframe = dataframe.applymap(lambda x: 0 if x == False else x)
    #drop column 
    del dataframe['Date Time']
    del dataframe['Open']
    del dataframe['High']
    del dataframe['Low']
    del dataframe['Close']
    del dataframe['backtothefuture']
    
    return dataframe 

In [14]:
#set and process dataset 
new_data = process(data,hist_period1 ,hist_period2 ,hist_period3 ,volume_period1,volume_period2,forward_vol_period, rate, 0)

In [15]:
new_data.tail()

Unnamed: 0,Change,Volume,abs_change,high_move,low_move,max_move,hist_max_1,hist_max_2,hist_max_3,hist_change_1,hist_change_2,hist_change_3,avg_max_close,hist_avgmax_1,hist_avgmax_2,hist_avgmax_3,hist_volume_1,hist_volume_2,forward_avg_close_max,volatile
1138,4.81,2415200,0.014106,0.018944,0.017918,0.018944,0.02511,0.029153,0.036698,0.014966,0.018804,0.02429,0.016525,0.020038,0.023978,0.030494,2649900.0,1950275.0,,0
1139,-4.25,1206700,0.012621,0.008909,0.016303,0.016303,0.024954,0.028313,0.036207,0.015633,0.01855,0.023936,0.014462,0.020293,0.023431,0.030072,1810950.0,1970325.0,,0
1140,-5.91,2990200,0.017864,0.002267,0.076224,0.076224,0.031661,0.030781,0.036711,0.017397,0.01894,0.022729,0.047044,0.024529,0.02486,0.02972,2098450.0,2374175.0,,0
1141,19.82,3278200,0.056522,0.058376,0.030913,0.058376,0.033334,0.03159,0.037383,0.019767,0.020329,0.024051,0.057449,0.026551,0.025959,0.030717,3134200.0,2472575.0,,0
1142,21.34,5607400,0.057366,0.070564,0.003091,0.070564,0.037713,0.032843,0.037018,0.023385,0.021097,0.023734,0.063965,0.030549,0.02697,0.030376,4442800.0,3270625.0,,0


In [16]:
new_data['forward_avg_close_max'].describe()

count    1123.000000
mean        0.030070
std         0.008846
min         0.015286
25%         0.023171
50%         0.028567
75%         0.033776
max         0.059552
Name: forward_avg_close_max, dtype: float64

In [17]:
new_data['volatile'].value_counts()

1    1123
0      20
Name: volatile, dtype: int64

In [18]:
new_data = new_data.dropna(how = 'any')

In [19]:
new_data.head(20)

Unnamed: 0,Change,Volume,abs_change,high_move,low_move,max_move,hist_max_1,hist_max_2,hist_max_3,hist_change_1,hist_change_2,hist_change_3,avg_max_close,hist_avgmax_1,hist_avgmax_2,hist_avgmax_3,hist_volume_1,hist_volume_2,forward_avg_close_max,volatile
30,0.23,348900,0.007477,0.009753,0.065995,0.065995,0.061786,0.07277,0.062132,0.04073,0.044809,0.037647,0.036736,0.051258,0.05879,0.049889,249100.0,453750.0,0.045835,1
31,-1.57,187300,0.053786,0.013703,0.059267,0.059267,0.062717,0.073122,0.061825,0.041846,0.045099,0.037933,0.056526,0.052282,0.059111,0.049879,268100.0,247425.0,0.044049,1
32,-0.13,249100,0.004474,0.04095,0.013421,0.04095,0.06125,0.074209,0.059978,0.04082,0.044896,0.035476,0.022712,0.051035,0.059552,0.047727,218200.0,233650.0,0.046648,1
33,0.54,280400,0.018243,0.041554,0.001351,0.041554,0.062076,0.072007,0.060125,0.039429,0.041529,0.036024,0.029899,0.050753,0.056768,0.048074,264750.0,266425.0,0.047393,1
34,0.82,220000,0.026956,0.045694,0.00526,0.045694,0.060052,0.069876,0.060937,0.038187,0.038565,0.036616,0.036325,0.04912,0.054221,0.048776,250200.0,234200.0,0.04855,1
35,0.44,145500,0.014258,0.029488,0.002592,0.029488,0.055594,0.065027,0.061188,0.032589,0.036644,0.036933,0.021873,0.044091,0.050835,0.04906,182750.0,223750.0,0.050152,1
36,-1.52,357100,0.051806,0.005112,0.066462,0.066462,0.05305,0.064571,0.062563,0.030023,0.03581,0.038189,0.059134,0.041536,0.050191,0.050376,251300.0,250750.0,0.048491,1
37,0.05,148200,0.001701,0.018033,0.01395,0.018033,0.041771,0.06027,0.062496,0.021064,0.035222,0.037651,0.009867,0.031417,0.047746,0.050073,252650.0,217700.0,0.049655,1
38,2.88,1619800,0.089247,0.091726,0.007437,0.091726,0.048963,0.058808,0.063151,0.02935,0.035353,0.039804,0.090487,0.039156,0.04708,0.051478,884000.0,567650.0,0.046555,1
39,1.48,414100,0.043852,0.049185,0.018667,0.049185,0.050835,0.057044,0.063963,0.03118,0.037503,0.040524,0.046519,0.041008,0.047274,0.052243,1016950.0,634800.0,0.045254,1


In [20]:
# Store the 'Survived' feature in a new variable and remove it from the dataset
outcomes = new_data['volatile']


del new_data['volatile']
del new_data['forward_avg_close_max']


features = new_data

# Show the new dataset with 'Survived' removed
features.head()

Unnamed: 0,Change,Volume,abs_change,high_move,low_move,max_move,hist_max_1,hist_max_2,hist_max_3,hist_change_1,hist_change_2,hist_change_3,avg_max_close,hist_avgmax_1,hist_avgmax_2,hist_avgmax_3,hist_volume_1,hist_volume_2
30,0.23,348900,0.007477,0.009753,0.065995,0.065995,0.061786,0.07277,0.062132,0.04073,0.044809,0.037647,0.036736,0.051258,0.05879,0.049889,249100.0,453750.0
31,-1.57,187300,0.053786,0.013703,0.059267,0.059267,0.062717,0.073122,0.061825,0.041846,0.045099,0.037933,0.056526,0.052282,0.059111,0.049879,268100.0,247425.0
32,-0.13,249100,0.004474,0.04095,0.013421,0.04095,0.06125,0.074209,0.059978,0.04082,0.044896,0.035476,0.022712,0.051035,0.059552,0.047727,218200.0,233650.0
33,0.54,280400,0.018243,0.041554,0.001351,0.041554,0.062076,0.072007,0.060125,0.039429,0.041529,0.036024,0.029899,0.050753,0.056768,0.048074,264750.0,266425.0
34,0.82,220000,0.026956,0.045694,0.00526,0.045694,0.060052,0.069876,0.060937,0.038187,0.038565,0.036616,0.036325,0.04912,0.054221,0.048776,250200.0,234200.0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.30, random_state=42)

In [22]:
# TODO: Define the classifier, and fit it to the data
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [23]:
# Making predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate the accuracy
from sklearn.metrics import accuracy_score
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)

The training accuracy is 1.0
The test accuracy is 1.0


In [24]:
# Training the model
model = DecisionTreeClassifier(max_depth=15, min_samples_leaf=20, min_samples_split=20)
model.fit(X_train, y_train)

# Making predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculating accuracies
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print('The training accuracy is', train_accuracy)
print('The test accuracy is', test_accuracy)

The training accuracy is 1.0
The test accuracy is 1.0


In [25]:
def process_sample(dataframe, hist1, hist2, hist3, volume1, volume2):
    
    dataframe['abs_change'] = dataframe['Change'].abs() / dataframe['Close']
    
    dataframe['high_move'] = (((dataframe.High - dataframe.Close.shift(1))/dataframe['Close']).abs())
    dataframe['low_move'] = (((dataframe.Low - data.Close.shift(1))/dataframe['Close']).abs())
    dataframe['max_move'] = dataframe[['high_move', 'low_move']].max(axis=1)
    
    dataframe['hist_max_1'] = dataframe.max_move.rolling(window=hist1).mean()
    dataframe['hist_max_2'] = dataframe.max_move.rolling(window=hist2).mean()
    dataframe['hist_max_3'] = dataframe.max_move.rolling(window=hist3).mean()
    
    dataframe['hist_change_1'] = dataframe.abs_change.rolling(window=hist1).mean()
    dataframe['hist_change_2'] = dataframe.abs_change.rolling(window=hist2).mean()
    dataframe['hist_change_3'] = dataframe.abs_change.rolling(window=hist3).mean()
    
    dataframe['avg_max_close'] = dataframe[['max_move', 'abs_change']].mean(axis=1)
    
    dataframe['hist_avgmax_1'] = dataframe.avg_max_close.rolling(window=hist1).mean()
    dataframe['hist_avgmax_2'] = dataframe.avg_max_close.rolling(window=hist2).mean()
    dataframe['hist_avgmax_3'] = dataframe.avg_max_close.rolling(window=hist3).mean()
    
    dataframe['hist_volume_1'] = dataframe.Volume.rolling(window=volume1).mean()
    dataframe['hist_volume_2'] = dataframe.Volume.rolling(window=volume2).mean()
    

    #drop column 
    del dataframe['Open']
    del dataframe['High']
    del dataframe['Low']
    del dataframe['Close']
    
    return dataframe 

In [26]:
new_og = process_sample(og, hist_period1 ,hist_period2 ,hist_period3 , volume_period1, volume_period2)

In [27]:
new_og.tail()

Unnamed: 0,Date Time,Change,Volume,abs_change,high_move,low_move,max_move,hist_max_1,hist_max_2,hist_max_3,hist_change_1,hist_change_2,hist_change_3,avg_max_close,hist_avgmax_1,hist_avgmax_2,hist_avgmax_3,hist_volume_1,hist_volume_2
1138,11/27/2019,4.81,2415200,0.014106,0.018944,0.017918,0.018944,0.02511,0.029153,0.036698,0.014966,0.018804,0.02429,0.016525,0.020038,0.023978,0.030494,2649900.0,1950275.0
1139,11/29/2019,-4.25,1206700,0.012621,0.008909,0.016303,0.016303,0.024954,0.028313,0.036207,0.015633,0.01855,0.023936,0.014462,0.020293,0.023431,0.030072,1810950.0,1970325.0
1140,12/2/2019,-5.91,2990200,0.017864,0.002267,0.076224,0.076224,0.031661,0.030781,0.036711,0.017397,0.01894,0.022729,0.047044,0.024529,0.02486,0.02972,2098450.0,2374175.0
1141,12/3/2019,19.82,3278200,0.056522,0.058376,0.030913,0.058376,0.033334,0.03159,0.037383,0.019767,0.020329,0.024051,0.057449,0.026551,0.025959,0.030717,3134200.0,2472575.0
1142,12/4/2019,21.34,5607400,0.057366,0.070564,0.003091,0.070564,0.037713,0.032843,0.037018,0.023385,0.021097,0.023734,0.063965,0.030549,0.02697,0.030376,4442800.0,3270625.0


In [28]:
del new_og['Date Time']

In [29]:
new_og.tail()

Unnamed: 0,Change,Volume,abs_change,high_move,low_move,max_move,hist_max_1,hist_max_2,hist_max_3,hist_change_1,hist_change_2,hist_change_3,avg_max_close,hist_avgmax_1,hist_avgmax_2,hist_avgmax_3,hist_volume_1,hist_volume_2
1138,4.81,2415200,0.014106,0.018944,0.017918,0.018944,0.02511,0.029153,0.036698,0.014966,0.018804,0.02429,0.016525,0.020038,0.023978,0.030494,2649900.0,1950275.0
1139,-4.25,1206700,0.012621,0.008909,0.016303,0.016303,0.024954,0.028313,0.036207,0.015633,0.01855,0.023936,0.014462,0.020293,0.023431,0.030072,1810950.0,1970325.0
1140,-5.91,2990200,0.017864,0.002267,0.076224,0.076224,0.031661,0.030781,0.036711,0.017397,0.01894,0.022729,0.047044,0.024529,0.02486,0.02972,2098450.0,2374175.0
1141,19.82,3278200,0.056522,0.058376,0.030913,0.058376,0.033334,0.03159,0.037383,0.019767,0.020329,0.024051,0.057449,0.026551,0.025959,0.030717,3134200.0,2472575.0
1142,21.34,5607400,0.057366,0.070564,0.003091,0.070564,0.037713,0.032843,0.037018,0.023385,0.021097,0.023734,0.063965,0.030549,0.02697,0.030376,4442800.0,3270625.0


In [30]:
last_date = new_og.index[-1]
last_date

1142

In [31]:
one_sample = [new_og.iloc[last_date,:]] 

In [32]:
model.predict(one_sample)

array([1], dtype=int64)

In [33]:
def tail_vol(period, dataframe):
    rate = dataframe['avg_max_close'].tail(period).mean()
    vol = round((rate*100*16),2)
    
    return vol

In [34]:
tail_vol(20, new_og)

43.15

In [35]:
answer = model.predict(one_sample)
answer = answer[0]
answer

1

In [36]:
def find_vol(dataframe, start_vol, end_vol, step, hist_period1, hist_period2, hist_period3, 
             volume_period1, volume_period2, forward_vol_period):
    vol = start_vol
    while vol < end_vol: 
        answer = 1 
        rate = vol_convert(vol)
        beg_frame = dataframe.copy()
        
        frame = process(dataframe,hist_period1 ,hist_period2 ,hist_period3 ,volume_period1,volume_period2,
                        forward_vol_period, rate, 0)
   
    
        frame = frame.dropna(how = 'any')
    
        outcomes = frame['volatile']
        features = frame

        del frame['volatile']
        del frame['forward_avg_close_max']
        X_train, X_test, y_train, y_test = train_test_split(features, outcomes, test_size=0.30, random_state=42)
        # Training the model
        model = DecisionTreeClassifier(max_depth=15, min_samples_leaf=20, min_samples_split=20)
        model.fit(X_train, y_train)

        # Making predictions
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        # Calculating accuracies
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        precision = precision_score(y_test, y_test_pred)
        recall = recall_score(y_test, y_test_pred)
        the_f1 = f1_score(y_test, y_test_pred)
        
   
    
        original_data = process_sample(beg_frame, hist_period1 ,hist_period2 ,hist_period3 , volume_period1, volume_period2)
        #original_data = original_data.dropna(how = 'any')
        
        del original_data['Date Time']
        last_row = original_data.index[-1]
        one_sample = [new_og.iloc[last_row,:]] 
    
        answer = model.predict(one_sample)
        answer = answer[0]
        print(vol)
        print(answer)
        print('The training accuracy is', train_accuracy)
        print('The test accuracy is', test_accuracy)
        print('The precision is', precision)
        print('The recall is', recall)
        print('The F1 is', the_f1 )
        
        
        print('The number of days', len(outcomes))
        print('The percentage of volatile days', ((outcomes == 1).sum()) / len(outcomes))
        
        
        
        #print(frame.head(1))
    
        #if answer == 0:
            #break
        
        vol = vol + step


In [37]:
fv_dataframe = pd.read_csv(r"C:\Users\Matt\Desktop\shop.csv")
vol = 48
start_vol = 40
end_vol = 60
step = 1
hist_period1 = 10
hist_period2 = 15
hist_period3 = 20
volume_period1 = 4
volume_period2 = 8
forward_vol_period = 25
rate = vol_convert(vol)

In [38]:
find_vol(fv_dataframe, start_vol, end_vol, step, hist_period1, hist_period2, hist_period3, 
         volume_period1, volume_period2, forward_vol_period)

40
1
The training accuracy is 0.8346354166666666
The test accuracy is 0.7212121212121212
The precision is 0.7404580152671756
The recall is 0.8899082568807339
The F1 is 0.8083333333333333
The number of days 1098
The percentage of volatile days 0.6930783242258652
41
1
The training accuracy is 0.8333333333333334
The test accuracy is 0.7757575757575758
The precision is 0.7878787878787878
The recall is 0.8792270531400966
The F1 is 0.8310502283105022
The number of days 1098
The percentage of volatile days 0.6493624772313297
42
1
The training accuracy is 0.8177083333333334
The test accuracy is 0.7242424242424242
The precision is 0.7864583333333334
The recall is 0.7512437810945274
The F1 is 0.7684478371501272
The number of days 1098
The percentage of volatile days 0.6165755919854281
43
1
The training accuracy is 0.82421875
The test accuracy is 0.7151515151515152
The precision is 0.7373737373737373
The recall is 0.776595744680851
The F1 is 0.7564766839378237
The number of days 1098
The percenta

In [39]:
ex = process(fv_dataframe,hist_period1 ,hist_period2 ,hist_period3 ,volume_period1,volume_period2,
                        forward_vol_period, rate, 0)

In [40]:
del ex['forward_avg_close_max']
del ex['volatile']

test_rate = vol_convert(vol)

ex['volatile'] = (ex['avg_max_close'] > test_rate)

ex = ex.applymap(lambda x: 0 if x == False else x)
ex = ex.applymap(lambda x: 1 if x == True else x)

In [41]:
ex = ex.dropna(how = 'any')
len(ex)

1123

In [42]:
print('the 5 day scalp average vol is', tail_vol(5, ex) )
print('the 10 day scalp average vol is', tail_vol(10, ex) )
print('the 20 day scalp average vol is', tail_vol(20, ex) )
print('the 40 day scalp average vol is', tail_vol(40, ex) )
print('the 60 day scalp average vol is', tail_vol(60, ex) )
print('the 90 day scalp average vol is', tail_vol(40, ex) )
print('the 120 day scalp average vol is', tail_vol(60, ex) )


the 5 day scalp average vol is 63.82
the 10 day scalp average vol is 48.88
the 20 day scalp average vol is 43.15
the 40 day scalp average vol is 52.23
the 60 day scalp average vol is 51.79
the 90 day scalp average vol is 52.23
the 120 day scalp average vol is 51.79


In [43]:
fourty = ex.tail(40)
thirty = ex.tail(30)
twenty = ex.tail(20)
ten = ex.tail(10)
five = ex.tail(5)

#new['volatile'].sum()/len(new)

In [44]:
print('vol is', vol)
print('the percentage of volatile days over the last 5 days:', five['volatile'].sum()/len(five) )
print('the percentage of volatile days over the last 10 days:', ten['volatile'].sum()/len(ten) )
print('the percentage of volatile days over the last 20 days:', twenty['volatile'].sum()/len(twenty) )
print('the percentage of volatile days over the last 30 days:', thirty['volatile'].sum()/len(thirty) )
print('the percentage of volatile days over the last 40 days:', fourty['volatile'].sum()/len(fourty) )


vol is 48
the percentage of volatile days over the last 5 days: 0.6
the percentage of volatile days over the last 10 days: 0.4
the percentage of volatile days over the last 20 days: 0.35
the percentage of volatile days over the last 30 days: 0.4
the percentage of volatile days over the last 40 days: 0.425


In [45]:
five.tail(1)

Unnamed: 0,Change,Volume,abs_change,high_move,low_move,max_move,hist_max_1,hist_max_2,hist_max_3,hist_change_1,hist_change_2,hist_change_3,avg_max_close,hist_avgmax_1,hist_avgmax_2,hist_avgmax_3,hist_volume_1,hist_volume_2,volatile
1142,21.34,5607400,0.057366,0.070564,0.003091,0.070564,0.037713,0.033473,0.032843,0.023385,0.021458,0.021097,0.063965,0.030549,0.027465,0.02697,3270625.0,2610450.0,1


In [46]:
len(ex)

1123