In [1]:
import time
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# =====

# Folder that holds the input data, and the CSV file read from it.
PATH_ROOT = r'C:/Users/Motohiro/Downloads/'
PATH_FILE = f'{PATH_ROOT}world_SPY_a.csv'

# Direction name -> integer class code (not referenced in the visible cells).
LABEL_CLASS = dict(Down=0, Up=1)

# CSV column names: the signed SPY difference the labels are derived from,
# and the recorded SPY class the predictions are compared against.
Y_DIFF_ACTUAL_COL, Y_ACTUAL_COL = 'YSpydiffActual', 'YSpyactual'

# =====

# Utils
# Reference point of the shared stopwatch. It is a time.perf_counter() reading
# (monotonic, high resolution), so it is only meaningful as a difference
# between two readings, never as a calendar timestamp.
start_time = time.perf_counter()


def watch_restart():
    """Reset the shared stopwatch so the next watch_print() measures from now."""
    global start_time
    start_time = time.perf_counter()


def watch_print(title):
    """Print ``title`` followed by the seconds elapsed since the last restart.

    Parameters
    ----------
    title : str
        Label printed in front of the elapsed time.

    The elapsed time is rounded to 4 decimal places and printed as
    ``<title> <seconds> seconds``. The stopwatch itself is not reset.
    """
    # Only reads the module-level value, so no ``global`` declaration is needed.
    elapsed = time.perf_counter() - start_time
    print(title, round(elapsed, 4), 'seconds')

In [2]:
# Restart the stopwatch so that only the CSV load below is timed.
watch_restart()

# ===== 1.0 Get Data from CSV
# Read the whole file at PATH_FILE into `df`, which the following cells reuse.
df = pd.read_csv(PATH_FILE)
# Print the column index as a quick check of what was loaded.
print(df.columns)

# Report the time elapsed since watch_restart() above.
watch_print('Get Data')

Index(['Id', 'PriceDate', 'YSpydiffActual', 'YSpyactual', 'YSpypredicted',
       'YSpypredictedProb', 'YDiadiffActual', 'YDiaactual', 'YDiapredicted',
       'YDiapredictedProb', 'XFtsediffNorm', 'XStoxxdiffNorm',
       'XGdaxidiffNorm', 'XSsmidiffNorm', 'XN225diffNorm', 'XAxjodiffNorm',
       'XHsidiffNorm', 'XSensexdiffNorm'],
      dtype='object')
Get Data 0.013 seconds


In [3]:
# Split data
def get_x_and_y(df, col):
    """Split a frame into a feature sub-frame and binary labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col`` and the eight feature columns listed below.
    col : str
        Name of the column whose sign defines the label.

    Returns
    -------
    tuple
        ``(X, y)``: ``X`` is ``df`` restricted to the feature columns and
        ``y`` is a NumPy array holding 1 where ``df[col] > 0`` and 0 elsewhere.
    """
    # The XSsecdiffNorm, XNiftydiffNorm, XKs11diffNorm and XTwiidiffNorm
    # columns are currently disabled and therefore not part of this list.
    feature_columns = [
        'XFtsediffNorm',
        'XStoxxdiffNorm',
        'XGdaxidiffNorm',
        'XSsmidiffNorm',
        'XN225diffNorm',
        'XAxjodiffNorm',
        'XHsidiffNorm',
        'XSensexdiffNorm',
    ]

    # Labels first, then features (same evaluation order as before).
    is_positive = df[col] > 0
    labels = np.where(is_positive, 1, 0)
    features = df[feature_columns]
    return features, labels

# Shuffle all rows and renumber them 0..n-1. The fixed random_state makes the
# shuffled order (and therefore everything printed from it) repeatable under
# Restart Kernel -> Run All instead of changing on every run.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Features and binary labels; a label is 1 where Y_DIFF_ACTUAL_COL is positive.
X, y = get_x_and_y(df, Y_DIFF_ACTUAL_COL)

print('X', X.head())
print('y', y)

X    XFtsediffNorm  XStoxxdiffNorm  XGdaxidiffNorm  XSsmidiffNorm  \
0        -0.0071         -0.0127         -0.0145        -0.0069   
1        -0.0011         -0.0034         -0.0022        -0.0042   
2         0.0087          0.0057          0.0061         0.0052   
3         0.0032          0.0061          0.0106         0.0033   
4         0.0206          0.0155          0.0114         0.0085   

   XN225diffNorm  XAxjodiffNorm  XHsidiffNorm  XSensexdiffNorm  
0        -0.0109        -0.0038       -0.0202          -0.0084  
1         0.0040        -0.0011        0.0034          -0.0086  
2         0.0014        -0.0065        0.0131           0.0042  
3        -0.0302        -0.0157       -0.0099          -0.0307  
4         0.0083         0.0010        0.0045           0.0032  
y [0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 1 0 1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 1 1 0 1
 0 1 1 1 0 1 1 1 1 0 1 0 1 0 1 1 0 1 0 0 1 1 0 0 1 1 0 0 1 0 1 0 1 1 1 0 1
 1 1 1 1 0 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1 1 

In [4]:
# Average CV score on the training set was: 0.7652694610778443
# Steps of the outer pipeline, in order:
#   1. make_union of two branches over the input features:
#        a. StackingEstimator(KNN, n_neighbors=84) followed by RBFSampler
#        b. FunctionTransformer(copy), i.e. the input passed through a copy
#   2. StackingEstimator around a depth-1 decision tree
#   3. the final KNeighborsClassifier (n_neighbors=42, p=1)
# random_state is pinned on the two stochastic steps (RBFSampler draws random
# weights; the tree permutes features when searching for a split) so that
# refitting on the same data yields the same model.
exported_pipeline = make_pipeline(
    make_union(
        make_pipeline(
            StackingEstimator(estimator=KNeighborsClassifier(n_neighbors=84, p=2, weights="uniform")),
            RBFSampler(gamma=0.8500000000000001, random_state=42)
        ),
        FunctionTransformer(copy)
    ),
    StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=1, min_samples_leaf=13, min_samples_split=6, random_state=42)),
    KNeighborsClassifier(n_neighbors=42, p=1, weights="uniform")
)

In [5]:
# Fit the pipeline and score the very same rows.
# NOTE(review): fit() and predict_proba() both receive X, so every figure
# printed below is an in-sample (training-set) number, not a hold-out estimate.
exported_pipeline.fit(X, y)
y_pred_proba = exported_pipeline.predict_proba(X)

# A prediction counts as "high confidence" when its winning class probability,
# expressed as a percentage, is strictly above this value.
CONFIDENCE_THRESHOLD = 85

# Predicted class = column with the largest probability in each row.
y_classes = np.argmax(y_pred_proba, axis=1)

# Read the columns positionally so each row stays aligned with y_pred_proba
# regardless of what the DataFrame index looks like.
price_dates = df['PriceDate'].to_numpy()
actual_classes = df[Y_ACTUAL_COL].to_numpy()

index = 0          # number of rows processed (printed as the total below)
over = 0           # predictions above the confidence threshold
over_correct = 0   # ... of which the predicted class matched the actual one
rows = zip(price_dates, actual_classes, y_classes, y_pred_proba)
for index, (price_date, actual, predicted, row_proba) in enumerate(rows, start=1):
    result = 'correct' if actual == predicted else 'wrong'
    prob = round(row_proba.max() * 100, 2)
    if prob > CONFIDENCE_THRESHOLD:
        over += 1
        if result == 'correct':
            over_correct += 1
    print(price_date, actual, predicted, prob, result)

print(index)
# Accuracy among high-confidence predictions, then their share of all rows.
# Both ratios are undefined when their denominator is zero (no prediction
# above the threshold, or no rows at all); print nan instead of raising
# ZeroDivisionError in that case.
print(round(over_correct / over * 100, 2) if over else float('nan'))
print(round(over / index * 100, 2) if index else float('nan'))


11/20/2018 00:00:00 0 0 57.14 correct
05/18/2018 00:00:00 0 0 59.52 correct
12/12/2019 00:00:00 1 0 54.76 wrong
01/06/2015 00:00:00 0 0 57.14 correct
10/04/2016 00:00:00 0 1 52.38 wrong
07/25/2019 00:00:00 0 1 54.76 wrong
04/06/2016 00:00:00 1 1 66.67 correct
01/07/2015 00:00:00 1 0 50.0 wrong
03/24/2015 00:00:00 0 0 61.9 correct
09/07/2016 00:00:00 1 1 76.19 correct
03/19/2018 00:00:00 0 0 69.05 correct
08/10/2016 00:00:00 0 1 57.14 wrong
07/11/2019 00:00:00 0 1 73.81 wrong
10/11/2017 00:00:00 1 1 64.29 correct
04/18/2018 00:00:00 0 1 54.76 wrong
06/23/2016 00:00:00 1 0 52.38 wrong
08/20/2018 00:00:00 1 1 57.14 correct
10/31/2017 00:00:00 0 0 50.0 correct
10/19/2015 00:00:00 1 1 73.81 correct
06/03/2016 00:00:00 1 0 71.43 wrong
04/12/2018 00:00:00 1 1 64.29 correct
02/02/2015 00:00:00 1 1 64.29 correct
02/07/2018 00:00:00 0 1 54.76 wrong
02/25/2015 00:00:00 0 1 66.67 wrong
06/25/2018 00:00:00 0 0 69.05 correct
04/16/2015 00:00:00 1 1 61.9 correct
11/19/2019 00:00:00 0 0 54.76 correct


10/26/2017 00:00:00 0 1 57.14 wrong
05/24/2018 00:00:00 0 0 66.67 correct
01/22/2020 00:00:00 0 0 64.29 correct
07/20/2017 00:00:00 0 1 57.14 wrong
02/14/2020 00:00:00 1 1 69.05 correct
04/09/2018 00:00:00 0 1 59.52 wrong
12/11/2018 00:00:00 0 1 61.9 wrong
03/23/2017 00:00:00 0 1 64.29 wrong
05/20/2015 00:00:00 0 0 50.0 correct
06/05/2017 00:00:00 1 1 61.9 correct
12/02/2014 00:00:00 1 0 57.14 wrong
04/21/2017 00:00:00 0 0 66.67 correct
10/16/2018 00:00:00 1 1 80.95 correct
04/12/2019 00:00:00 1 0 57.14 wrong
12/15/2016 00:00:00 1 0 50.0 wrong
05/15/2019 00:00:00 1 1 71.43 correct
07/14/2015 00:00:00 1 1 64.29 correct
03/12/2019 00:00:00 1 0 66.67 wrong
04/22/2015 00:00:00 1 0 57.14 wrong
02/05/2015 00:00:00 1 0 57.14 wrong
07/03/2019 00:00:00 1 1 61.9 correct
11/14/2014 00:00:00 1 1 73.81 correct
05/16/2018 00:00:00 1 0 52.38 wrong
05/30/2018 00:00:00 1 1 57.14 correct
11/17/2016 00:00:00 1 0 61.9 wrong
12/18/2015 00:00:00 0 0 69.05 correct
01/30/2020 00:00:00 1 0 61.9 wrong
07/26/201