In [1]:
# Based on "Stock Market Forecasting Using Machine Learning Algorithms"
# by Shunrong Shen, Haomiao Jiang, Tongda Zhang
# https://pdfs.semanticscholar.org/b68e/8d2f4d2c709bb5919b82effcb6a7bbd3db37.pdf
# Data from yahoo.com and investing.com
# !pip install TPOT

# ===== 0 Utils and Consts
import csv
import os
import time

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# =====

# Data directory. Set the SPY_DATA_DIR environment variable to point the
# notebook at another folder; when it is unset, the original local folder is
# used, so existing runs resolve to exactly the same paths as before.
PATH_ROOT = os.environ.get('SPY_DATA_DIR', r'C:/Users/Motohiro/Downloads/')
# CSV file that the notebook reads with pd.read_csv().
PATH_FILE = os.path.join(PATH_ROOT, 'world_SPY_a.csv')
# Target file passed to model.export() at the end of the notebook.
PATH_EXPORT = os.path.join(PATH_ROOT, 'model_world_SPY_a_v3.py')

# =====

# Utils
# Reference point for the stopwatch helpers below. time.perf_counter() is a
# monotonic clock, so measured intervals are not distorted when the system
# wall clock is adjusted (which can happen with time.time()).
start_time = time.perf_counter()


def watch_restart():
    """Reset the stopwatch so the next watch_print() measures from now."""
    global start_time
    start_time = time.perf_counter()


def watch_print(title):
    """Print ``title`` followed by the seconds elapsed since the last restart.

    The elapsed time is rounded to 4 decimal places and printed as
    ``<title> <elapsed> seconds``.
    """
    # start_time is only read here, so no `global` declaration is required.
    print(title, round(time.perf_counter() - start_time, 4), 'seconds')

In [2]:
# ===== 1.0 Get Data from CSV
# Reset the stopwatch, read the CSV into a DataFrame, list its columns and
# report how long the load took.
watch_restart()

df = pd.read_csv(filepath_or_buffer=PATH_FILE)
print(df.columns)

watch_print('Get Data')

Index(['Id', 'PriceDate', 'YSpydiffActual', 'YSpyactual', 'YSpypredicted',
       'YSpypredictedProb', 'YDiadiffActual', 'YDiaactual', 'YDiapredicted',
       'YDiapredictedProb', 'XFtsediffNorm', 'XStoxxdiffNorm',
       'XGdaxidiffNorm', 'XSsmidiffNorm', 'XN225diffNorm', 'XAxjodiffNorm',
       'XHsidiffNorm'],
      dtype='object')
Get Data 0.0259 seconds


In [3]:
# Split data
def get_x_and_y(df, target_col='YSpydiffActual', feature_cols=None):
    """Build the feature frame and binary label vector from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the target column and every feature column.
    target_col : str, optional
        Column whose sign defines the label. Defaults to ``'YSpydiffActual'``.
    feature_cols : sequence of str, optional
        Columns to use as features. When omitted, the seven ``X...diffNorm``
        index columns are used.

    Returns
    -------
    X : pandas.DataFrame
        ``df`` restricted to the feature columns, in the order given.
    y : numpy.ndarray
        1 where the target value is strictly greater than 0, otherwise 0.

    Raises
    ------
    KeyError
        If the target column or any feature column is missing from ``df``.
    """
    if feature_cols is None:
        feature_cols = [
            'XFtsediffNorm',
            'XStoxxdiffNorm',
            'XGdaxidiffNorm',
            'XSsmidiffNorm',
            'XN225diffNorm',
            'XAxjodiffNorm',
            'XHsidiffNorm',
        ]
    # Zero and NaN targets compare as not-greater-than-zero and map to 0.
    y = np.where(df[target_col] > 0, 1, 0)
    # list(...) lets callers pass a tuple without triggering tuple-key lookup.
    X = df[list(feature_cols)]
    return X, y

# Build features/labels, then hold out 20% of the rows as a test set.
# random_state fixes the shuffle so the split is the same on every run.
X, y = get_x_and_y(df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Quick sanity check of the inputs and of each partition's dimensions.
print('X', X.head())
print('y', y)
for label, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(label + ' shape', part.shape)

X    XFtsediffNorm  XStoxxdiffNorm  XGdaxidiffNorm  XSsmidiffNorm  \
0         0.0144          0.0166          0.0150         0.0142   
1         0.0029          0.0013          0.0034         0.0088   
2         0.0058          0.0201          0.0196         0.0071   
3        -0.0025         -0.0016         -0.0049        -0.0028   
4        -0.0048         -0.0115         -0.0097        -0.0028   

   XN225diffNorm  XAxjodiffNorm  XHsidiffNorm  
0        -0.0203         0.0011        0.0008  
1         0.0264         0.0114        0.0137  
2        -0.0037        -0.0005       -0.0030  
3         0.0101         0.0054       -0.0013  
4         0.0063         0.0086       -0.0068  
y [1 0 1 ... 1 0 0]
X_train shape (867, 7)
X_test shape (217, 7)
y_train shape (867,)
y_test shape (217,)


In [None]:
# Pipeline search using TPOT's 'TPOT light' configuration.
# - generations is kept as an upper bound only; early_stop ends the search once
#   the internal CV score has not improved for that many generations, so the
#   cell can actually finish instead of working through all 10000 generations.
# - random_state seeds the search (same seed as the train/test split) so that
#   repeated runs on the same data are comparable.
model = TPOTClassifier(
    generations=10000,
    early_stop=100,
    random_state=42,
    verbosity=2,
    n_jobs=-1,
    config_dict='TPOT light')
model.fit(X_train, y_train)
# Score of the fitted model on the held-out test split.
print('Score', model.score(X_test, y_test))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=1000100.0, style=ProgressStyl…

Generation 1 - Current best internal CV score: 0.5754700684339912
Generation 2 - Current best internal CV score: 0.5754700684339912
Generation 3 - Current best internal CV score: 0.5754700684339912
Generation 4 - Current best internal CV score: 0.5754700684339912
Generation 5 - Current best internal CV score: 0.5754700684339912
Generation 6 - Current best internal CV score: 0.578931632449671
Generation 7 - Current best internal CV score: 0.578931632449671
Generation 8 - Current best internal CV score: 0.578931632449671
Generation 9 - Current best internal CV score: 0.578931632449671
Generation 10 - Current best internal CV score: 0.578931632449671
Generation 11 - Current best internal CV score: 0.578931632449671
Generation 12 - Current best internal CV score: 0.5812637034084114
Generation 13 - Current best internal CV score: 0.5812637034084114
Generation 14 - Current best internal CV score: 0.5812637034084114
Generation 15 - Current best internal CV score: 0.5812637034084114
Generation

Generation 124 - Current best internal CV score: 0.5927579562819746
Generation 125 - Current best internal CV score: 0.5927579562819746
Generation 126 - Current best internal CV score: 0.5927579562819746
Generation 127 - Current best internal CV score: 0.5927579562819746
Generation 128 - Current best internal CV score: 0.5927579562819746
Generation 129 - Current best internal CV score: 0.5927579562819746
Generation 130 - Current best internal CV score: 0.5927579562819746
Generation 131 - Current best internal CV score: 0.5927579562819746
Generation 132 - Current best internal CV score: 0.5927579562819746
Generation 133 - Current best internal CV score: 0.5927579562819746
Generation 134 - Current best internal CV score: 0.5927579562819746
Generation 135 - Current best internal CV score: 0.5927579562819746
Generation 136 - Current best internal CV score: 0.5927579562819746
Generation 137 - Current best internal CV score: 0.5927579562819746
Generation 138 - Current best internal CV score:

In [None]:
# Write the fitted TPOT model out as a Python file at PATH_EXPORT.
model.export(output_file_name=PATH_EXPORT)