In [1]:
# Based on "Stock Market Forecasting Using Machine Learning Algorithms"
# by Shunrong Shen, Haomiao Jiang, Tongda Zhang
# https://pdfs.semanticscholar.org/b68e/8d2f4d2c709bb5919b82effcb6a7bbd3db37.pdf
# Data from yahoo.com and investing.com
# !pip install TPOT

# ===== 0 Utils and Consts
import csv
import os
import time

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tpot import TPOTClassifier

# =====

# Directory that holds the input CSV and receives the exported pipeline.
# Set the SPY_DATA_DIR environment variable to run the notebook on another
# machine; the original hard-coded location is kept as the default.
PATH_ROOT = os.environ.get('SPY_DATA_DIR', r'C:/Users/Motohiro/Downloads/')
# os.path.join copes with a root given with or without a trailing separator.
PATH_FILE = os.path.join(PATH_ROOT, 'world_SPY_a.csv')
PATH_EXPORT = os.path.join(PATH_ROOT, 'model_world_SPY_a_v2.py')

# =====

# Utils
# Stopwatch state: the perf_counter reading taken at the most recent (re)start.
# perf_counter is used instead of time.time() because it is monotonic, so a
# wall-clock adjustment cannot produce a wrong or negative elapsed time.
start_time = time.perf_counter()


def watch_restart():
    """Reset the stopwatch so that elapsed time is measured from now."""
    global start_time
    start_time = time.perf_counter()


def watch_print(title):
    """Print `title`, the seconds elapsed since the last (re)start, and 'seconds'.

    The elapsed time is rounded to 4 decimal places.
    """
    # start_time is only read here, so no `global` declaration is required.
    print(title, round(time.perf_counter() - start_time, 4), 'seconds')

In [2]:
# Restart the stopwatch so the timing printed below covers only this cell.
watch_restart()

# ===== 1.0 Get Data from CSV
# Read the CSV at PATH_FILE into a DataFrame and list its column names.
df = pd.read_csv(PATH_FILE)
print(df.columns)

# Print the time elapsed since the watch_restart() call above.
watch_print('Get Data')

Index(['Id', 'PriceDate', 'YGspcdiffNorm', 'YGspcdiffActual', 'YGspcactual',
       'YGspcpredicted', 'YGspcpredictedProb', 'YSpydiffNorm',
       'YSpydiffActual', 'YSpyactual', 'YSpypredicted', 'YSpypredictedProb',
       'YDjidiffNorm', 'YDjidiffActual', 'YDjiactual', 'YDjipredicted',
       'YDjipredictedProb', 'YDiadiffNorm', 'YDiadiffActual', 'YDiaactual',
       'YDiapredicted', 'YDiapredictedProb', 'XFtsediffNorm', 'XStoxxdiffNorm',
       'XGdaxidiffNorm', 'XSsmidiffNorm', 'XN225diffNorm', 'XAxjodiffNorm',
       'XHsidiffNorm', 'XSsecdiffNorm', 'XBsesndiffNorm', 'XNiftydiffNorm',
       'XKs11diffNorm', 'XTwiidiffNorm'],
      dtype='object')
Get Data 0.0159 seconds


In [3]:
# Split data
# Feature columns selected by default.  The remaining X* columns of the CSV
# (XSsmidiffNorm, XSsecdiffNorm, XNiftydiffNorm, XKs11diffNorm, XTwiidiffNorm)
# were commented out of the original selection and stay excluded.
DEFAULT_FEATURE_COLS = (
    'XFtsediffNorm',
    'XStoxxdiffNorm',
    'XGdaxidiffNorm',
    'XN225diffNorm',
    'XAxjodiffNorm',
    'XHsidiffNorm',
    'XBsesndiffNorm',
)


def get_x_and_y(df, target_col='YSpydiffActual', feature_cols=DEFAULT_FEATURE_COLS):
    """Split a DataFrame into a feature frame and a binary label array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing `target_col` and every column in `feature_cols`.
    target_col : str, optional
        Column whose sign defines the label. Defaults to 'YSpydiffActual'.
    feature_cols : iterable of str, optional
        Columns returned as features, in this order. Defaults to
        DEFAULT_FEATURE_COLS.

    Returns
    -------
    X : pandas.DataFrame
        `df` restricted to `feature_cols`.
    y : numpy.ndarray
        1 where `target_col` is strictly positive, otherwise 0 (zero and
        NaN values therefore map to 0).

    Raises
    ------
    KeyError
        If `target_col` or any of `feature_cols` is missing from `df`.
    """
    y = np.where(df[target_col] > 0, 1, 0)
    # list(...) so that any iterable (tuple, list, Index) selects columns.
    X = df[list(feature_cols)]
    return X, y

# Build the feature frame / label array, then hold out 20% of the rows as a
# test set (fixed random_state so the split is repeatable).
# NOTE(review): train_test_split shuffles rows by default and the CSV carries
# a PriceDate column -- confirm a shuffled split is intended for dated rows.
X, y = get_x_and_y(df)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Quick look at the inputs, followed by the shape of each split.
print('X', X.head())
print('y', y)
for split_name, split in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(split_name + ' shape', split.shape)

X    XFtsediffNorm  XStoxxdiffNorm  XGdaxidiffNorm  XN225diffNorm  \
0        -0.0046         -0.0078         -0.0089            0.0   
1         0.0041          0.0089          0.0074            0.0   
2        -0.0015         -0.0022         -0.0019            0.0   
3        -0.0020         -0.0036         -0.0038            0.0   
4         0.0000          0.0033          0.0026            0.0   

   XAxjodiffNorm  XHsidiffNorm  XBsesndiffNorm  
0            0.0           0.0             0.0  
1            0.0           0.0             0.0  
2            0.0           0.0             0.0  
3            0.0           0.0             0.0  
4            0.0           0.0             0.0  
y [1 0 1 ... 1 0 1]
X_train shape (835, 7)
X_test shape (209, 7)
y_train shape (835,)
y_test shape (209,)


In [4]:
# TPOT search settings.
# 10000 generations is only an upper bound: the progress bar reports
# max=1000100 pipeline evaluations for it, so the search is stopped early once
# the best internal CV score has not improved for TPOT_EARLY_STOP generations.
TPOT_GENERATIONS = 10000
TPOT_EARLY_STOP = 50
# Fixed seed so the stochastic search can be repeated; same value as the
# random_state used for train_test_split.
TPOT_SEED = 42

model = TPOTClassifier(
    generations=TPOT_GENERATIONS,
    early_stop=TPOT_EARLY_STOP,
    random_state=TPOT_SEED,
    verbosity=2,
    n_jobs=-1,
    config_dict='TPOT light')
model.fit(X_train, y_train)
# Score on the held-out test split.
print('Score', model.score(X_test, y_test))

HBox(children=(FloatProgress(value=0.0, description='Optimization Progress', max=1000100.0, style=ProgressStyl…

Generation 1 - Current best internal CV score: 0.7413173652694611
Generation 2 - Current best internal CV score: 0.7413173652694611
Generation 3 - Current best internal CV score: 0.7413173652694611
Generation 4 - Current best internal CV score: 0.7449101796407186
Generation 5 - Current best internal CV score: 0.7461077844311378
Generation 6 - Current best internal CV score: 0.7461077844311378
Generation 7 - Current best internal CV score: 0.7461077844311378
Generation 8 - Current best internal CV score: 0.7461077844311378
Generation 9 - Current best internal CV score: 0.7461077844311378
Generation 10 - Current best internal CV score: 0.7461077844311378
Generation 11 - Current best internal CV score: 0.7461077844311378
Generation 12 - Current best internal CV score: 0.7461077844311378
Generation 13 - Current best internal CV score: 0.7461077844311378
Generation 14 - Current best internal CV score: 0.7461077844311378
Generation 15 - Current best internal CV score: 0.7461077844311378
Gene

In [5]:
# Write the outcome of the TPOT search to PATH_EXPORT (a .py file path) via
# the fitted model's export() method.
model.export(PATH_EXPORT)