In [1]:
import math
import os

import catboost as cb
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from IPython.display import display
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV


pd.set_option('display.max_columns', 50)
sns.set(style='darkgrid')

DATA_DIR = os.path.join(os.getcwd(), '../data')

In [2]:
TARGET_FILE = os.path.join(DATA_DIR, 'GOOG.csv')
df = pd.read_csv(TARGET_FILE)
df.index = pd.to_datetime(df['Date'])
df.drop(columns=['Date'], inplace=True)

stock_start = df.index.min().strftime('%Y-%m-%d')  # start date of stock entries
stock_end = df.index.max().strftime('%Y-%m-%d')  # end date of stock entries
biz_dates = pd.date_range(start=stock_start, end=stock_end, freq='B')  # business days without considering any holidays

# Find dates which appear in business days list but not in stock entries.
# This will give holidays.
custom_holidays = biz_dates.difference(df.index)

# Create a custom business day frequency using custom holidays.
custom_freq = pd.tseries.offsets.CustomBusinessDay(holidays=custom_holidays)

# Set frequency of datetime index to custom business day frequency.
df = df.asfreq(custom_freq)

print('Dataframe contains null value:', df.isnull().values.any())
display(df)

Dataframe contains null value: False


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,49.813286,51.835709,47.800831,49.982655,49.982655,44871300
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942800
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342800
2004-08-24,55.412300,55.591629,51.591621,52.239193,52.239193,15319700
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232100
...,...,...,...,...,...,...
2020-04-20,1271.000000,1281.599976,1261.369995,1266.609985,1266.609985,1695500
2020-04-21,1247.000000,1254.270020,1209.709961,1216.339966,1216.339966,2153000
2020-04-22,1245.540039,1285.613037,1242.000000,1263.209961,1263.209961,2093100
2020-04-23,1271.550049,1293.310059,1265.670044,1276.310059,1276.310059,1566200


In [3]:
# Adjust open, high, low and close prices

k = df['Adj Close'] / df['Close']
df['Open'] = k * df['Open']
df['High'] = k * df['High']
df['Low'] = k * df['Low']
df['Close'] = k * df['Close']
display(df)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-08-19,49.813286,51.835709,47.800831,49.982655,49.982655,44871300
2004-08-20,50.316402,54.336334,50.062355,53.952770,53.952770,22942800
2004-08-23,55.168217,56.528118,54.321388,54.495735,54.495735,18342800
2004-08-24,55.412300,55.591629,51.591621,52.239193,52.239193,15319700
2004-08-25,52.284027,53.798351,51.746044,52.802086,52.802086,9232100
...,...,...,...,...,...,...
2020-04-20,1271.000000,1281.599976,1261.369995,1266.609985,1266.609985,1695500
2020-04-21,1247.000000,1254.270020,1209.709961,1216.339966,1216.339966,2153000
2020-04-22,1245.540039,1285.613037,1242.000000,1263.209961,1263.209961,2093100
2020-04-23,1271.550049,1293.310059,1265.670044,1276.310059,1276.310059,1566200


In [4]:
# Add some more features

df['High-Low Pct Change'] = (df['High'] - df['Low']) / df['Low']
df['Price Pct Change'] = (df['Close'] - df['Open']) / df['Open']

n_forecast_days = 5  # Forecast 5 days data
feature_cols = ['Open', 'High', 'Low', 'Close', 'High-Low Pct Change', 'Price Pct Change']
target_col = 'Forecasted Close'
df['Forecasted Close'] = df['Close'].shift(-n_forecast_days)
df = df[feature_cols + [target_col]].copy()

corr = df.dropna().drop(target_col, axis=1).apply(lambda x: x.corr(df[target_col]))

display(df)
display(corr)

Unnamed: 0_level_0,Open,High,Low,Close,High-Low Pct Change,Price Pct Change,Forecasted Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-08-19,49.813286,51.835709,47.800831,49.982655,0.084410,0.003400,53.753517
2004-08-20,50.316402,54.336334,50.062355,53.952770,0.085373,0.072270,52.876804
2004-08-23,55.168217,56.528118,54.321388,54.495735,0.040624,-0.012190,50.814533
2004-08-24,55.412300,55.591629,51.591621,52.239193,0.077532,-0.057264,50.993862
2004-08-25,52.284027,53.798351,51.746044,52.802086,0.039661,0.009909,49.937820
...,...,...,...,...,...,...,...
2020-04-20,1271.000000,1281.599976,1261.369995,1266.609985,0.016038,-0.003454,
2020-04-21,1247.000000,1254.270020,1209.709961,1216.339966,0.036835,-0.024587,
2020-04-22,1245.540039,1285.613037,1242.000000,1263.209961,0.035115,0.014187,
2020-04-23,1271.550049,1293.310059,1265.670044,1276.310059,0.021838,0.003743,


Open                   0.997877
High                   0.997942
Low                    0.998042
Close                  0.998073
High-Low Pct Change   -0.223818
Price Pct Change       0.028182
dtype: float64

In [5]:
# Drop features that have low correlation with target variable

feature_cols = ['Open', 'High', 'Low', 'Close']
df.drop(['High-Low Pct Change', 'Price Pct Change'], axis=1, inplace=True)

In [6]:
# Prepare train/test data

X = np.array(df[feature_cols].copy())
X_future = X[-n_forecast_days:]
X = X[:-n_forecast_days]
y = np.array(df[target_col])[:-n_forecast_days]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
display(X_future)

array([[1271.      , 1281.599976, 1261.369995, 1266.609985],
       [1247.      , 1254.27002 , 1209.709961, 1216.339966],
       [1245.540039, 1285.613037, 1242.      , 1263.209961],
       [1271.550049, 1293.310059, 1265.670044, 1276.310059],
       [1261.170044, 1280.400024, 1249.449951, 1279.310059]])

In [7]:
# Use a boosting algorithms

def cboost(X_train, y_train, X_test, y_test, evaluate=False):
    estimator = cb.CatBoostRegressor(num_leaves=31)
    fit_params={
        'early_stopping_rounds': 42,
        'eval_set': (X_test, y_test)
    }
    param_grid = {
        'boosting_type': ['Ordered', 'Plain'],
        'learning_rate': [0.01, 0.05, 0.1, 1],
        'n_estimators': [20, 40, 60, 80, 100],
        'verbose': [0]
    }
    gbm = GridSearchCV(estimator, param_grid, cv=3, n_jobs=3)
    if evaluate:
        gbm.fit(X_train, y_train, **fit_params)
    else:
        gbm.fit(X_train, y_train)
    print('CatBoost ====================================')
    print('Best parameters found by grid search are:', gbm.best_params_)
    y_pred = gbm.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print('The mse of prediction is:', mse)
    print('The rmse of prediction is:', mse ** 0.5)
    y_future = gbm.predict(X_future)
    print('Forecasts:', y_future)


def lgboost(X_train, y_train, X_test, y_test, evaluate=False):
    estimator = lgb.LGBMRegressor(num_leaves=31)
    fit_params={
        'early_stopping_rounds': 5,
        'eval_set': (X_test, y_test),
        'eval_metric': 'l1'
    }
    param_grid = {
        'boosting_type': ['gbdt', 'rf', 'goss', 'dart'],
        'learning_rate': [0.01, 0.05, 0.1, 1],
        'n_estimators': [20, 40, 60, 80, 100],
        'verbose': [0]
    }
    gbm = GridSearchCV(estimator, param_grid, cv=3, n_jobs=3)
    if evaluate:
        gbm.fit(X_train, y_train, **fit_params)
    else:
        gbm.fit(X_train, y_train)
    print('LightGBM ====================================')
    print('Best parameters found by grid search are:', gbm.best_params_)
    y_pred = gbm.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print('The mse of prediction is:', mse)
    print('The rmse of prediction is:', mse ** 0.5)
    y_future = gbm.predict(X_future)
    print('Forecasts:', y_future)


def xgboost(X_train, y_train, X_test, y_test, evaluate=False):
    estimator = xgb.XGBRegressor(num_leaves=31)
    fit_params={
        'early_stopping_rounds': 300,
        'eval_set': [(X_train, y_train), (X_test, y_test)]
    }
    param_grid = {
        'booster': ['gbtree', 'dart'],
        'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 1],
        'n_estimators': [20, 40, 60, 80, 100],
        'verbose': [0]
    }
    gbm = GridSearchCV(estimator, param_grid, cv=3, n_jobs=3)
    if evaluate:
        gbm.fit(X_train, y_train, verbose=0, **fit_params)
    else:
        gbm.fit(X_train, y_train, verbose=0)
    print('XGBoost  ====================================')
    print('Best parameters found by grid search are:', gbm.best_params_)
    y_pred = gbm.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    print('The mse of prediction is:', mse)
    print('The rmse of prediction is:', mse ** 0.5)
    y_future = gbm.predict(X_future)
    print('Forecasts:', y_future)

# parameters
# params = {
#     'boosting_type': 'rf',
#     'objective': 'regression',
#     'metric': {'l2', 'l1'},
#     'num_leaves': 31,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'verbose': 0
# }

cboost(X_train, y_train, X_test, y_test)
lgboost(X_train, y_train, X_test, y_test)
xgboost(X_train, y_train, X_test, y_test)

Best parameters found by grid search are: {'boosting_type': 'Plain', 'learning_rate': 0.1, 'n_estimators': 100, 'verbose': 0}
The mse of prediction is: 521.374112631005
The rmse of prediction is: 22.833618036373583
Forecasts: [1243.45845599 1208.02361482 1241.98695582 1243.45845599 1245.42697788]
Best parameters found by grid search are: {'boosting_type': 'gbdt', 'learning_rate': 0.05, 'n_estimators': 100, 'verbose': 0}
The mse of prediction is: 481.55157958667525
The rmse of prediction is: 21.944283528670404
Forecasts: [1220.49023875 1191.8021642  1218.90742328 1255.11974705 1218.18608032]
Best parameters found by grid search are: {'booster': 'gbtree', 'learning_rate': 0.05, 'n_estimators': 100, 'verbose': 0}
The mse of prediction is: 464.5631218730589
The rmse of prediction is: 21.553726403410128
Forecasts: [1210.7921 1160.9059 1210.4154 1209.554  1210.7283]
