In [1]:
import pandas_datareader as pdr
from utils import *
import os
import time
import numpy as np
import datetime
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
import matplotlib.dates as mdates
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import BatchNormalization,Bidirectional,TimeDistributed,Dense,Input,Conv2D,MaxPool1D,Activation,Dropout,Flatten,Conv1D,concatenate,Embedding,LSTM
# from tensorflow.compat.v1.keras.layers import CuDNNLSTM
import tensorflow as tf
from sklearn.externals import joblib 
from tensorflow.keras.initializers import he_normal,glorot_normal,he_uniform
from tensorflow.keras.utils import to_categorical

import math

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from sklearn.metrics import accuracy_score

# import tensorflow as tf
# physical_devices = tf.config.list_physical_devices('GPU')
# tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)



Data Collection

In [2]:
related_companies = ['AMD','INTC','NVDA','MSFT','QCOM','TSM','SPY']

List of companies that are realted to AMD

Methodology:
    1) The training will be done to predict each company stock price for each loop
    
        Ex: Loop 1:
            Target Varaible : 'AMD': AMD
            Data : 'INTC' : Intel ,'NVDA': Nvidia,'MSFT' : Microsoft,'QCOM': Qualcom,'TSM': TSMC,'SPY' : SPDR S&P 500 Trust ETF
            Loop 2:
            Target Varaible : 'INTC' : Intel
            Data :  'AMD': AMD,'NVDA': Nvidia,'MSFT' : Microsoft,'QCOM': Qualcom,'TSM': TSMC,'SPY' : SPDR S&P 500 Trust ETF
        And so on
    2) We have considered training data only till May 31st 2020.
       Test data from Jan 1st to May 31 2020.
        But have not given June 2020 as input instead we run the model and get June 1st 2020 data for each company 
        and then 
        we predict June 2nd 2020 data.
        
        This way we can run the model to simulate the future inputs to the model and predict the future when the data is not available.

In [3]:
def model_trainer(list_of_companies):
    for company in list_of_companies:
        # Fetching data from Jan 1st 2010 to Dec 31st 2020
        # Please note that we will training a seperate model for each company
        for stock_name in list_of_companies:
            if not os.path.isfile("Data_train/"+ stock_name + '.csv'):
                df = pdr.get_data_tiingo(symbols=[stock_name],start='1/1/2010',end = '05/31/2020', api_key='bc41cd89fc117b78bbde26797c2f1b72b7aa83c2')
                df.to_csv( "Data_train/"+ stock_name + '.csv')
                df = pd.read_csv("Data_train/"+ stock_name + '.csv')
                df['date'] = pd.to_datetime(df['date'])
                df['date'] = df['date'].apply(lambda x: x.date())
                df.to_csv( "Data_train/"+ stock_name + '.csv')
            elif stock_name == company:
                company_df = pd.read_csv('Data_train/' + company + '.csv')
            else:
                pass
        # Reading the data for target company
        company_df = pd.read_csv('Data_train/' + company + '.csv',index_col=0)
        
        company_df = company_df[['date','close']]
        def technical_indicators(df):
            # Moving Average
            df['week_moving_avg'] = df['close'].rolling(window=7,min_periods=1).mean()
            df['21d_moving_avg'] = df['close'].rolling(window=21,min_periods=1).mean()

            # MACD
            df['12_ema'] = df[['close']].ewm(span=12).mean()
            df['26_ema'] = df[['close']].ewm(span=26).mean()
            df['macd'] = df['12_ema'] - df['26_ema']

            # Bollinger Bands
            df['20_day_std'] = df['close'].rolling(window=20,min_periods=0).std()
            df['upper_band'] = df['21d_moving_avg'] + (df['20_day_std']*2)
            df['lower_band'] = df['21d_moving_avg'] - (df['20_day_std']*2)

            return df
        # Adding technical_indicators for the dataset
        company_df = technical_indicators(company_df)
                
        # Adding forier transform feature to the data
        close_fft = np.fft.fft(np.asarray(company_df['close'].tolist()))
        fft_df = pd.DataFrame({'fft':close_fft})
        fft_df['absolute'] = fft_df['fft'].apply(lambda x: np.abs(x))
        fft_df['angle'] = fft_df['fft'].apply(lambda x: np.angle(x))
        
        
        company_df = pd.concat([company_df,fft_df],axis=1)
        
        company_df = company_df.drop(['date'],axis=1)
        company_df = company_df.drop(['fft'],axis=1)
        
        # Adding other related companies closing period for the data
        for others in list_of_companies:
            if others == company:
                pass
            else:
                temp_df = pd.read_csv("Data_train/"+ others + '.csv',index_col=0)
                company_df[others] = temp_df['close']
                
        
        company_df['target'] = company_df['close']
        company_df = company_df.drop(['close'],axis=1)
        
        # Defining scaling function
        scale = MinMaxScaler()
        
        company_df_scale = scale.fit_transform(company_df)
        company_df_scale =  pd.DataFrame(company_df_scale,columns=company_df.columns)
        # Saving scalar for production purposes
        joblib.dump(scale,'min_max_scaler_' + company + '.pkl')
        
        # This function will convert the data into T-20 : T+70 per row
        def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
            n_vars = 1 if type(data) is list else data.shape[1]
            df = pd.DataFrame(data)
            cols, names = list(), list()
            # input sequence (t-n, ... t-1)
            for i in range(n_in, 0, -1):
                cols.append(df.shift(i))
                names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
            # forecast sequence (t, t+1, ... t+n)
            for i in range(0, n_out):
                cols.append(df.shift(-i))
                if i == 0:
                    names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
                else:
                    names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
            # put it all together
            agg = pd.concat(cols, axis=1)
            agg.columns = names
            # drop rows with NaN values
            if dropnan:
                agg.dropna(inplace=True)
            return agg
        
        print(company_df_scale.shape)
        time_series_data = series_to_supervised(company_df_scale, 20,50)
        # Model Parameters
        past_time = 70
        no_of_features = 17
        observation_set = past_time * no_of_features
        print(time_series_data.shape)
        train_X, train_y = time_series_data.values[:, :observation_set], time_series_data.values[:, -1]
        
        train_X = train_X.reshape(train_X.shape[0], past_time,no_of_features)
        
        tf.keras.backend.clear_session()

        # Model 
        model = Sequential()
        model.add(LSTM(500, input_shape=(train_X.shape[1], train_X.shape[2])))
        model.add(Dense(1))
        model.compile(loss='mae', optimizer='adam',metrics=['mse'])
        
        # Training model for stock price given few other related companies data
        model.fit(x=train_X,y=train_y,epochs=10,verbose=0)
        
        # Saving the model for production purposes
        model.save('time_series_model_' + company + '.h5')
        
        print(company + ' model saved')

In [4]:
model_trainer(related_companies)

(2619, 17)
(2549, 1190)
AMD model saved
(2619, 17)
(2549, 1190)
INTC model saved
(2619, 17)
(2549, 1190)
NVDA model saved
(2619, 17)
(2549, 1190)
MSFT model saved
(2619, 17)
(2549, 1190)
QCOM model saved
(2619, 17)
(2549, 1190)
TSM model saved
(2619, 17)
(2549, 1190)
SPY model saved
