In [1]:
import pandas as pd
import datetime
import yfinance as yf

In [2]:
# Retrieve data

In [3]:
def get_ticker_data(symbol, period):
    '''
        Fetch the price history of one security through yfinance.

        Arguments:
            symbol --> Security symbol (str); upper-cased before the lookup
            period --> number of years (past x years) (int)

        Returns:
            pandas DataFrame built from the result of Ticker.history
    '''
    ticker_name = symbol.upper()
    # yfinance takes the look-back window as a string such as '10y'.
    history_window = f'{period!s}y'
    print(history_window)
    history = yf.Ticker(ticker_name).history(period=history_window)
    return pd.DataFrame(history)

In [4]:
# Test dataset: request the past 10 years of NVDA history and list its columns
data = get_ticker_data('NVDA', 10)
data.columns

10y


Index(['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits'], dtype='object')

In [5]:
# Show the raw history with its Date index left in place. A bare
# data.reset_index() call only returns a new frame (it is not in-place), so
# calling it without using the result has no effect on `data`.
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2012-02-09,3.791564,3.812233,3.736448,3.743337,56194400,0.0,0.0
2012-02-10,3.706594,3.720373,3.630809,3.651477,44026800,0.0,0.0
2012-02-13,3.743337,3.757116,3.633104,3.708889,61066800,0.0,0.0
2012-02-14,3.708890,3.761710,3.676739,3.729559,45375600,0.0,0.0
2012-02-15,3.764007,3.881130,3.704297,3.713483,111085600,0.0,0.0
...,...,...,...,...,...,...,...
2022-02-02,257.940002,258.170013,245.529999,252.419998,54341900,0.0,0.0
2022-02-03,244.580002,250.770004,237.800003,239.479996,41017800,0.0,0.0
2022-02-04,239.720001,246.350006,236.320007,243.190002,35432800,0.0,0.0
2022-02-07,243.720001,251.820007,242.020004,247.279999,37686800,0.0,0.0


In [6]:
# Prepare Data

In [9]:
def clean_data(df):
    '''
        Keep only the OHLCV price columns and discard the original index.

        Arguments:
            df --> DataFrame containing at least the columns
                   'Open', 'High', 'Low', 'Close' and 'Volume'

        Returns:
            New DataFrame with those five columns (in that order) and a
            default 0..n-1 integer index. The input frame is not modified.

        Raises:
            KeyError if any of the five columns is missing.
    '''
    columns = ['Open', 'High', 'Low', 'Close', 'Volume']
    # Copy whole columns by position instead of looping over df[col][i]:
    # an integer key on a Series is a label lookup whenever the index holds
    # integers, and the positional fallback on other indexes is deprecated.
    return pd.DataFrame({column: df[column].to_numpy() for column in columns})

In [10]:
# Replace the raw history with the OHLCV-only frame returned by clean_data
data = clean_data(data)

In [11]:
# Inspect the cleaned frame
data

Unnamed: 0,Open,High,Low,Close,Volume
0,3.791564,3.812233,3.736448,3.743337,56194400
1,3.706594,3.720373,3.630809,3.651477,44026800
2,3.743337,3.757116,3.633104,3.708889,61066800
3,3.708890,3.761710,3.676739,3.729559,45375600
4,3.764007,3.881130,3.704297,3.713483,111085600
...,...,...,...,...,...
2512,257.940002,258.170013,245.529999,252.419998,54341900
2513,244.580002,250.770004,237.800003,239.479996,41017800
2514,239.720001,246.350006,236.320007,243.190002,35432800
2515,243.720001,251.820007,242.020004,247.279999,37686800


In [12]:
def normalize_data(df, columns):
    '''
        Min-max scale the given columns to the range [0, 1].

        Each value becomes (value - column_min) / (column_max - column_min).
        Missing values are ignored when finding the min/max and stay missing.
        A column whose values are all equal is mapped to 0.0 instead of
        dividing by zero.

        Arguments:
            df      --> DataFrame holding the columns (modified in place)
            columns --> iterable of column names to rescale

        Returns:
            The same DataFrame object that was passed in.
    '''
    for column in columns:
        values = df[column]
        min_value = values.min()
        value_range = values.max() - min_value
        if value_range == 0:
            # Constant column: the formula would divide by zero, so every
            # value is mapped to 0.0 (the convention MinMaxScaler also uses).
            scaled = (values - min_value) * 0.0
        else:
            scaled = (values - min_value) / value_range
        # Assign positionally so the result does not depend on index labels.
        df[column] = scaled.to_numpy()
    return df
            

In [14]:
# Min-max scale all five columns; normalize_data rewrites the columns of the frame it is given
data = normalize_data(data, ['Open', 'High', 'Low', 'Close', 'Volume'])

In [15]:
# Inspect the scaled frame
data

Unnamed: 0,Open,High,Low,Close,Volume
0,0.003416,0.003320,0.003700,0.003412,0.141557
1,0.003161,0.003053,0.003368,0.003135,0.108197
2,0.003271,0.003160,0.003375,0.003308,0.154916
3,0.003168,0.003173,0.003512,0.003371,0.111895
4,0.003333,0.003521,0.003599,0.003322,0.292056
...,...,...,...,...,...
2512,0.767834,0.743256,0.764631,0.754462,0.136478
2513,0.727650,0.721729,0.740305,0.715380,0.099947
2514,0.713033,0.708871,0.735647,0.726585,0.084634
2515,0.725064,0.724783,0.753585,0.738938,0.090814


In [26]:
# Scratch view: a one-column frame holding only the 'Open' values
x = data[['Open']]
x

Unnamed: 0,Open
0,0.003416
1,0.003161
2,0.003271
3,0.003168
4,0.003333
...,...
2512,0.767834
2513,0.727650
2514,0.713033
2515,0.725064


In [16]:
# Model

In [21]:
# LSTM

In [19]:
import math
import numpy as np
from IPython.display import display
from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [20]:
def mean_squared_error(outputs, targets):
    '''
        Mean of the squared differences between predictions and targets.

        Arguments:
            outputs --> indexable sequence of predicted values
            targets --> indexable sequence of expected values; element i is
                        compared with outputs[i]

        Returns:
            The mean squared error as a float.

        Raises:
            ValueError if outputs is empty (the mean is undefined).
    '''
    count = len(outputs)
    if count == 0:
        raise ValueError('mean_squared_error needs at least one output value')

    squares_sum = 0
    for i in range(count):
        # The accumulated term must read from `outputs` (the parameter).
        squares_sum += (outputs[i] - targets[i]) ** 2
    return float(squares_sum / count)

In [45]:
def prepare_sets(df):
    '''
        Separate a frame into a feature matrix and a target column.

        Arguments:
            df --> DataFrame that contains a 'Close' column

        Returns:
            x --> numpy array of every column except 'Close'
            y --> numpy array of the 'Close' values, one value per row
                  (two-dimensional, a single column)
    '''
    # Selecting with a list keeps the target two-dimensional: (rows, 1).
    targets = df[['Close']].to_numpy()
    features = df.drop(columns=['Close']).to_numpy()
    return features, targets

In [47]:
# Split the frame into features (every column except 'Close') and the 'Close' target
x, y = prepare_sets(data)
x

array([[0.00341631, 0.00332029, 0.00370034, 0.14155739],
       [0.00316074, 0.00305307, 0.00336789, 0.10819667],
       [0.00327125, 0.00315996, 0.00337512, 0.15491637],
       ...,
       [0.71303262, 0.70887092, 0.73564728, 0.08463394],
       [0.72506366, 0.72478334, 0.75358533, 0.09081388],
       [0.72467264, 0.72617967, 0.74659893, 0.09962674]])

In [64]:
def split_data(x, y, prediction_period=1, sequence_length=50, test_data_size=0.2):
    '''
        Split aligned inputs/targets, in row order, into train and test parts.

        Targets are shifted by prediction_period, so element i of an input
        part is paired with the y value prediction_period rows later.

        Arguments:
            x --> inputs, sliceable along the first axis
            y --> targets, same length as x
            prediction_period --> how many rows ahead the target lies (int >= 0)
            sequence_length   --> extra rows added to the test part on top of
                                  the test share (presumably so that windows
                                  of this length can be built from it)
            test_data_size    --> fraction of the rows reserved for testing

        Returns:
            X_train, y_train, X_test, y_test

        Raises:
            ValueError if x and y differ in length, prediction_period is
            negative, or there are too few rows for the requested split.
    '''
    total = len(x)
    if len(y) != total:
        raise ValueError('x and y must have the same length')
    if prediction_period < 0:
        raise ValueError('prediction_period must be >= 0')

    test_data_cut = int(test_data_size * total) + sequence_length + 1
    # Explicit, non-negative boundaries: negative offsets such as x[-cut:-0]
    # collapse to an empty slice and wrap around when the data is too short.
    test_start = total - test_data_cut
    train_end = test_start - prediction_period
    if train_end < 0:
        raise ValueError(
            'not enough rows (%d) for a test section of %d rows plus a '
            'prediction period of %d' % (total, test_data_cut, prediction_period)
        )

    # Training data
    X_train = x[:train_end]
    y_train = y[prediction_period:test_start]

    # Test data
    X_test = x[test_start:total - prediction_period]
    y_test = y[test_start + prediction_period:]

    return X_train, y_train, X_test, y_test

In [65]:
# Split with the defaults (prediction_period=1, sequence_length=50, test_data_size=0.2)
X_train, y_train, X_test, y_test = split_data(x, y)
len(y_test)

553

In [58]:
def prepare_sequences(inputs, outputs, sequence_length=50):
    '''
        Cut inputs into overlapping windows and pick one target per window.

        Window k holds inputs[k : k + sequence_length] and is paired with
        outputs[k + sequence_length]. There are
        len(inputs) - sequence_length windows (none if that is <= 0).

        Arguments:
            inputs  --> sliceable sequence of input rows
            outputs --> indexable sequence of targets
            sequence_length --> number of rows per window

        Returns:
            (windows, targets), both converted with np.asarray
    '''
    # NOTE(review): the target is taken one position past the last row of the
    # window; confirm this is the intended horizon given how outputs is shifted.
    window_count = len(inputs) - sequence_length
    windows = [inputs[start:start + sequence_length] for start in range(window_count)]
    targets = [outputs[start + sequence_length] for start in range(window_count)]
    return np.asarray(windows), np.asarray(targets)

In [62]:
# Build the sliding windows (default length 50) for the train and test parts
train_inputs, train_outputs = prepare_sequences(X_train, y_train)
test_inputs, test_outputs = prepare_sequences(X_test, y_test)
len(test_inputs)

503

In [71]:
# Shape is (number of windows, rows per window, values per row)
test_inputs.shape

(503, 50, 4)