In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_datareader as pdr
import yfinance as yf

# Instruments to download, by ticker: GOOG (Alphabet/Google), MSFT (Microsoft)
# and SPY (an ETF that tracks the S&P 500).
tickers = ['GOOG', 'MSFT', 'SPY']

# Requested download window: 2012-10-01 through 2022-10-01 (ISO dates).
start_date = '2012-10-01'
end_date = '2022-10-01'

# Alternative kept for reference: 15-minute GOOG bars fetched through yfinance.
#df = yf.Ticker('goog').history(interval='15m', start='2012-09-10', end='2022-09-10')
# NOTE(review): pandas_datareader's 'yahoo' source has been reported to fail
# against current Yahoo endpoints — confirm this call still works, or switch to
# the already-imported yfinance.
df = pdr.DataReader(tickers, 'yahoo', start_date, end_date)
df

In [None]:
# Build a gap-free table of closing prices on a business-day calendar.
#
# The 'Close' column group of the download is selected (presumably one column
# per ticker, indexed by date).
# NOTE(review): an earlier comment described these as "adjusted" prices, but
# 'Close' is what is selected — confirm whether 'Adj Close' was intended.

# Every weekday (business-day frequency) between start_date and end_date.
all_weekdays = pd.date_range(start=start_date, end=end_date, freq='B')

# Steps, in order:
#   1. reindex  - align prices to the weekday calendar; weekdays with no quote
#                 become NaN rows.
#   2. ffill    - carry the latest available price forward into those gaps
#                 (replaces the deprecated fillna(method='ffill')).
#   3. dropna   - discard rows that are still NaN, i.e. leading dates with no
#                 earlier price to carry forward.
# The chain returns a new frame at each step instead of mutating in place, so
# re-running this cell always produces the same `close`.
close = (
    df['Close']
    .reindex(all_weekdays)
    .ffill()
    .dropna()
)

close.describe()

In [None]:
def visualize_equity(df, index, short_window=20, long_window=100):
    """Plot one column of a price table together with two rolling means.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are price series; the row index is used as the x axis.
    index : column label
        Column of ``df`` to plot (for example ``'MSFT'``). Also used as the
        legend label of the raw series.
    short_window : int, default 20
        Number of rows in the shorter rolling mean.
    long_window : int, default 100
        Number of rows in the longer rolling mean.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Select the requested column as a Series.
    prices = df.loc[:, index]

    # Rolling means; each is NaN until a full window of rows is available.
    short_rolling = prices.rolling(window=short_window).mean()
    long_rolling = prices.rolling(window=long_window).mean()

    fig, ax = plt.subplots(figsize=(16,9))

    ax.plot(prices.index, prices, label=index)
    ax.plot(short_rolling.index, short_rolling, label=f'{short_window} days rolling')
    ax.plot(long_rolling.index, long_rolling, label=f'{long_window} days rolling')

    ax.set_xlabel('Date')
    ax.set_ylabel('Adjusted closing price ($)')
    ax.legend()
    plt.show()

# Price chart with rolling means for MSFT.
visualize_equity(close, 'MSFT')

In [None]:
# Price chart with rolling means for GOOG.
visualize_equity(close, 'GOOG')

In [None]:
# Price chart with rolling means for SPY.
visualize_equity(close, 'SPY')

In [None]:
# Work on a copy so that changes made to `data` in this cell leave `close` untouched.
data=close.copy()

# Date-string parsing helper
def str_to_datetime(s):
    """Convert a 'YYYY-MM-DD' style string into a ``datetime.datetime``.

    The string is split on '-' and its first three pieces are read as year,
    month and day; any further pieces are ignored. The returned datetime has
    no time-of-day component set (midnight).
    """
    from datetime import datetime as _datetime

    pieces = s.split('-')
    # Convert lazily, one piece at a time, in year -> month -> day order.
    year, month, day = (int(pieces[position]) for position in range(3))
    return _datetime(year=year, month=month, day=day)

# Make sure `data` is indexed by real datetimes.
#
# The previous version compared the index dtype against string literals and,
# when conversion was needed, read a 'Date' column from `df` (a different
# frame) rather than from `data`. Here the dtype is checked with the pandas
# helper and the index of `data` itself is converted in place, so later
# date-based `.loc` slicing keeps working.
if pd.api.types.is_datetime64_any_dtype(data.index):
    print('Date is already Datetime')
else:
    data.index = pd.to_datetime(data.index)

In [None]:
# Display the MSFT column of the cleaned closing-price table.
close['MSFT']

In [None]:
# Start day second time around: '2021-03-25'
# NOTE(review): unclear what "second time around" refers to — looks like a
# leftover from an earlier experiment; confirm or remove.

# Single-column frame holding only the MSFT prices. This rebinds `df`, which
# held the full download earlier in the notebook.
df=pd.DataFrame(close['MSFT'])
# Number of lagged observations per sample (passed as `n` to df_to_windowed_df).
X_timesteps=3
# Date range for the windowed table; rebinds the names used for the download range.
start_date='2017-10-01'
end_date='2022-10-01'

# Phrase the data as the info considered in a daily window of time
def df_to_windowed_df(dataframe, first_date_str, last_date_str, n=3, column=None):
    """Turn a date-indexed price table into a lagged (windowed) table.

    For every row of ``dataframe`` whose date lies in
    ``[first_date_str, last_date_str]`` (both ends inclusive) one output row is
    produced, holding that date, the ``n`` values immediately before it and the
    value on the date itself.

    Parameters
    ----------
    dataframe : pandas.DataFrame or pandas.Series
        Prices indexed by datetime.
    first_date_str, last_date_str : str
        Range bounds, e.g. ``'2017-10-01'``. Neither bound has to be present in
        the index (weekends and holidays are fine); the range simply selects
        the rows that fall between them.
    n : int, default 3
        Number of lagged values (timesteps) per row.
    column : label, optional
        Column to window. When omitted, ``'Close'`` is used if present,
        otherwise the only column of a single-column frame.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``'Target Date'``, ``'Target-n'`` ... ``'Target-1'`` (oldest
        first) and ``'Target'``. ``None`` is returned, after printing an error,
        when the first date in range has fewer than ``n`` earlier rows.

    Raises
    ------
    ValueError
        If ``column`` is omitted and cannot be inferred.
    """
    # Resolve which series to window.
    if isinstance(dataframe, pd.Series):
        series = dataframe
    else:
        if column is None:
            if 'Close' in dataframe.columns:
                column = 'Close'
            elif len(dataframe.columns) == 1:
                column = dataframe.columns[0]
            else:
                raise ValueError('Cannot infer the price column; pass column=... explicitly')
        series = dataframe[column]
    # Lag arithmetic below is positional, so the rows must be in date order.
    series = series.sort_index()

    first_date = pd.Timestamp(first_date_str)
    last_date = pd.Timestamp(last_date_str)

    values = series.to_numpy()

    # Positions of the target rows. Selecting by range (instead of stepping
    # from date to date until an exact match with last_date) guarantees
    # termination even when last_date is not a row of the table.
    in_range = (series.index >= first_date) & (series.index <= last_date)
    target_positions = np.flatnonzero(in_range)

    # Only the earliest target can lack history; later ones have strictly more.
    if len(target_positions) and target_positions[0] < n:
        target_date = series.index[target_positions[0]]
        print(f'Error: Window of size {n} is too large for date {target_date}')
        return

    ret_df = pd.DataFrame({'Target Date': series.index[target_positions]})

    # 'Target-k' is the value k rows before the target row.
    for lag in range(n, 0, -1):
        ret_df[f'Target-{lag}'] = values[target_positions - lag]

    ret_df['Target'] = values[target_positions]

    return ret_df

# Build the windowed table for the configured date range and window length.
# Note: df_to_windowed_df returns None (after printing an error) when the
# window does not fit, so check the displayed result before continuing.
windowed_df = df_to_windowed_df(df,start_date,end_date,n=X_timesteps)
windowed_df

In [None]:
# Splits the df into three crucial tensor components: (the date, input, target)
def windowed_df_to_date_X_y(windowed_dataframe, number_of_y_vars=1):
    """Split a windowed table into dates, model inputs and targets.

    The table layout is: first column = dates, last ``number_of_y_vars``
    columns = targets, everything in between = lagged inputs.

    Parameters
    ----------
    windowed_dataframe : pandas.DataFrame
        Table laid out as described above.
    number_of_y_vars : int, default 1
        How many trailing columns are targets. Must be at least 1.

    Returns
    -------
    dates : numpy.ndarray, shape (rows,)
        First column, unchanged.
    X : numpy.ndarray of float32, shape (rows, timesteps, 1)
        Lagged inputs with a trailing feature axis of size 1 (one value per
        timestep).
    Y : numpy.ndarray of float32, shape (rows, number_of_y_vars)
        Target columns.

    Raises
    ------
    ValueError
        If ``number_of_y_vars`` is smaller than 1.
    """
    if number_of_y_vars < 1:
        raise ValueError('number_of_y_vars must be at least 1')

    df_as_np = windowed_dataframe.to_numpy()

    dates = df_as_np[:, 0] #dates kept in first column, not as index

    middle_matrix = df_as_np[:, 1:-number_of_y_vars] #all x (after dates, before y)
    # Each timestep carries a single value, so the feature axis is always 1.
    # (Using number_of_y_vars here made the reshape fail for any value but 1.)
    X = middle_matrix.reshape((len(dates), middle_matrix.shape[1], 1))

    Y = df_as_np[:, -number_of_y_vars:]

    return dates, X.astype(np.float32), Y.astype(np.float32)

# Split the windowed table into its date, input and target arrays.
dates, X, y = windowed_df_to_date_X_y(windowed_df)

# Display the three shapes as a quick sanity check.
dates.shape, X.shape, y.shape

In [None]:
# Split for 80% train, 10% validation, 10% test
# The cut points are row positions; the slices below keep the existing row
# order (no shuffling).
q_80 = int(len(dates) * .8)
q_90 = int(len(dates) * .9)

# First 80% of the rows.
dates_train, X_train, y_train = dates[:q_80], X[:q_80], y[:q_80]

# Next 10% for validation, remaining rows for testing.
dates_val, X_val, y_val = dates[q_80:q_90], X[q_80:q_90], y[q_80:q_90]
dates_test, X_test, y_test = dates[q_90:], X[q_90:], y[q_90:]

# Plot the target values of each subset against their dates.
plt.plot(dates_train, y_train)
plt.plot(dates_val, y_val)
plt.plot(dates_test, y_test)

plt.legend(['Train', 'Validation', 'Test'])

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras import layers

# Small LSTM regressor: one LSTM layer followed by two ReLU dense layers and a
# single linear output.
# The input shape is taken from the training array (timesteps, features) rather
# than hard-coded, so it follows whatever window length was used upstream.
model = Sequential([layers.Input(X_train.shape[1:]),
                    layers.LSTM(64),
                    layers.Dense(32, activation='relu'),
                    layers.Dense(32, activation='relu'),
                    layers.Dense(1)])

# RMSE is passed as a metric object: the string 'root_mean_squared_error' is
# not a registered metric identifier in every Keras version, whereas the class
# is available throughout and reports under that same name.
model.compile(loss='mse',
              optimizer=Adam(learning_rate=0.001),
              metrics=['mean_absolute_error', RootMeanSquaredError()])

# Train for 100 epochs, evaluating on the validation split after each epoch.
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100)

In [None]:
# Model output on the training inputs, flattened to 1-D for plotting.
train_predictions = model.predict(X_train).flatten()

# Overlay predictions and observed targets for the training split.
plt.plot(dates_train, train_predictions)
plt.plot(dates_train, y_train)
plt.legend(['Training Predictions', 'Training Observations'])

In [None]:
# Model output on the validation inputs, flattened to 1-D for plotting.
val_predictions = model.predict(X_val).flatten()

# Overlay predictions and observed targets for the validation split.
plt.plot(dates_val, val_predictions)
plt.plot(dates_val, y_val)
plt.legend(['Validation Predictions', 'Validation Observations'])

In [None]:
# Model output on the test inputs, flattened to 1-D for plotting.
test_predictions = model.predict(X_test).flatten()

# Overlay predictions and observed targets for the test split.
plt.plot(dates_test, test_predictions)
plt.plot(dates_test, y_test)
plt.legend(['Testing Predictions', 'Testing Observations'])

In [None]:
# Combined view: predictions and observations for all three splits on one plot.
# The legend entries below are matched to the lines in the order they are plotted.
plt.plot(dates_train, train_predictions)
plt.plot(dates_train, y_train)
plt.plot(dates_val, val_predictions)
plt.plot(dates_val, y_val)
plt.plot(dates_test, test_predictions)
plt.plot(dates_test, y_test)
plt.legend(['Training Predictions', 
            'Training Observations',
            'Validation Predictions', 
            'Validation Observations',
            'Testing Predictions', 
            'Testing Observations'])