In [5]:
import pymongo
import pandas as pd
from pytz import timezone
import numpy as np
import matplotlib.pyplot as plt
import time
import matplotlib.dates as mdates
from sklearn.model_selection import train_test_split
from prophet import Prophet

In [6]:
# Connect to MongoDB
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["stock_data"]
collection = db["price_data"]

# Query all documents from the collection
results = collection.find({})

# Convert the query results to a pandas DataFrame
data = pd.DataFrame(list(results))

data.head(5)

Unnamed: 0,_id,Datetime,Open,High,Low,Close,Adj Close,Volume,ticker
0,653c089343527e00cd418ebe,2023-10-02 13:30:00,92.650002,92.830002,92.269997,92.690002,92.690002,186158,MMM
1,653c089343527e00cd418ebf,2023-10-02 13:31:00,92.650002,92.75,92.459999,92.5,92.5,20994,MMM
2,653c089343527e00cd418ec0,2023-10-02 13:32:00,92.449997,92.650002,92.419998,92.644997,92.644997,13678,MMM
3,653c089343527e00cd418ec1,2023-10-02 13:33:00,92.650002,92.665001,92.5,92.540001,92.540001,14558,MMM
4,653c089343527e00cd418ec2,2023-10-02 13:34:00,92.525002,92.580002,92.419998,92.449997,92.449997,20751,MMM


In [7]:
stocks = data.copy()

# Drop unnecessary columns
columns_to_drop = ["_id", "Adj Close"]
stocks = stocks.drop(columns=columns_to_drop)

# Create Change
stocks['Change'] = stocks['Close'] - stocks['Open']

# Drop 'Open' if it's no longer needed
stocks = stocks.drop(columns=['Open', 'High', 'Low'])

# Define rolling window sizes in minutes
window_sizes = {
    '10min': 10,
    '60min': 60,
    '3hr': 3 * 60,  # 3 hours in minutes
    '1day': 24 * 60,  # 1 day in minutes
    '5day': 5 * 24 * 60  # 5 days in minutes
}

# Calculate the moving averages for all stocks
for window_name, minutes in window_sizes.items():
    stocks[f'{window_name}_MA'] = stocks.groupby('ticker')['Close'].rolling(window=minutes, min_periods=1).mean().reset_index(level=0, drop=True)

# Fill NaN values with the first available value if there are any NaNs
stocks.fillna(method='bfill', inplace=True)

# Convert 'Datetime' to Eastern Time
eastern = timezone('US/Eastern')
stocks['Datetime_ET'] = stocks['Datetime'].dt.tz_localize('UTC').dt.tz_convert(eastern)

# Extract time features from 'Datetime_ET'
stocks['hour_of_day'] = stocks['Datetime_ET'].dt.hour + stocks['Datetime_ET'].dt.minute / 60
stocks['hour_of_day_normalized'] = 2 * np.pi * stocks['hour_of_day'] / 24
stocks['hour_sin'] = np.sin(stocks['hour_of_day_normalized'])
stocks['hour_cos'] = np.cos(stocks['hour_of_day_normalized'])

# Now convert 'Datetime' to UNIX timestamp if needed
stocks['Timestamp'] = stocks['Datetime'].astype('int64') // 1e9

# Drop the original 'Datetime' and 'Datetime_ET' if they are no longer needed
stocks = stocks.drop(columns=['Datetime', 'Datetime_ET'])

  stocks.fillna(method='bfill', inplace=True)


In [8]:
stocks.sort_values(['ticker', 'Timestamp'], inplace=True)

In [9]:
# Take 5 ticker symbols to validate code

# List of tickers we want to keep
tickers_to_keep = ['AAPL', 'ADBE', 'AMZN', 'MSFT', 'NVDA']

# Create a new DataFrame with only the specified tickers
five_stocks = stocks[stocks['ticker'].isin(tickers_to_keep)].copy()

# Now, five_stocks contains only the data for the five specified tickers
five_stocks.sort_values(['ticker', 'Timestamp'], inplace=True)
five_stocks.head(5)

In [13]:
# Select data for one stock
aapl_data = five_stocks[five_stocks['ticker'] == 'AAPL']

# Prepare the DataFrame for Prophet
# Assuming 'Timestamp' is a UNIX timestamp and 'Close' is the stock's closing price
aapl_data_for_prophet = pd.DataFrame({
    'ds': pd.to_datetime(aapl_data['Timestamp'], unit='s'),
    'y': aapl_data['Close']
})

# Split your data into training and temporary data (temporary will be split into validation and test next)
train_data, temp_data = train_test_split(aapl_data_for_prophet, train_size=0.7, shuffle=False)

# Split the temporary set into validation and test set
validation_data, test_data = train_test_split(temp_data, train_size=0.5, shuffle=False)

# Fit the model on the training data
model = Prophet(daily_seasonality=False, yearly_seasonality=False, weekly_seasonality=True)
model.fit(train_data)

# Make predictions on the validation set to tune your model
# You need to create a future dataframe that includes the dates from validation_data
future_validation = model.make_future_dataframe(periods=len(validation_data), freq='D', include_history=False)
forecast_validation = model.predict(future_validation)

# After tuning model parameters with the validation set, you assess the final model performance on the test set
# Prepare the future dataframe that includes the dates from test_data
future_test = model.make_future_dataframe(periods=len(test_data), freq='D', include_history=False)
forecast_test = model.predict(future_test)


15:21:30 - cmdstanpy - INFO - Chain [1] start processing
15:21:32 - cmdstanpy - INFO - Chain [1] done processing


In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Evaluate performance on the validation set
mae_validation = mean_absolute_error(validation_data['y'], forecast_validation['yhat'])
mse_validation = mean_squared_error(validation_data['y'], forecast_validation['yhat'])
rmse_validation = np.sqrt(mse_validation)

# Print validation metrics
print(f"Validation MAE: {mae_validation}")
print(f"Validation MSE: {mse_validation}")
print(f"Validation RMSE: {rmse_validation}")

# Evaluate performance on the test set
mae_test = mean_absolute_error(test_data['y'], forecast_test['yhat'])
mse_test = mean_squared_error(test_data['y'], forecast_test['yhat'])
rmse_test = np.sqrt(mse_test)

# Print test metrics
print(f"Test MAE: {mae_test}")
print(f"Test MSE: {mse_test}")
print(f"Test RMSE: {rmse_test}")


Validation MAE: 2166.3299500785124
Validation MSE: 6251714.489712942
Validation RMSE: 2500.3428744300136
Test MAE: 2165.8281519074385
Test MSE: 6254893.008544331
Test RMSE: 2500.9784102515423


In [None]:
# Use the model to make predictions
forecast = model.predict(future)

# Plot the forecast
fig1 = model.plot(forecast)
ax1 = fig1.gca()  # get current axis
ax1.set_title('AAPL Stock Price Forecast', fontsize=16)  # set title
ax1.set_xlabel('Date', fontsize=12)  # set x-axis label
ax1.set_ylabel('Price (USD)', fontsize=12)  # set y-axis label
# Set x-axis major locator to month and formatter to DateFormatter
# Change it to mdates.DayLocator() or mdates.WeekdayLocator() if you prefer
ax1.xaxis.set_major_locator(mdates.MonthLocator())
ax1.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
plt.show()

# Plot forecast components
fig2 = model.plot_components(forecast)
axes2 = fig2.get_axes()
axes2[0].set_ylabel('Trend (USD)', fontsize=12)  # set y-axis label for trend
axes2[0].set_xlabel('Date', fontsize=12)  # set x-axis label for trend
# Adjust the x-axis labels for weekly seasonality if it's not clear
if len(axes2) > 1:
    axes2[1].set_ylabel('Weekly', fontsize=12)  # set y-axis label for weekly seasonality
    axes2[1].set_xlabel('Day of Week', fontsize=12)  # set x-axis label for weekly seasonality
# If you have yearly seasonality (which is not enabled in this case), you might also add:
# axes2[2].set_ylabel('Yearly', fontsize=12)
# axes2[2].set_xlabel('Day of Year', fontsize=12)
plt.show()