<a href="https://www.kaggle.com/code/namanle/aapl-stock-prediction?scriptVersionId=142943763" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import datetime
import os
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
# Walk the Kaggle input directory and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)


**Predicting AAPL closing price using the opening price as well as daily high, low, and volume.**

**Loading Data**

In [None]:
# Load the price table, keep a separate copy of the raw 'date' column,
# then index the frame by the parsed dates and preview the first rows.
df = pd.read_csv("../input/nyse/prices-split-adjusted.csv")
plot_x = df['date'].copy()
df = df.set_index("date")
df.index = pd.to_datetime(df.index)
df.head()

**Pre-Processing**

In [None]:
# Count the missing entries in each column.
df.isnull().sum()

Since there are no missing values, there is no need to impute or remove any data points.

In [None]:
#checking for duplicated values
# DataFrame.duplicated compares column values only and ignores the index.
# The dates live in the index here, so they are moved back into a column
# first; otherwise two different trading days that happen to share identical
# values would be counted as duplicates of each other.
df.reset_index().duplicated().sum()

In [None]:
#dropping duplicated values
# DataFrame.drop_duplicates ignores the index, and the dates live in the index
# here, so it would delete a trading day whose column values merely match
# those of another day. Include the date in the comparison so that only
# genuinely repeated records are removed.
df = df[~df.reset_index().duplicated().to_numpy()]

There are no categorical variables used as features in this dataset, so no ordinal or one-hot encoding is needed.

In [None]:
# AAPL open vs. close price, drawn against row position (not calendar dates).
aapl_rows = df[df.symbol == 'AAPL']
plt.figure(figsize=(20, 6))
for column, colour in (('open', 'red'), ('close', 'green')):
    plt.plot(aapl_rows[column].values, color=colour, label=column)
plt.title('AAPL Stock Price')
plt.xlabel('Time [Days]')
plt.ylabel('Price')
plt.legend(loc='best')

In [None]:
# AAPL daily low vs. high price, drawn against row position.
aapl_rows = df[df.symbol == 'AAPL']
plt.figure(figsize=(20, 6))
for column, colour in (('low', 'blue'), ('high', 'black')):
    plt.plot(aapl_rows[column].values, color=colour, label=column)
plt.title('AAPL Stock price')
plt.xlabel('Time [days]')
plt.ylabel('Price')
plt.legend(loc='best')

In [None]:
# AAPL traded volume, drawn against row position.
aapl_volume = df.loc[df.symbol == 'AAPL', 'volume'].values
plt.figure(figsize=(20, 6))
plt.plot(aapl_volume, color='black', label='volume')
plt.title('AAPL Stock Volume')
plt.xlabel('Time [days]')
plt.ylabel('Volume')
plt.legend(loc='best');

In [None]:
# Build the AAPL feature matrix and target, each min-max scaled with its own
# scaler so the target scaling can be inverted independently later.
# NOTE(review): both scalers are fitted on the full AAPL history here, i.e.
# before any train/test split, so test-period ranges influence the scaling.
feature_columns = ['open', 'low', 'high', 'volume']

AAPL_stock = df[df['symbol'] == 'AAPL']
AAPL_df = AAPL_stock.drop(['symbol'], axis=1)

X_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

X = AAPL_df[feature_columns].copy()
X[feature_columns] = X_scaler.fit_transform(X)

# The scaler expects a 2-D input, hence the reshape to a single column.
close_column = AAPL_df['close'].copy().values.reshape(-1, 1)
y = y_scaler.fit_transform(close_column)

In [None]:
# Hold out 20% of the rows for testing and train on the remaining 80%.
# NOTE(review): train_test_split shuffles rows by default, so the test rows
# are interleaved in time with the training rows - confirm this is intended
# for a time series.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.8,
    test_size=.2,
    random_state=42,
)

**Modeling and Prediction**

In [None]:
#Typically, when the data needs several preprocessing steps (handling missing values, encoding categorical variables, etc.),
#a pipeline is used to bundle them, and the preprocessor and the model can then be combined into a single estimator.
#None of that is necessary here.

In [None]:
# Gradient-boosted trees (XGBoost) are used as the regressor. XGBoost is not
# the best fit for a time-series problem like this one; sequence models such
# as LSTM are better suited but are left for future work.
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=.05,
    random_state=42,
)
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

trainPredict = model.predict(X_train)
testPredict = model.predict(X_test)

# Undo the target scaling so everything is expressed in prices again.
# Predictions are 1-D, so they are reshaped to a single column first.
y_test = y_scaler.inverse_transform(y_test)[:, [0]]
y_train = y_scaler.inverse_transform(y_train)[:, [0]]
testPredict = y_scaler.inverse_transform(testPredict.reshape(-1, 1))[:, [0]]
trainPredict = y_scaler.inverse_transform(trainPredict.reshape(-1, 1))[:, [0]]

**Results & Comparison**

In [None]:
# Compute the mean squared error (in price units squared) for both splits,
# then report the test figure followed by the train figure.
mse_test = mean_squared_error(y_test, testPredict)
mse_train = mean_squared_error(y_train, trainPredict)

print("Mean Squared Error of Test Data:", mse_test)
print("Mean Squared Error of Train Data:", mse_train)

In [None]:
# Keep a copy of each split's index so that the model outputs can be
# re-labelled with their dates afterwards.
traindates = X_train.index.copy()
testdates = X_test.index.copy()
#function that changes the output from the model to a dataframe with the dates as the index.
def reformat_dates(dataset, dates):
    """Return the values in ``dataset`` as a date-indexed, date-sorted DataFrame.

    The values and dates are combined directly rather than being packed into
    one numeric array, so the timestamps are never cast to the dtype of the
    values (a float32 array would otherwise round them).

    Parameters
    ----------
    dataset : array-like
        Values to label, e.g. an ``(n, 1)`` column of prices. It is flattened
        to one dimension.
    dates : array-like of datetime-like
        One date per value, in the same order as ``dataset``.

    Returns
    -------
    pandas.DataFrame
        A single ``'Close'`` column indexed by a ``DatetimeIndex`` named
        ``'Date'``, sorted in ascending date order.

    Raises
    ------
    ValueError
        If ``dataset`` and ``dates`` do not contain the same number of items.
    """
    values = np.asarray(dataset).ravel()
    # pd.Index(..., name=...) builds a new index, so the caller's index keeps
    # its own name.
    date_index = pd.Index(pd.to_datetime(dates), name="Date")
    if len(values) != len(date_index):
        raise ValueError(
            "dataset and dates must have the same length "
            f"(got {len(values)} values and {len(date_index)} dates)"
        )
    return pd.DataFrame({"Close": values}, index=date_index).sort_index()
    
# Re-label every prediction/target array with the dates of its own split.
trainPredict, y_train = (
    reformat_dates(values, traindates) for values in (trainPredict, y_train)
)
testPredict, y_test = (
    reformat_dates(values, testdates) for values in (testPredict, y_test)
)

In [None]:
# Overlay the predicted and the actual closing prices for the test split.
plt.figure(figsize=(20, 6))
for frame, series_label, colour in (
    (testPredict, 'Predicted', 'red'),
    (y_test, 'Actual', 'green'),
):
    plt.plot(pd.DataFrame(frame), label=series_label, color=colour)
plt.title('AAPL Stock Price Predictions vs Actual')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend(loc='best')
plt.show()

In [None]:
# Zoom in on the final 100 rows of the test split to make the gaps between
# prediction and actual price easier to see.
tail_rows = slice(-100, None)
plt.figure(figsize=(20, 6))
for frame, series_label, colour in (
    (testPredict, 'Predicted', 'red'),
    (y_test, 'Actual', 'green'),
):
    plt.plot(pd.DataFrame(frame[tail_rows]), label=series_label, color=colour)
plt.title('AAPL Stock Price Predictions vs Actual')
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend(loc='best')
plt.show()