In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from IPython.display import display

In [24]:
# Load data
data = pd.read_excel('AAPL_intraday_data.xlsx', parse_dates=['Dates'])

# Feature Engineering
data['Mid Price'] = (data['Bid Price'] + data['Ask Price']) / 2
data['Log Return'] = np.log(data['Last Price'] / data['Last Price'].shift(1))

# Drop NaN values from Log Return calculation
data = data.dropna()

# Add lag features
for lag in range(1, 6):
    data[f'Lag_{lag}'] = data['Last Price'].shift(lag)

# Drop NaN values created by lag features
data = data.dropna()

In [25]:
# Train-test split
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=False)

# Prepare final datasets
X_train = train_data[['Volume', 'Bid Price', 'Ask Price', 'Bid-Ask Spread', 'Turnover']]
y_train = train_data['Last Price']
X_test = test_data[['Volume', 'Bid Price', 'Ask Price', 'Bid-Ask Spread', 'Turnover']]
y_test = test_data['Last Price']

# Model Training
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(X_train, y_train)

# Predictions
lr_predictions = lr_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# Evaluation
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_rmse = np.sqrt(mean_squared_error(y_test, lr_predictions))
lr_r2 = r2_score(y_test, lr_predictions)

rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))
rf_r2 = r2_score(y_test, rf_predictions)



In [26]:
# Print evaluation metrics
print("Linear Regression Model Performance:")
print(f"Mean Absolute Error (MAE): {lr_mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {lr_rmse:.4f}")
print(f"R-squared (R2): {lr_r2:.4f}")
print()
print("Random Forest Model Performance:")
print(f"Mean Absolute Error (MAE): {rf_mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rf_rmse:.4f}")
print(f"R-squared (R2): {rf_r2:.4f}")
print()

# Compare models and print the better one
if lr_r2 > rf_r2:
    print("The Linear Regression model performed better than the Random Forest model.")
    print("This is because it has a higher R-squared value, indicating it explains more of the variance in the stock prices.")
    print("It also has lower MAE and RMSE values, suggesting it makes more accurate predictions on average.")
else:
    print("The Random Forest model performed better than the Linear Regression model.")
    print("This is because it has a higher R-squared value, indicating it explains more of the variance in the stock prices.")
    print("It also has lower MAE and RMSE values, suggesting it makes more accurate predictions on average.")

Linear Regression Model Performance:
Mean Absolute Error (MAE): 16.1775
Root Mean Squared Error (RMSE): 19.4043
R-squared (R2): -42.8038

Random Forest Model Performance:
Mean Absolute Error (MAE): 17.3909
Root Mean Squared Error (RMSE): 17.7007
R-squared (R2): -35.4500

The Random Forest model performed better than the Linear Regression model.
This is because it has a higher R-squared value, indicating it explains more of the variance in the stock prices.
It also has lower MAE and RMSE values, suggesting it makes more accurate predictions on average.


In [27]:
# Extract data for the first date in the test set
first_test_date = test_data['Dates'].max()
first_test_day_data = test_data[test_data['Dates'] == first_test_date]


# Prepare the dataset for prediction
X_first_test_day = first_test_day_data[['Volume', 'Bid Price', 'Ask Price', 'Bid-Ask Spread', 'Turnover']]
y_first_test_day = first_test_day_data['Last Price']

# Predict using both models
lr_first_test_day_predictions = lr_model.predict(X_first_test_day)
rf_first_test_day_predictions = rf_model.predict(X_first_test_day)

# Combine the actual and predicted prices in a DataFrame
results_df = pd.DataFrame({
    'Date': first_test_day_data['Dates'].dt.strftime('%m/%d/%Y'),
    'Time': first_test_day_data['Dates'].dt.strftime('%H:%M:%S'),
    'Actual Price': y_first_test_day,
    'Linear Regression Prediction': lr_first_test_day_predictions,
    'Random Forest Prediction': rf_first_test_day_predictions,
    'Random Forest Error': y_first_test_day - rf_first_test_day_predictions,
    'Linear Regression Error': y_first_test_day - lr_first_test_day_predictions
})

# Display the results in a tabular format
display(results_df)

Unnamed: 0,Date,Time,Actual Price,Linear Regression Prediction,Random Forest Prediction,Random Forest Error,Linear Regression Error
14089,06/21/2024,16:29:00,207.49,196.019182,195.55216,11.93784,11.470818
