In [None]:
!pip install pandas-datareader==0.10.0



In [2]:
!pip install 'ray[tune]'

Collecting ray[tune]
  Downloading ray-2.30.0-cp310-cp310-manylinux2014_x86_64.whl (66.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX, ray
Successfully installed ray-2.30.0 tensorboardX-2.6.2.2


In [3]:
# data preprocessing
import datetime
import pandas as pd
import pandas_datareader.data as pdr
from sklearn.preprocessing import MinMaxScaler
import numpy
import seaborn as sns
import matplotlib.pyplot as plt


# machine learning
import torch
from torch.autograd import Variable
import torch.nn as nn

# hyperparameter tuning
import ray
from ray import tune

In [4]:

# Load the CSV and expose the 'compound' column under the name 'Sentiment Score'
csv_file_path = '/content/merged_tsla_stock_data_with_sentiment.csv'
tsla_price_df = pd.read_csv(csv_file_path).rename(
    columns={'compound': 'Sentiment Score'}
)

# Show the loaded frame
print(tsla_price_df)

FileNotFoundError: [Errno 2] No such file or directory: '/content/merged_tsla_stock_data_with_sentiment.csv'

In [None]:
# daily return = percentage change of the close price versus the previous row
tsla_price_df = tsla_price_df.assign(Return=tsla_price_df['Close'].pct_change())

In [None]:
# global matplotlib defaults: 16x9 figures, grid on, 12pt font
plt.rcParams.update({
    'figure.figsize': (16, 9),
    'axes.grid': True,
    'font.size': 12,
})

In [None]:

# date window used for the full-period plots
start_time_all = datetime.datetime(2019, 12, 24)
end_time_all = datetime.datetime(2022, 3, 1)

# The x-limits below are datetimes, so the x-values must be dates too.
# Plotting against the default integer index would leave the data outside
# the visible window.
if isinstance(tsla_price_df.index, pd.DatetimeIndex):
    close_dates = tsla_price_df.index
elif isinstance(tsla_price_df.index, pd.RangeIndex):
    # assumes the first column holds the dates (the preprocessing cell further
    # down renames that column to 'Date') -- TODO confirm against the CSV
    close_dates = pd.to_datetime(tsla_price_df.iloc[:, 0])
else:
    # the dates have already been promoted to the index, possibly as strings
    close_dates = pd.to_datetime(tsla_price_df.index)

plt.plot(close_dates, tsla_price_df['Close'])
print(tsla_price_df['Close'])
plt.xlabel('Date')
plt.ylabel('Close Price')
plt.xlim(start_time_all, end_time_all)
plt.title('TSLA Close Price')
plt.show()

In [None]:
# correlation of daily return vs. sentiment score; the first row is skipped
# because pct_change() leaves its return undefined
rows_after_first = tsla_price_df.iloc[1:]
rows_after_first['Return'].corr(rows_after_first['Sentiment Score'])

In [None]:
# scatter plot with fitted regression line: daily return (x) against sentiment score (y),
# first row skipped as in the correlation above
sns.regplot(
    data=tsla_price_df.iloc[1:],
    x='Return',
    y='Sentiment Score',
    color='g',
)
plt.title('Correlation between Daily Returns and Sentiment Scores')
plt.show()

In [None]:
# # Read the CSV file into a DataFrame, ensuring the first column is read as dates
# csv_file_path = 'your_csv_file.csv'
# tsla_price_df = pd.read_csv(csv_file_path, parse_dates=[0])

# Promote the first (unnamed) column to a datetime 'Date' index.
# The guard keeps the cell re-runnable: once 'Date' is the index, the first
# column is a feature and must not be renamed or moved into the index.
if tsla_price_df.index.name != 'Date':
    tsla_price_df = tsla_price_df.rename(columns={tsla_price_df.columns[0]: 'Date'})
    # parse to real datetimes so the index lines up with the datetime
    # x-limits used by the plots and with the date-based train/validation split
    tsla_price_df['Date'] = pd.to_datetime(tsla_price_df['Date'])
    tsla_price_df = tsla_price_df.set_index('Date')

# Drop the 'Return' column if it exists
tsla_price_df = tsla_price_df.drop(columns=['Return'], errors='ignore')

# Features: every remaining column except the close price
X = tsla_price_df.drop(columns=['Close'])

# Response (Close price)
y = tsla_price_df['Close']

# Display the features and response DataFrames
print(X)
print(y)

In [None]:
# Placeholder only: n_split is recomputed from the date index (rows before
# '2020-10-01') in the scaling cell before any slice uses it.
n_split = 400 # where to split training and validation datasets

In [None]:
# Reshape y into a single column for scaling.
# numpy.asarray accepts both the original Series and an already reshaped
# ndarray, so the cell can be re-run safely.
y = numpy.asarray(y).reshape(-1, 1)

# Split index for training and validation: rows dated before 2020-10-01 train
n_split = int((tsla_price_df.index < '2020-10-01').sum())
if not 0 < n_split < len(tsla_price_df):
    raise ValueError(
        f'Date split produced {n_split} training rows out of {len(tsla_price_df)}; '
        'both the training and the validation set must be non-empty.'
    )

# Normalizing datasets.
# The scalers are fitted on the training rows only, so that no min/max
# information from the validation period leaks into training; validation
# values may therefore fall slightly outside [0, 1].
scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
scaler_X.fit(X.iloc[:n_split])
scaler_y.fit(y[:n_split])
X_norm = scaler_X.transform(X)
y_norm = scaler_y.transform(y)

# Training using the rows before the split date
# Validating trained model using the rows from the split date onwards
X_train = X_norm[:n_split, :]
y_train = y_norm[:n_split, :]

X_val = X_norm[n_split:, :]
y_val = y_norm[n_split:, :]

print('Training Set Shape', X_train.shape, y_train.shape)
print('Validation Set Shape', X_val.shape, y_val.shape)

In [None]:
# Convert the numpy arrays to tensors. Each sample becomes a sequence of
# length 1, i.e. X tensors have shape (samples, 1, features).
# (torch.autograd.Variable is a no-op wrapper in current PyTorch, so plain
# tensors are used directly.)
X_train_tensors = torch.Tensor(X_train).unsqueeze(1)
y_train_tensors = torch.Tensor(y_train)

# The "validation" input is built from the full normalised dataset (X_norm),
# and y_true from the full unscaled response y.
X_val_tensors = torch.Tensor(X_norm).unsqueeze(1)
y_true = numpy.array(y)

print('Training Set Shape   ', X_train_tensors.shape, y_train_tensors.shape)
print('Validation Set Shape ', X_val_tensors.shape, y_true.shape)

In [None]:
# selecting gpu when one is available, otherwise falling back to cpu so the
# notebook still runs on CPU-only machines
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
class LSTM_model(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: number of features in the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of outputs produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        self.input_size = input_size # number of features
        self.hidden_size = hidden_size # number of features in hidden state
        self.num_layers = num_layers # number of stacked LSTM layers
        self.num_classes = num_classes # number of output classes

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=True) # LSTM layer
        self.fc = nn.Linear(hidden_size, num_classes) # fully connected last layer

    def forward(self, x):
        """Run the LSTM over ``x`` and decode its last time step.

        Args:
            x: tensor of shape (batch_size, seq_length, input_size); the batch
                dimension comes first because the LSTM uses ``batch_first=True``.

        Returns:
            Tensor of shape (batch_size, num_classes).
        """
        # Zero initial states are created on the same device and with the same
        # dtype as the input, so the module does not depend on a global
        # `device` variable and works wherever the input lives.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype) # hidden state
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype) # internal (cell) state
        # propagating input through LSTM
        out, _ = self.lstm(x, (h0, c0)) # out: (batch_size, seq_length, hidden_size)
        # decoding hidden state of last time step
        out = self.fc(out[:, -1, :])
        return out

In [None]:
# starting ray runtime; ignore_reinit_error lets the cell be re-run without
# raising when the runtime is already up
ray.init(ignore_reinit_error=True)

In [None]:
# query the resources the ray runtime reports and display them
ray_resources = ray.cluster_resources()
ray_resources

In [None]:
# function for calculating MSE (Mean Squared Error)
def calc_mse(y_true, y_hat):
    """Return the mean squared error between two equally sized value sets.

    Both inputs are flattened to 1-D before comparison, so a column vector of
    shape (n, 1) and a flat sequence of length n are compared element by
    element instead of being broadcast against each other.

    Args:
        y_true: actual values (array-like of numbers).
        y_hat: predicted values (array-like of numbers).

    Returns:
        The MSE as a Python float.

    Raises:
        ValueError: if the inputs are empty or differ in number of elements.
    """
    actual = numpy.asarray(y_true, dtype=float).reshape(-1)
    predicted = numpy.asarray(y_hat, dtype=float).reshape(-1)
    if actual.size != predicted.size:
        raise ValueError("Length of y_true and y_hat must be the same.")
    if actual.size == 0:
        raise ValueError("y_true and y_hat must not be empty.")
    return float(numpy.mean((actual - predicted) ** 2))

In [None]:
# function for fitting and evaluating model using different hyperparameters
def model_fit_eval(config=None, best_config=None, save_model_state=False):
    """Train an LSTM_model on the training tensors and predict with it.

    The function runs in one of two modes:

    * tuning mode (``best_config is None``): hyperparameters are read from
      ``config``; the MSE between ``y_true`` and the prediction is reported
      through ``tune.report`` and nothing is returned.
    * final mode (``best_config`` given): hyperparameters are read from
      ``best_config``, the loss is printed every 1000 epochs and the
      prediction is returned.

    Args:
        config: dict with 'hidden_size', 'num_layers', 'learning_rate' and
            'num_epochs'; used when ``best_config`` is None.
        best_config: dict with the same keys; takes precedence over ``config``.
        save_model_state: when True, the trained weights are written to
            '4-model-state.pth'.

    Returns:
        The inverse-scaled prediction for ``X_val_tensors`` as a numpy array
        in final mode, None in tuning mode.
    """
    params = config if best_config is None else best_config
    hidden_size = params['hidden_size']
    num_layers = params['num_layers']
    # learning rate controls how much the weights change per optimizer step
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']

    # taken from the data instead of being hard-coded, so the model always
    # matches the number of feature columns actually present
    input_size = X_train_tensors.shape[-1]
    num_classes = 1

    model = LSTM_model(input_size, hidden_size, num_layers, num_classes).to(device)

    # Adam optimizer: updates the model weights from the gradients
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # mean-squared error of regression: the quantity minimised during training
    loss_function = torch.nn.MSELoss()

    # move the training data to the device once rather than on every epoch
    train_inputs = X_train_tensors.to(device)
    train_targets = y_train_tensors.to(device)

    # note: range(num_epochs + 1) performs num_epochs + 1 updates, so the
    # final epoch number is included in the progress printout
    for epoch in range(num_epochs + 1):
        outputs = model(train_inputs) # forward pass
        optimizer.zero_grad() # clear gradients left over from the previous step
        loss = loss_function(outputs, train_targets) # obtaining loss
        loss.backward() # backprop: gradients of the loss w.r.t. the weights
        optimizer.step() # update the weights from the gradients
        if best_config is not None and epoch % 1000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item():.4f}')

    # saving model state
    if save_model_state:
        torch.save(model.state_dict(), '4-model-state.pth')

    # loading model state
    # model = LSTM_model(*args, **kwargs)
    # model.load_state_dict(torch.load(PATH))
    # model.eval()

    # predicting response from model; no gradients are needed for inference
    model.eval()
    with torch.no_grad():
        y_hat = model(X_val_tensors.to(device)) # forward pass
    y_hat = y_hat.cpu().numpy() # numpy conversion
    # undo the response scaling with the scaler that was fitted on y
    y_hat = scaler_y.inverse_transform(y_hat)

    if best_config is None:
        mse = calc_mse(y_true, y_hat)
        # NOTE(review): keyword-style tune.report is the older reporting API;
        # confirm it is supported by the installed Ray version.
        tune.report(mse=mse)
    else:
        return y_hat

In [None]:
# TODO: USE SSA HERE!! -- the grid search below is disabled until SSA-based hyperparameter tuning is implemented

# # using grid search to get best hyperparameters
# analysis = tune.run(
#     model_fit_eval,
#     config={'hidden_size': tune.grid_search([2, 3, 5]),
#             'learning_rate': tune.grid_search([0.0005, 0.001, 0.002]),
#             'num_epochs': tune.grid_search([2000, 4000, 8000]),
#             'num_layers': tune.grid_search([1, 2, 4])},
#     resources_per_trial={'cpu': 2, 'gpu': 1}) # leveraging all resources

#   # visualizing hyperparameter tuning results
# %load_ext tensorboard
# %tensorboard --logdir ~/ray_results

# # best hyperparameters
# best_trial = analysis.get_best_trial(metric='mse', mode='min', scope='all')
# min_mse = best_trial.metric_analysis['mse']['avg']
# print(f'Min MSE: {min_mse}')
# best_trial_config = best_trial.config
# print(f'Best trial config: {best_trial_config}')

# # training model using best hyperparameters
# best_trial_config['num_epochs'] = 10000 # increasing num of epochs
# y_hat = model_fit_eval(best_config=best_trial_config, save_model_state=True)

# # shutting down ray runtime
# ray.shutdown()


In [None]:
# function for plotting actual and predicted Adj Close price
# placeholder so the name exists; it is meant to be replaced by the model's predictions
y_hat=[]
def plt_graph(start_time, end_time, period, val_line=False, val_start=None):
    """Plot the actual and the predicted price over the date index.

    Reads the notebook-level ``tsla_price_df``, ``y_true`` and ``y_hat``.

    Args:
        start_time: left x-axis limit.
        end_time: right x-axis limit.
        period: label used in the plot title ('<period> Period').
        val_line: when True, draw a dashed red vertical line at the start of
            the validation period.
        val_start: x-position of that line. When omitted, a notebook-level
            ``start_time_val`` is used if one is defined, otherwise the index
            value of the first validation row (``tsla_price_df.index[n_split]``).

    Raises:
        ValueError: if ``y_hat`` does not hold one prediction per row of ``y_true``.
    """
    if len(y_hat) != len(y_true):
        raise ValueError(
            f'y_hat has {len(y_hat)} values but y_true has {len(y_true)}; '
            'run the model to populate y_hat before plotting.'
        )
    # plotting actual Adj Close price
    plt.plot(tsla_price_df.index, y_true, label='Actual Price')
    # plotting predicted Adj Close price
    plt.plot(tsla_price_df.index, y_hat, label='Pred Price')
    if val_line:
        if val_start is None:
            # honour a notebook-level start_time_val when one has been defined
            val_start = globals().get('start_time_val')
        if val_start is None:
            # otherwise mark the first row that belongs to the validation split
            val_start = tsla_price_df.index[n_split]
        plt.axvline(x=val_start, c='r', linestyle='--')
    plt.xlabel('Date')
    plt.ylabel('Adj Close Price')
    plt.xlim(start_time, end_time)
    plt.title(f'{period} Period')
    plt.legend()
    plt.show()

In [None]:
# # plotting all period
# period = 'All'
# plt_graph(start_time_all, end_time_all, period, True)

In [None]:
def calc_mse(y_true, y_hat):
    """Compute the mean squared error between actual and predicted values.

    Inputs are converted to flat float arrays first, so lists are accepted
    and an (n, 1) column is compared element-wise with a length-n sequence
    rather than being broadcast into an (n, n) matrix.

    Args:
        y_true: actual values (array-like of numbers).
        y_hat: predicted values (array-like of numbers).

    Returns:
        The MSE as a Python float.

    Raises:
        ValueError: if the inputs are empty or differ in number of elements.
    """
    # the notebook imports the package as `numpy` (there is no `np` alias)
    actual = numpy.asarray(y_true, dtype=float).reshape(-1)
    predicted = numpy.asarray(y_hat, dtype=float).reshape(-1)
    if actual.size != predicted.size:
        raise ValueError("Length of y_true and y_hat must be the same.")
    if actual.size == 0:
        raise ValueError("y_true and y_hat must not be empty.")

    mse = float(numpy.mean((actual - predicted) ** 2))
    return mse

In [None]:
# mse = calc_mse(y_true, y_hat)                     ### AFTER IMPLEMENTING SSA
# print(f'MSE ({period}): {mse}')

In [None]:
# # plotting training period
# period = 'Training'
# plt_graph(start_time_all, end_time_train, period)

In [None]:
# mse = calc_mse(y_true[:n_split], y_hat[:n_split])
# print(f'MSE ({period}): {mse}')

In [None]:
# # plotting validation period
# period = 'Validation'
# plt_graph(start_time_val, end_time_all, period)

In [None]:
# mse = calc_mse(y_true[n_split:], y_hat[n_split:])
# print(f'MSE ({period}): {mse}')