In [None]:
! pip install tensorflow



In [None]:
! pip install yfinance



# Define the list of stock symbols (Information Technology stocks)



In [None]:
# Ticker symbols of the Information Technology stocks processed by this notebook.
# The company names are kept as comments instead of a bare string literal so the
# cell does not echo the text back as its output.
stock_symbols = [
    "AAPL",  # Apple Inc.
    "MSFT",  # Microsoft Corporation
    "NVDA",  # NVIDIA Corporation
    "ADBE",  # Adobe Inc.
    "CRM",   # Salesforce, Inc.
    "ORCL",  # Oracle Corporation
    "CSCO",  # Cisco Systems, Inc.
    "INTC",  # Intel Corporation
    "IBM",   # International Business Machines Corporation
    "QCOM",  # Qualcomm Incorporated
]

'\nAAPL - Apple Inc.\nMSFT - Microsoft Corporation\nNVDA - NVIDIA Corporation\nADBE - Adobe Inc.\nCRM - Salesforce, Inc.\nORCL - Oracle Corporation\nCSCO - Cisco Systems, Inc.\nINTC - Intel Corporation\nIBM - International Business Machines Corporation\nQCOM - Qualcomm Incorporated\n'

# Import all libraries

In [None]:
import os
import json
import numpy as np
import pandas as pd
import yfinance as yf
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from math import sqrt
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Input, Bidirectional
from tensorflow.keras.optimizers import Adam, Adagrad, Nadam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import GlorotUniform

# Step 1 : Download stock data and store it in CSV (".csv") format

In [None]:
def download_stock_data(symbols, start_date, end_date, directory):
    """
    Downloads stock data for given symbols and stores them as CSV files.

    One file named ``<symbol>.csv`` is written into ``directory`` per symbol.

    Args:
        symbols (list): List of stock symbols to download.
        start_date (str): Start date for data download in 'YYYY-MM-DD' format.
        end_date (str): End date for data download in 'YYYY-MM-DD' format.
        directory (str): Directory to save the CSV files. Created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(directory, exist_ok=True)
    for symbol in symbols:
        data = yf.download(symbol, start=start_date, end=end_date)
        # Some yfinance versions return two-level (field, ticker) columns even
        # for a single symbol. Written as-is, that produces a multi-row CSV
        # header and a plain pd.read_csv no longer yields a numeric 'Close'
        # column. Keep only the field level so the file has one header row.
        if isinstance(data.columns, pd.MultiIndex):
            data.columns = data.columns.get_level_values(0)
        file_path = os.path.join(directory, f"{symbol}.csv")
        data.to_csv(file_path)

# Step 2 : Read the stock data from the directory

In [None]:
def read_stock_data(directory, symbols):
    """
    Loads the per-symbol CSV files from disk into DataFrames.

    Each symbol is expected to live in ``<directory>/<symbol>.csv``.

    Args:
        directory (str): Folder containing the CSV files.
        symbols (list): Stock symbols whose files should be loaded.

    Returns:
        dict: Maps every symbol to the DataFrame parsed from its CSV file,
        in the same order as ``symbols``.
    """
    return {
        ticker: pd.read_csv(os.path.join(directory, f"{ticker}.csv"))
        for ticker in symbols
    }

# Step 3 : Data Preprocessing

In [None]:
def preprocess_data(data, window_size):
    """
    Scales the closing prices and slices them into fixed-length windows.

    Every sample consists of ``window_size`` consecutive scaled closing
    prices; its target is the scaled closing price that immediately follows.

    Note: the scaler is fitted on every row of ``data``, so the minimum and
    maximum it uses come from the whole series passed in.

    Args:
        data (DataFrame): Stock data containing a 'Close' column.
        window_size (int): Number of past values in each input sequence.

    Returns:
        tuple: ``(X, y, scaler)`` - the input windows, the matching targets,
        and the fitted MinMaxScaler needed to undo the scaling.
    """
    close_prices = data[['Close']]
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_close = scaler.fit_transform(close_prices)

    # Each index in this range is the position of a target value; the window
    # feeding it is the `window_size` values directly before that position.
    target_positions = range(window_size, len(scaled_close))
    X = np.array([scaled_close[pos - window_size:pos, 0] for pos in target_positions])
    y = np.array([scaled_close[pos, 0] for pos in target_positions])
    return X, y, scaler

def split_data(X, y, train_size=0.7, val_size=0.15):
    """
    Cuts the samples into consecutive training, validation and test parts.

    The order of the samples is preserved (no shuffling): the first
    ``train_size`` fraction is the training part, the next ``val_size``
    fraction the validation part, and whatever remains is the test part.

    Args:
        X (ndarray): Input data.
        y (ndarray): Target data.
        train_size (float): Fraction of the samples used for training.
        val_size (float): Fraction of the samples used for validation.

    Returns:
        tuple: ``((X_train, y_train), (X_val, y_val), (X_test, y_test))``.
    """
    n_samples = len(X)
    train_end = int(n_samples * train_size)
    val_end = train_end + int(n_samples * val_size)

    # (start, stop) boundaries of the three consecutive partitions.
    boundaries = ((0, train_end), (train_end, val_end), (val_end, None))
    train_part, val_part, test_part = (
        (X[start:stop], y[start:stop]) for start, stop in boundaries
    )
    return train_part, val_part, test_part

# Step 4 : Build LSTM model

In [None]:
def build_model(sequence_length):
    """
    Assembles the stacked bidirectional-LSTM regression network.

    The network has three bidirectional LSTM layers of 150 units, each followed
    by 20% dropout, then dense layers of 64 and 32 ReLU units and a single
    linear output unit. The model is returned uncompiled; callers compile it
    with their own optimizer and loss.

    Args:
        sequence_length (int): Length of the input sequences.

    Returns:
        model: Sequential model expecting inputs of shape (sequence_length, 1).
    """
    def _bidirectional_lstm(return_sequences):
        # One recurrent layer of the stack; all three share these settings.
        return Bidirectional(LSTM(units=150, return_sequences=return_sequences,
                                  kernel_initializer=GlorotUniform(),
                                  recurrent_initializer=GlorotUniform(),
                                  activation='tanh'))

    model = Sequential()
    model.add(Input(shape=(sequence_length, 1)))

    # Recurrent stack: only the last LSTM collapses the sequence to one vector.
    for emits_sequence in (True, True, False):
        model.add(_bidirectional_lstm(emits_sequence))
        model.add(Dropout(0.2))

    # Dense head ending in a single linear unit for the regression output.
    for n_units, activation_name in ((64, 'relu'), (32, 'relu'), (1, 'linear')):
        model.add(Dense(units=n_units, activation=activation_name,
                        kernel_initializer=GlorotUniform()))

    return model


# Step 5 : Perform hyperparameter tuning

In [None]:
def hyperparameter_tuning(X_train, y_train, X_val, y_val, X_test, y_test,
                          sequence_length, results_dir, symbol):
    """
    Tunes the model hyperparameters using different optimizers, learning rates,
    and batch sizes. Records the results for each combination.

    Every combination is trained ``n_replicates`` times and scored by the
    average RMSE (on the scaled targets) over the validation set. The test set
    is deliberately NOT used for scoring: ranking configurations on test data
    would leak it into model selection and make the later test evaluation
    optimistic.

    Args:
        X_train (ndarray): Training input data.
        y_train (ndarray): Training target data.
        X_val (ndarray): Validation input data (early stopping and scoring).
        y_val (ndarray): Validation target data (early stopping and scoring).
        X_test (ndarray): Test input data. Accepted for backward compatibility;
            not used.
        y_test (ndarray): Test target data. Accepted for backward compatibility;
            not used.
        sequence_length (int): Length of the input sequences.
        results_dir (str): Directory to save the hyperparameter tuning results.
        symbol (str): Stock symbol being processed.

    Returns:
        dict: Average validation RMSE for each combination, keyed by
        ``"Optimizer:<name>_LR:<lr>_BatchSize:<batch_size>"``. The same mapping
        is written to ``<results_dir>/<symbol>_results.json``.
    """
    # Imported locally so this function brings its own dependency into scope.
    from tensorflow.keras import backend as keras_backend

    optimizers = [Adam, Adagrad, Nadam]
    learning_rates = [0.01, 0.001, 0.0001]
    batch_sizes = [50, 100, 150]
    n_replicates = 3

    # Define the EarlyStopping callback
    early_stopping = EarlyStopping(
        monitor='val_loss',            # Monitor the validation loss
        patience=10,                   # Stop after 10 epochs with no improvement
        verbose=1,                     # Verbose output when stopping early
        restore_best_weights=True      # Restore the model weights from the epoch with the best validation loss
    )

    os.makedirs(results_dir, exist_ok=True)

    results = {}

    # Iterate through all combinations of optimizers, learning rates, and batch sizes
    for optimizer in optimizers:
        for lr in learning_rates:
            for batch_size in batch_sizes:
                rmse_list = []
                for _ in range(n_replicates):
                    # Many models are built in this loop; drop the global
                    # Keras state of the previous ones so memory does not
                    # keep growing across the grid.
                    keras_backend.clear_session()
                    model = build_model(sequence_length)
                    # A fresh optimizer instance per model.
                    optimizer_instance = optimizer(learning_rate=lr)
                    model.compile(optimizer=optimizer_instance, loss='mean_squared_error')
                    model.fit(X_train, y_train, epochs=200, batch_size=batch_size,
                              validation_data=(X_val, y_val),
                              callbacks=[early_stopping], verbose=0)
                    # Score on validation data, never on the held-out test set.
                    y_pred = model.predict(X_val)
                    rmse = sqrt(mean_squared_error(y_val, y_pred))
                    rmse_list.append(rmse)

                # Built-in float keeps the value JSON-serialisable.
                avg_rmse = float(np.mean(rmse_list))
                key = f"Optimizer:{optimizer.__name__}_LR:{lr}_BatchSize:{batch_size}"
                results[key] = avg_rmse

    json_file = os.path.join(results_dir, f'{symbol}_results.json')
    with open(json_file, 'w') as f:
        json.dump(results, f, indent=4)

    return results

# Step 6 : Extract the best hyperparameters from the results

In [None]:
def get_best_hyperparameters(results):
    """
    Extracts the best hyperparameters based on the lowest RMSE value.

    Keys of ``results`` must look like
    ``"Optimizer:<name>_LR:<lr>_BatchSize:<batch_size>"``. Entries whose RMSE
    is NaN or infinite are ignored; otherwise a NaN in the first entry would
    win ``min()`` because every comparison against NaN is False.

    Args:
        results (dict): Dictionary containing RMSE values for each combination of hyperparameters.

    Returns:
        tuple: ``(best_optimizer, best_batch_size)`` - a newly constructed
        optimizer instance configured with the best learning rate, and the
        best batch size.

    Raises:
        ValueError: If ``results`` holds no finite RMSE value, or if the best
            entry names an optimizer other than Adam, Adagrad or Nadam.
    """
    finite_results = {key: value for key, value in results.items() if np.isfinite(value)}
    if not finite_results:
        raise ValueError("No finite RMSE values available to select hyperparameters from.")

    best_result = min(finite_results, key=finite_results.get)

    # Key layout: "Optimizer:<name>_LR:<lr>_BatchSize:<batch_size>"
    fields = best_result.split('_')
    best_optimizer_name = fields[0].split(':')[1]
    best_lr = float(fields[1].split(':')[1])
    best_batch_size = int(fields[2].split(':')[1])

    # Select the appropriate optimizer class
    if best_optimizer_name == 'Adam':
        optimizer_class = Adam
    elif best_optimizer_name == 'Adagrad':
        optimizer_class = Adagrad
    elif best_optimizer_name == 'Nadam':
        optimizer_class = Nadam
    else:
        raise ValueError(f"Unsupported optimizer: {best_optimizer_name}")

    # Every call builds a new optimizer instance.
    best_optimizer = optimizer_class(learning_rate=best_lr)

    return best_optimizer, best_batch_size

# Step 7 : Train the model with best Hyperparameters

In [None]:
def train_best_model(X_train, y_train, X_val, y_val, sequence_length, best_optimizer, best_batch_size):
    """
    Fits a fresh network with the hyperparameters chosen during tuning.

    Training runs for at most 200 epochs and stops early once the validation
    loss has not improved for 20 epochs, restoring the best weights seen.

    Args:
        X_train (ndarray): Training input data.
        y_train (ndarray): Training target data.
        X_val (ndarray): Validation input data.
        y_val (ndarray): Validation target data.
        sequence_length (int): Length of the input sequences.
        best_optimizer (Optimizer): Optimizer instance selected from tuning.
        best_batch_size (int): Batch size selected from tuning.

    Returns:
        tuple: ``(model, history)`` - the trained model and the History object
        returned by ``fit``.
    """
    stop_on_plateau = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

    network = build_model(sequence_length)
    network.compile(loss='mean_squared_error', optimizer=best_optimizer)

    fit_history = network.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        batch_size=best_batch_size,
        epochs=200,
        callbacks=[stop_on_plateau],
        verbose=1,
    )

    return network, fit_history

# Step 8 : Plot the Learning curves

In [None]:
def plot_learning_curve(history):
    """
    Draws training and validation loss against the epoch number.

    Args:
        history (History): Object returned by model fitting; its ``history``
            dict must contain the 'loss' and 'val_loss' series.
    """
    # (key in history.history, legend label) for each curve, in drawing order.
    curves = (('loss', 'Training Loss'), ('val_loss', 'Validation Loss'))
    for series_key, legend_label in curves:
        plt.plot(history.history[series_key], label=legend_label)

    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Learning Curve')
    plt.legend()
    plt.show()

# Step 9 : Evaluate the model on the test set

In [None]:
def evaluate_model(model, X_test, y_test, scaler, metrics_dir, symbol):
    """
    Evaluates the model on test data and saves the evaluation metrics.

    Predictions and targets are mapped back through ``scaler`` before the
    metrics are computed, so the errors are expressed in the scaler's original
    units rather than in the scaled range.

    Args:
        model (Sequential): Trained model.
        X_test (ndarray): Test input data.
        y_test (ndarray): Test target data.
        scaler (MinMaxScaler): Scaler used during preprocessing to rescale data.
        metrics_dir (str): Directory to save the evaluation metrics.
        symbol (str): Stock symbol being processed.

    Returns:
        dict: Dictionary containing evaluation metrics such as MSE, RMSE, MAE, and MAPE
        (MAPE in percent), all as built-in floats. The same dictionary is
        written to ``<metrics_dir>/<symbol>_evaluation_metrics.json``.
    """
    y_pred = model.predict(X_test)
    y_pred_rescaled = scaler.inverse_transform(y_pred)
    # inverse_transform expects a 2-D column, hence the reshape.
    y_test_rescaled = scaler.inverse_transform(y_test.reshape(-1, 1))

    mse = mean_squared_error(y_test_rescaled, y_pred_rescaled)
    rmse = sqrt(mse)
    mae = mean_absolute_error(y_test_rescaled, y_pred_rescaled)
    # NOTE: divides by the true values; a true value of exactly 0 would yield inf/NaN.
    mape = np.mean(np.abs((y_test_rescaled - y_pred_rescaled) / y_test_rescaled)) * 100

    # Cast to built-in floats so JSON serialisation never depends on which
    # NumPy scalar type the metric functions happen to return.
    metrics = {
        "MSE": float(mse),
        "RMSE": float(rmse),
        "MAE": float(mae),
        "MAPE": float(mape)
    }

    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(metrics_dir, exist_ok=True)

    json_file = os.path.join(metrics_dir, f'{symbol}_evaluation_metrics.json')
    with open(json_file, 'w') as f:
        json.dump(metrics, f, indent=4)

    return metrics

# Step 10 : Retrain the model on the combined (full) dataset

In [None]:
def retrain_on_full_data(data, sequence_length, best_optimizer, best_batch_size):
    """
    Fits a fresh network on every available sample using the tuned settings.

    No validation data is passed to ``fit`` here, so early stopping watches the
    training loss ('loss') instead, with a patience of 20 epochs. The scaler
    produced while preprocessing is discarded by this function.

    Args:
        data (DataFrame): Full stock data.
        sequence_length (int): Length of the input sequences.
        best_optimizer (Optimizer): Optimizer instance selected from tuning.
        best_batch_size (int): Batch size selected from tuning.

    Returns:
        tuple: ``(model, history)`` - the retrained model and the History
        object returned by ``fit``.
    """
    all_inputs, all_targets, _ = preprocess_data(data, sequence_length)

    network = build_model(sequence_length)
    network.compile(loss='mean_squared_error', optimizer=best_optimizer)

    stop_on_plateau = EarlyStopping(monitor='loss', patience=20, restore_best_weights=True)

    fit_history = network.fit(
        all_inputs, all_targets,
        batch_size=best_batch_size,
        epochs=200,
        callbacks=[stop_on_plateau],
        verbose=1,
    )

    return network, fit_history

# Step 11 : Saving the model

In [None]:
def save_trained_model(model, model_dir, symbol):
    """
    Saves the trained model to a specified directory.

    The model is written to ``<model_dir>/<symbol>_model.h5`` through its own
    ``save`` method.

    Args:
        model (Sequential): Trained model to be saved.
        model_dir (str): Directory where the model should be saved. Created
            (including missing parents) if it does not exist yet.
        symbol (str): Stock symbol to use in the filename.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(model_dir, exist_ok=True)
    model.save(os.path.join(model_dir, f"{symbol}_model.h5"))

# Step 12 : Main Execution flow

In [None]:
# Run configuration: output locations, download window and input window length.
data_dir = "stock_data"              # Directory to store stock data
results_dir = "hyperparameter_results"  # Directory to store hyperparameter tuning results
metrics_dir = "evaluation_metrics"   # Directory to store evaluation metrics
model_dir = "trained_models"         # Directory to save trained models
start_date = '2010-08-01'            # Start date for stock data download
end_date = '2023-12-31'              # End date for stock data download
sequence_length = 60                 # Length of the input sequences

# Download the data for every symbol to CSV, then load the CSV files back
# into a dict of DataFrames keyed by symbol.
download_stock_data(stock_symbols, start_date, end_date, data_dir)
stock_data = read_stock_data(data_dir, stock_symbols)

# Process each stock symbol independently: one tuning run, one evaluated model
# and one saved final model per symbol.
for symbol in stock_symbols:
    print(f"Processing stock: {symbol}")
    data = stock_data[symbol]
    # Scale the closing prices and build (window, next value) samples.
    X, y, scaler = preprocess_data(data, sequence_length)
    # Consecutive split with the default 70% / 15% / remainder proportions.
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = split_data(X, y)

    # Perform hyperparameter tuning (trains many models; the expensive step)
    results = hyperparameter_tuning(X_train, y_train, X_val, y_val, X_test, y_test, sequence_length, results_dir, symbol)

    # Get the best hyperparameters and train the model
    best_optimizer, best_batch_size = get_best_hyperparameters(results)
    model, history = train_best_model(X_train, y_train, X_val, y_val, sequence_length, best_optimizer, best_batch_size)

    # Plot the learning curve and evaluate the model
    plot_learning_curve(history)
    evaluate_model(model, X_test, y_test, scaler, metrics_dir, symbol)

    # Retrain the model on the full dataset and save it.
    # get_best_hyperparameters is called a second time because it constructs a
    # new optimizer object on every call; the one obtained above has already
    # been compiled into `model` (presumably an optimizer instance should not
    # be shared between two models - confirm before merging the two calls).
    best_optimizer, best_batch_size = get_best_hyperparameters(results)
    final_model, final_history = retrain_on_full_data(data, sequence_length, best_optimizer, best_batch_size)
    save_trained_model(final_model, model_dir, symbol)

    print(f"Completed processing for stock: {symbol}")


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Processing stock: AAPL
Epoch 27: early stopping
Restoring model weights from the end of the best epoch: 17.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 318ms/step
Epoch 10: early stopping
Restoring model weights from the end of the best epoch: 1.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 253ms/step
Epoch 43: early stopping
Restoring model weights from the end of the best epoch: 33.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 263ms/step
Epoch 10: early stopping
Restoring model weights from the end of the best epoch: 1.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 265ms/step
Epoch 10: early stopping
Restoring model weights from the end of the best epoch: 1.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 253ms/step
Epoch 10: early stopping
Restoring model weights from the end of the best epoch: 1.
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 257ms/step
Epoch 10: early stopp