### Import Libraries

In [6]:
import os
import shutil

import joblib
import keras_tuner as kt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import LSTM, Dense, Input
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.stattools import adfuller

### Load and Normalize the Dataset

In [7]:
# Load the dataset
# The CSV location can be overridden with the MERGED_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# the original absolute path is kept as the fallback.
file_path = os.environ.get(
    'MERGED_DATA_PATH',
    'D:\\Github Mikezxc\\Big-data-stock-real-time-platform\\merged_data_with_ma.csv',
)
merged_df = pd.read_csv(file_path)

# Normalize data
# NOTE(review): both scalers are fitted on every row of the file, i.e. on all
# tickers together and on rows that later end up in the hold-out split. The
# min/max therefore carry information from the evaluation period - confirm
# this is acceptable or fit on the training portion only.
price_scaler = MinMaxScaler()
merged_df[['close']] = price_scaler.fit_transform(merged_df[['close']])

# A separate scaler for the moving averages so `close` can be inverse-transformed
# on its own later on.
feature_scaler = MinMaxScaler()
merged_df[['MA30', 'MA90']] = feature_scaler.fit_transform(merged_df[['MA30', 'MA90']])


### Prepare Sequences for LSTM

In [8]:
# Prepare sequences for LSTM
def create_sequences(df, time_steps=30):
    """Build sliding-window samples for next-step prediction of ``close``.

    Each sample is ``time_steps`` consecutive rows of the ``close``, ``MA30``
    and ``MA90`` columns; its label is the ``close`` value of the row that
    immediately follows the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``close``, ``MA30`` and ``MA90``. Rows
        are addressed by position, so index labels are irrelevant; they are
        assumed to already be in chronological order.
    time_steps : int, default 30
        Number of rows in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has shape ``(n, time_steps, 3)`` and ``y`` has
        shape ``(n,)`` with ``n = len(df) - time_steps``. When the frame has
        too few rows for a single window, correctly shaped empty arrays
        (``n == 0``) are returned.
    """
    feature_columns = ['close', 'MA30', 'MA90']

    # Convert once up front: selecting the columns inside the loop built a new
    # DataFrame on every iteration.
    features = df[feature_columns].to_numpy()
    closes = df['close'].to_numpy()

    n_samples = len(df) - time_steps
    if n_samples <= 0:
        # Not enough rows for even one window: keep the rank of X consistent
        # so callers can still inspect X.shape[1:].
        empty_X = np.empty((0, time_steps, len(feature_columns)), dtype=features.dtype)
        empty_y = np.empty((0,), dtype=closes.dtype)
        return empty_X, empty_y

    sequences = np.array([features[i:i + time_steps] for i in range(n_samples)])
    labels = np.array([closes[i + time_steps] for i in range(n_samples)])
    return sequences, labels


### Define the LSTM HyperModel

In [9]:
# Define the LSTM HyperModel
class LSTMHyperModel(kt.HyperModel):
    """Keras Tuner hypermodel for a stacked-LSTM regressor with one output.

    Search space:
      * ``num_layers``    - number of stacked LSTM layers, 1 to 3
      * ``units_{i}``     - width of LSTM layer ``i``, 32 to 512 in steps of 32
      * ``learning_rate`` - Adam learning rate, log-sampled in [1e-4, 1e-2]

    Parameters
    ----------
    time_steps : int or None, default None
        Length of each input window. ``None`` leaves the time axis
        unspecified, so the model no longer depends on a module-level
        ``time_steps`` variable being defined before ``build`` is called.
    n_features : int, default 3
        Number of values per time step.
    name, tunable
        Forwarded unchanged to ``kt.HyperModel``.
    """

    def __init__(self, time_steps=None, n_features=3, name=None, tunable=True):
        super().__init__(name=name, tunable=tunable)
        self.time_steps = time_steps
        self.n_features = n_features

    def build(self, hp):
        """Return a compiled ``Sequential`` model for the hyperparameters in ``hp``."""
        # Query once; the value is reused for the loop bound and for deciding
        # which layer is the last one.
        num_layers = hp.Int('num_layers', 1, 3)

        model = Sequential()
        # Declare the input shape once with an explicit Input layer instead of
        # passing `input_shape` to every LSTM layer.
        model.add(Input(shape=(self.time_steps, self.n_features)))
        for i in range(num_layers):
            model.add(LSTM(
                units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32),
                # Every layer except the last must emit the full sequence so
                # the next LSTM receives 3-D input.
                return_sequences=(i < num_layers - 1),
            ))
        model.add(Dense(1))  # Single regression output per sample
        model.compile(
            optimizer=tf.keras.optimizers.Adam(
                hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG', default=1e-3)),
            loss='mean_squared_error')
        return model


### Train and Evaluate the Model for Each Ticker

In [10]:
# Train and evaluate the model for each ticker
tickers = merged_df['ticker'].unique()

# Window length shared by every ticker. Kept as a module-level name because
# code outside this loop (e.g. the hypermodel's input shape) may read it.
time_steps = 30

for ticker in tickers:
    ticker_df = merged_df[merged_df['ticker'] == ticker].dropna()

    # Create sequences and labels
    X, y = create_sequences(ticker_df, time_steps)

    # Split the data into training and testing sets.
    # shuffle=False keeps row order, so the last 20% of windows is the hold-out.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Initialize RandomSearch Tuner
    # NOTE(review): `overwrite` is left at its default, so an existing
    # my_dir/<project_name> is reloaded (the recorded output shows
    # "Reloading Tuner from ..."). Delete that directory to force a new search.
    tuner = kt.RandomSearch(
        LSTMHyperModel(),
        objective='val_loss',
        max_trials=20,
        executions_per_trial=1,
        directory='my_dir',
        project_name=f'lstm_stock_model_{ticker}'
    )

    # Perform hyperparameter search
    # NOTE(review): the same hold-out split is used for hyperparameter
    # selection, early stopping and the final predictions, so the reported
    # validation loss is optimistic.
    tuner.search(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1,
        callbacks=[EarlyStopping(monitor='val_loss', patience=5)],
    )

    # Retrieve the best hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    print(f"Ticker: {ticker}")
    print(f"The hyperparameter search is complete. The optimal number of layers is {best_hps.get('num_layers')}.")
    for i in range(best_hps.get('num_layers')):
        print(f"Layer {i + 1}: {best_hps.get(f'units_{i}')} units")
    print(f"The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.")

    # Build the model with the optimal hyperparameters
    model = tuner.hypermodel.build(best_hps)

    # Train the model.
    # restore_best_weights=True rolls the model back to the epoch with the
    # lowest val_loss; without it the weights that get saved and used for the
    # predictions below are those of the last epoch, i.e. `patience` epochs
    # past the best one.
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1,
        callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)],
    )

    # Create directory for ticker if it doesn't exist
    os.makedirs(f'models/{ticker}', exist_ok=True)

    # Save the model
    model.save(f'models/{ticker}/lstm_stock_model_best_{ticker}.h5')

    # Save the scalers (the same two scaler objects are written for every ticker)
    joblib.dump(price_scaler, f'models/{ticker}/{ticker}_price_scaler.pkl')
    joblib.dump(feature_scaler, f'models/{ticker}/{ticker}_feature_scaler.pkl')

    # Save the history for plotting
    with open(f'models/{ticker}/history_{ticker}.pkl', 'wb') as file:
        joblib.dump(history.history, file)

    # Make predictions
    y_pred = model.predict(X_test)

    # Inverse transform the predictions and the actual values
    y_pred = price_scaler.inverse_transform(y_pred)
    y_test = price_scaler.inverse_transform(y_test.reshape(-1, 1))

    # Save the predictions for plotting
    np.save(f'models/{ticker}/y_test_{ticker}.npy', y_test)
    np.save(f'models/{ticker}/y_pred_{ticker}.npy', y_pred)


Reloading Tuner from my_dir\lstm_stock_model_AAPL\tuner0.json
Ticker: AAPL
The hyperparameter search is complete. The optimal number of layers is 1.
Layer 1: 288 units
The optimal learning rate for the optimizer is 0.0020813994011954795.
Epoch 1/50


  super().__init__(**kwargs)


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 41ms/step - loss: 0.0013 - val_loss: 3.3460e-04
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 7.5810e-05 - val_loss: 2.0958e-04
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 6.5130e-05 - val_loss: 1.3651e-04
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - loss: 4.2135e-05 - val_loss: 1.0275e-04
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 4.0367e-05 - val_loss: 2.0218e-04
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 5.8996e-05 - val_loss: 9.7610e-05
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - loss: 4.1843e-05 - val_loss: 8.9712e-05
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - loss: 2.8758e-05 - val_loss: 3.7544e-04



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Reloading Tuner from my_dir\lstm_stock_model_GOOGL\tuner0.json
Ticker: GOOGL
The hyperparameter search is complete. The optimal number of layers is 3.
Layer 1: 32 units
Layer 2: 32 units
Layer 3: 64 units
The optimal learning rate for the optimizer is 0.0035065850163276276.
Epoch 1/50


  super().__init__(**kwargs)


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - loss: 0.0032 - val_loss: 4.5146e-04
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 9.2182e-05 - val_loss: 2.4638e-04
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 8.4431e-05 - val_loss: 4.1795e-04
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 1.2955e-04 - val_loss: 2.3654e-04
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - loss: 9.2540e-05 - val_loss: 3.1006e-04
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 7.1662e-05 - val_loss: 1.6592e-04
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - loss: 7.6987e-05 - val_loss: 1.5983e-04
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - loss: 9.5368e-05 - val_loss: 2.2212e-04



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step
Reloading Tuner from my_dir\lstm_stock_model_MSFT\tuner0.json
Ticker: MSFT
The hyperparameter search is complete. The optimal number of layers is 2.
Layer 1: 480 units
Layer 2: 32 units
The optimal learning rate for the optimizer is 0.004079496697488148.
Epoch 1/50


  super().__init__(**kwargs)


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 55ms/step - loss: 0.0866 - val_loss: 0.0052
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 44ms/step - loss: 3.1347e-04 - val_loss: 0.0034
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - loss: 2.6904e-04 - val_loss: 0.0029
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - loss: 2.3072e-04 - val_loss: 0.0023
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - loss: 2.0630e-04 - val_loss: 0.0011
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - loss: 3.0139e-04 - val_loss: 0.0024
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 42ms/step - loss: 2.3608e-04 - val_loss: 0.0011
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 43ms/step - loss: 1.7598e-04 - val_loss: 6.3337e-04
Epoch 9/50
[1m60/60[0m [



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step
Reloading Tuner from my_dir\lstm_stock_model_AMZN\tuner0.json
Ticker: AMZN
The hyperparameter search is complete. The optimal number of layers is 3.
Layer 1: 64 units
Layer 2: 192 units
Layer 3: 128 units
The optimal learning rate for the optimizer is 0.00390082207002489.


  super().__init__(**kwargs)


Epoch 1/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 64ms/step - loss: 0.0152 - val_loss: 5.4507e-04
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 63ms/step - loss: 3.0260e-04 - val_loss: 3.8514e-04
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 67ms/step - loss: 1.6479e-04 - val_loss: 1.7283e-04
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 67ms/step - loss: 1.6813e-04 - val_loss: 5.4747e-04
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 73ms/step - loss: 2.9304e-04 - val_loss: 2.4356e-04
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 71ms/step - loss: 1.4418e-04 - val_loss: 1.3281e-04
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 70ms/step - loss: 8.7106e-05 - val_loss: 3.0747e-04
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 70ms/step - loss: 2.2336e-04 - val_loss:



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step
Reloading Tuner from my_dir\lstm_stock_model_TSLA\tuner0.json
Ticker: TSLA
The hyperparameter search is complete. The optimal number of layers is 3.
Layer 1: 160 units
Layer 2: 352 units
Layer 3: 160 units
The optimal learning rate for the optimizer is 0.00034786388011680107.
Epoch 1/50


  super().__init__(**kwargs)


[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 140ms/step - loss: 0.0131 - val_loss: 0.0059
Epoch 2/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 136ms/step - loss: 0.0015 - val_loss: 0.0031
Epoch 3/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 135ms/step - loss: 0.0014 - val_loss: 0.0020
Epoch 4/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 123ms/step - loss: 0.0010 - val_loss: 0.0017
Epoch 5/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 123ms/step - loss: 8.6364e-04 - val_loss: 0.0015
Epoch 6/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 121ms/step - loss: 0.0011 - val_loss: 0.0031
Epoch 7/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 122ms/step - loss: 7.6213e-04 - val_loss: 0.0025
Epoch 8/50
[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 120ms/step - loss: 6.0977e-04 - val_loss: 0.0017
Epoch 9/50
[1m60/60[0m [32m━━━━━━━━



[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 78ms/step


### Plotting the Results

In [15]:
# Plotting the training and validation loss
def plot_training_validation_loss(ticker):
    """Save the training and validation loss curves of ``ticker`` as a PNG.

    Loads the history object stored at ``models/{ticker}/history_{ticker}.pkl``
    (indexed by ``'loss'`` and ``'val_loss'``) and writes the figure to
    ``models/{ticker}/training_validation_loss_{ticker}.png``. The figure is
    closed after saving, so nothing is displayed inline.
    """
    with open(f'models/{ticker}/history_{ticker}.pkl', 'rb') as history_file:
        loss_curves = joblib.load(history_file)

    fig, ax = plt.subplots(figsize=(14, 5))
    for key, legend_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
        ax.plot(loss_curves[key], label=legend_label)
    ax.set_title(f'Training and Validation Loss for {ticker}')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Loss')
    ax.legend()
    fig.savefig(f'models/{ticker}/training_validation_loss_{ticker}.png')  # Save the plot
    plt.close(fig)

# Plotting the stock price prediction
def plot_stock_price_prediction(ticker):
    """Save an actual-vs-predicted price chart for ``ticker`` as a PNG.

    Loads ``models/{ticker}/y_test_{ticker}.npy`` and
    ``models/{ticker}/y_pred_{ticker}.npy`` and writes the figure to
    ``models/{ticker}/stock_price_prediction_{ticker}.png``. The figure is
    closed after saving, so nothing is displayed inline.
    """
    ticker_dir = f'models/{ticker}'
    actual = np.load(f'{ticker_dir}/y_test_{ticker}.npy')
    predicted = np.load(f'{ticker_dir}/y_pred_{ticker}.npy')

    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(actual, color='blue', label='Actual Stock Price')
    ax.plot(predicted, color='red', label='Predicted Stock Price')
    ax.set_title(f'Stock Price Prediction for {ticker}')
    ax.set_xlabel('Time')
    ax.set_ylabel('Stock Price')
    ax.legend()
    fig.savefig(f'{ticker_dir}/stock_price_prediction_{ticker}.png')  # Save the plot
    plt.close(fig)

# Render and save both diagnostic figures (loss curves, then price
# prediction) for every ticker, in the order the tickers appear in `tickers`.
for ticker in tickers:
    for save_figure in (plot_training_validation_loss, plot_stock_price_prediction):
        save_figure(ticker)


# Load the dataset
# NOTE(review): this repeats the load/normalize step from the
# "Load and Normalize the Dataset" section earlier in the notebook.
# The CSV location can be overridden with the MERGED_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# the original absolute path is kept as the fallback.
file_path = os.environ.get(
    'MERGED_DATA_PATH',
    'D:\\Github Mikezxc\\Big-data-stock-real-time-platform\\merged_data_with_ma.csv',
)
merged_df = pd.read_csv(file_path)

# Normalize data
# NOTE(review): both scalers are fitted on every row of the file, i.e. on all
# tickers together and on rows that later end up in the hold-out split. The
# min/max therefore carry information from the evaluation period - confirm
# this is acceptable or fit on the training portion only.
price_scaler = MinMaxScaler()
merged_df[['close']] = price_scaler.fit_transform(merged_df[['close']])

# A separate scaler for the moving averages so `close` can be inverse-transformed
# on its own later on.
feature_scaler = MinMaxScaler()
merged_df[['MA30', 'MA90']] = feature_scaler.fit_transform(merged_df[['MA30', 'MA90']])

# Prepare sequences for LSTM
def create_sequences(df, time_steps=30):
    """Build sliding-window samples for next-step prediction of ``close``.

    Each sample is ``time_steps`` consecutive rows of the ``close``, ``MA30``
    and ``MA90`` columns; its label is the ``close`` value of the row that
    immediately follows the window.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns ``close``, ``MA30`` and ``MA90``. Rows
        are addressed by position, so index labels are irrelevant; they are
        assumed to already be in chronological order.
    time_steps : int, default 30
        Number of rows in each input window.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` has shape ``(n, time_steps, 3)`` and ``y`` has
        shape ``(n,)`` with ``n = len(df) - time_steps``. When the frame has
        too few rows for a single window, correctly shaped empty arrays
        (``n == 0``) are returned.
    """
    feature_columns = ['close', 'MA30', 'MA90']

    # Convert once up front: selecting the columns inside the loop built a new
    # DataFrame on every iteration.
    features = df[feature_columns].to_numpy()
    closes = df['close'].to_numpy()

    n_samples = len(df) - time_steps
    if n_samples <= 0:
        # Not enough rows for even one window: keep the rank of X consistent
        # so callers can still inspect X.shape[1:].
        empty_X = np.empty((0, time_steps, len(feature_columns)), dtype=features.dtype)
        empty_y = np.empty((0,), dtype=closes.dtype)
        return empty_X, empty_y

    sequences = np.array([features[i:i + time_steps] for i in range(n_samples)])
    labels = np.array([closes[i + time_steps] for i in range(n_samples)])
    return sequences, labels

# Define the LSTM HyperModel
class LSTMHyperModel(kt.HyperModel):
    """Keras Tuner hypermodel for a stacked-LSTM regressor with one output.

    Search space:
      * ``num_layers``    - number of stacked LSTM layers, 1 to 3
      * ``units_{i}``     - width of LSTM layer ``i``, 32 to 512 in steps of 32
      * ``learning_rate`` - Adam learning rate, log-sampled in [1e-4, 1e-2]

    Parameters
    ----------
    time_steps : int or None, default None
        Length of each input window. ``None`` leaves the time axis
        unspecified, so the model no longer depends on a module-level
        ``time_steps`` variable being defined before ``build`` is called.
    n_features : int, default 3
        Number of values per time step.
    name, tunable
        Forwarded unchanged to ``kt.HyperModel``.
    """

    def __init__(self, time_steps=None, n_features=3, name=None, tunable=True):
        super().__init__(name=name, tunable=tunable)
        self.time_steps = time_steps
        self.n_features = n_features

    def build(self, hp):
        """Return a compiled ``Sequential`` model for the hyperparameters in ``hp``."""
        # Query once; the value is reused for the loop bound and for deciding
        # which layer is the last one.
        num_layers = hp.Int('num_layers', 1, 3)

        model = Sequential()
        # Declare the input shape once with an explicit Input layer instead of
        # passing `input_shape` to every LSTM layer.
        model.add(Input(shape=(self.time_steps, self.n_features)))
        for i in range(num_layers):
            model.add(LSTM(
                units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32),
                # Every layer except the last must emit the full sequence so
                # the next LSTM receives 3-D input.
                return_sequences=(i < num_layers - 1),
            ))
        model.add(Dense(1))  # Single regression output per sample
        model.compile(
            optimizer=tf.keras.optimizers.Adam(
                hp.Float('learning_rate', min_value=1e-4, max_value=1e-2, sampling='LOG', default=1e-3)),
            loss='mean_squared_error')
        return model

# Train and evaluate the model for each ticker
# NOTE(review): this cell repeats the per-ticker training loop from the
# "Train and Evaluate the Model for Each Ticker" section, but writes its
# artifacts to /mnt/data and shows the figures inline instead of saving them.
tickers = merged_df['ticker'].unique()

# Window length shared by every ticker. Kept as a module-level name because
# code outside this loop (e.g. the hypermodel's input shape) may read it.
time_steps = 30

# model.save / joblib.dump do not create missing parent directories
output_dir = '/mnt/data'
os.makedirs(output_dir, exist_ok=True)

for ticker in tickers:
    ticker_df = merged_df[merged_df['ticker'] == ticker].dropna()

    # Create sequences and labels
    X, y = create_sequences(ticker_df, time_steps)

    # Split the data into training and testing sets.
    # shuffle=False keeps row order, so the last 20% of windows is the hold-out.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

    # Initialize RandomSearch Tuner
    # NOTE(review): `overwrite` is left at its default, so an existing
    # my_dir/<project_name> is reloaded rather than searched again.
    tuner = kt.RandomSearch(
        LSTMHyperModel(),
        objective='val_loss',
        max_trials=20,
        executions_per_trial=1,
        directory='my_dir',
        project_name=f'lstm_stock_model_{ticker}'
    )

    # Perform hyperparameter search
    # NOTE(review): the same hold-out split is used for hyperparameter
    # selection, early stopping and the final predictions, so the reported
    # validation loss is optimistic.
    tuner.search(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1,
        callbacks=[EarlyStopping(monitor='val_loss', patience=5)],
    )

    # Retrieve the best hyperparameters
    best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
    print(f"Ticker: {ticker}")
    print(f"The hyperparameter search is complete. The optimal number of layers is {best_hps.get('num_layers')}.")
    for i in range(best_hps.get('num_layers')):
        print(f"Layer {i + 1}: {best_hps.get(f'units_{i}')} units")
    print(f"The optimal learning rate for the optimizer is {best_hps.get('learning_rate')}.")

    # Build the model with the optimal hyperparameters
    model = tuner.hypermodel.build(best_hps)

    # Train the model.
    # restore_best_weights=True rolls the model back to the epoch with the
    # lowest val_loss; without it the weights that get saved and used for the
    # predictions below are those of the last epoch, i.e. `patience` epochs
    # past the best one.
    history = model.fit(
        X_train, y_train,
        epochs=50,
        batch_size=32,
        validation_data=(X_test, y_test),
        verbose=1,
        callbacks=[EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)],
    )

    # Save the model
    model.save(f'{output_dir}/lstm_stock_model_best_{ticker}.h5')

    # Save the scalers (the same two scaler objects are written for every ticker)
    joblib.dump(price_scaler, f'{output_dir}/{ticker}_price_scaler.pkl')
    joblib.dump(feature_scaler, f'{output_dir}/{ticker}_feature_scaler.pkl')

    # Plot the training and validation loss
    plt.figure(figsize=(14, 5))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Training and Validation Loss for {ticker}')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # Make predictions
    y_pred = model.predict(X_test)

    # Inverse transform the predictions and the actual values
    y_pred = price_scaler.inverse_transform(y_pred)
    y_test = price_scaler.inverse_transform(y_test.reshape(-1, 1))

    # Plot the results
    plt.figure(figsize=(14, 5))
    plt.plot(y_test, color='blue', label='Actual Stock Price')
    plt.plot(y_pred, color='red', label='Predicted Stock Price')
    plt.title(f'Stock Price Prediction for {ticker}')
    plt.xlabel('Time')
    plt.ylabel('Stock Price')
    plt.legend()
    plt.show()