# Installing and Importing the required Library for yfinance data

In [1]:

!pip install yfinance --quiet

In [2]:

import yfinance as yf
import pandas as pd

In [3]:

def fetch_stock_data(ticker="SPY", period="3y", interval="1d"):
    print(f"Fetching data for {ticker}...")
    stock_data = yf.download(ticker, period=period, interval=interval)
    print(f"Data fetched: {len(stock_data)} rows.")
    return stock_data


def save_to_csv(data, filename="stock_data.csv"):
    data.to_csv(filename)
    print(f"Data saved to {filename}")

spy_data = fetch_stock_data()
save_to_csv(spy_data)


spy_data.head()


Fetching data for SPY...


  stock_data = yf.download(ticker, period=period, interval=interval)
[*********************100%***********************]  1 of 1 completed

Data fetched: 753 rows.
Data saved to stock_data.csv





Price,Close,High,Low,Open,Volume
Ticker,SPY,SPY,SPY,SPY,SPY
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2022-08-09,394.24707,395.588856,393.164048,395.080893,44931800
2022-08-10,402.527863,402.671649,399.393831,401.368179,68665700
2022-08-11,402.527863,407.281659,401.780294,405.40313,59489700
2022-08-12,409.342255,409.447667,403.524623,404.483046,61694500
2022-08-15,411.029114,411.556264,407.051666,407.109169,54048300


In [None]:

from sklearn.preprocessing import MinMaxScaler

In [4]:

print("Checking for missing values...")
print(spy_data.isnull().sum())
spy_data = spy_data.dropna()

Checking for missing values...
Price   Ticker
Close   SPY       0
High    SPY       0
Low     SPY       0
Open    SPY       0
Volume  SPY       0
dtype: int64


In [5]:

selected_features = ['Open', 'High', 'Low', 'Close', 'Volume']
processed_data = spy_data[selected_features]


processed_data['Daily Return'] = processed_data['Close'].pct_change()
processed_data = processed_data.dropna()  # Remove NaN values caused by pct_change()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data['Daily Return'] = processed_data['Close'].pct_change()


In [7]:

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalized_data = pd.DataFrame(scaler.fit_transform(processed_data),
                               columns=processed_data.columns,
                               index=processed_data.index)

# Save the preprocessed data to a CSV file for later use
normalized_data.to_csv("preprocessed_stock_data.csv")
print("Preprocessed data saved to preprocessed_stock_data.csv")

# Display a preview of the normalized data
normalized_data.head()

Preprocessed data saved to preprocessed_stock_data.csv


Price,Open,High,Low,Close,Volume,Daily Return
Ticker,SPY,SPY,SPY,SPY,SPY,Unnamed: 6_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2022-08-10,0.21524,0.1921,0.214242,0.201975,0.184839,0.486341
2022-08-11,0.228539,0.207803,0.222183,0.201975,0.145041,0.357924
2022-08-12,0.225507,0.215181,0.227987,0.22515,0.154603,0.461426
2022-08-15,0.234163,0.222363,0.239723,0.230887,0.12144,0.383119
2022-08-16,0.243513,0.229937,0.246643,0.233625,0.14417,0.369899


In [None]:

import numpy as np

In [8]:

def calculate_rsi(data, window=14):
    delta = data['Close'].diff(1)
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)


    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()

    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))

    return rsi

normalized_data['RSI'] = calculate_rsi(normalized_data)

def calculate_macd(data, short_window=12, long_window=26, signal_window=9):
    short_ema = data['Close'].ewm(span=short_window, adjust=False).mean()
    long_ema = data['Close'].ewm(span=long_window, adjust=False).mean()
    macd = short_ema - long_ema
    signal_line = macd.ewm(span=signal_window, adjust=False).mean()
    return macd, signal_line

normalized_data['MACD'], normalized_data['Signal_Line'] = calculate_macd(processed_data)

def calculate_bollinger_bands(data, window=20):
    sma = data['Close'].rolling(window=window).mean()  # Simple Moving Average
    std = data['Close'].rolling(window=window).std()
    upper_band = sma + (2 * std)
    lower_band = sma - (2 * std)
    return upper_band, lower_band

normalized_data['Upper_Band'], normalized_data['Lower_Band'] = calculate_bollinger_bands(processed_data)

In [9]:

normalized_data = normalized_data.dropna()

In [10]:
# Save the feature-engineered data to a CSV file
normalized_data.to_csv("feature_engineered_data.csv")
print("Feature-engineered data saved to feature_engineered_data.csv")

# Display a preview of the dataset with new features
normalized_data.head()


Feature-engineered data saved to feature_engineered_data.csv


Price,Open,High,Low,Close,Volume,Daily Return,RSI,MACD,Signal_Line,Upper_Band,Lower_Band
Ticker,SPY,SPY,SPY,SPY,SPY,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2022-09-07,0.125683,0.121746,0.129668,0.129582,0.194808,0.467761,26.78897,-6.913493,-4.727962,420.785796,368.928496
2022-09-08,0.141351,0.129157,0.142169,0.138056,0.237562,0.397886,28.355621,-6.55318,-5.093006,420.037142,367.797682
2022-09-09,0.16457,0.150867,0.168766,0.158331,0.219716,0.452906,37.863195,-5.720648,-5.218534,419.134063,367.417435
2022-09-12,0.18365,0.164644,0.1879,0.172574,0.187401,0.423633,48.003312,-4.669083,-5.108644,417.238796,367.766761
2022-09-13,0.161695,0.13647,0.135153,0.114328,0.420269,0.092079,37.413375,-5.158248,-5.118565,414.860902,366.717335


In [11]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

In [14]:
# Step 4: Prepare data for LSTM
window_length = 60 # Define the window length

X = []
y = []

for i in range(window_length, len(normalized_data)):
    X.append(normalized_data.iloc[i - window_length:i, :-1].values) # Use iloc to access rows and .values to get numpy array
    y.append(normalized_data.iloc[i, 3]) # Use iloc to access rows

X = np.array(X)
y = np.array(y)

# Reshape X for feature scaling
X_reshaped = X.reshape(-1, X.shape[2])

feature_scaler = MinMaxScaler(feature_range=(0, 1))
X_scaled = feature_scaler.fit_transform(X_reshaped).reshape(X.shape)  # Reshape back to original shape

target_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaled = target_scaler.fit_transform(y.reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, shuffle=False)

In [15]:

model = Sequential()

model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dropout(0.2))
model.add(Dense(units=1))

  super().__init__(**kwargs)


In [16]:
# 5. Compile the model with a suitable optimizer (Adam) and add MAE as a metric
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# 6. Train the model
# Step 5: Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 96ms/step - loss: 0.1012 - mae: 0.2405 - val_loss: 0.0053 - val_mae: 0.0605
Epoch 2/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 101ms/step - loss: 0.0092 - mae: 0.0721 - val_loss: 0.0093 - val_mae: 0.0861
Epoch 3/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 74ms/step - loss: 0.0060 - mae: 0.0604 - val_loss: 0.0073 - val_mae: 0.0751
Epoch 4/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 69ms/step - loss: 0.0049 - mae: 0.0529 - val_loss: 0.0088 - val_mae: 0.0833
Epoch 5/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 0.0046 - mae: 0.0524 - val_loss: 0.0062 - val_mae: 0.0683
Epoch 6/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 67ms/step - loss: 0.0051 - mae: 0.0536 - val_loss: 0.0048 - val_mae: 0.0590
Epoch 7/50
[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 65ms/step - loss: 

<keras.src.callbacks.history.History at 0x7e59e899a210>

In [17]:

predictions = model.predict(X_test)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 92ms/step


In [18]:

predictions_reshaped = predictions.reshape(-1, 1)
predicted_prices = target_scaler.inverse_transform(predictions_reshaped)  # Reverse the scaling for the target

In [19]:
from sklearn.metrics import mean_squared_error
import math
# Step 8: Evaluate the model
rmse = math.sqrt(mean_squared_error(target_scaler.inverse_transform(y_test.reshape(-1, 1)), predicted_prices))
print(f"Root Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 0.05594441192630748


# Comparing our model to other predictive models

Random Forest Model of similar complexity

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Step 4.1: Train Random Forest (RF)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)  # Flatten X_train for RF


  return fit_method(estimator, *args, **kwargs)


In [25]:
# Make predictions
rf_predictions = rf_model.predict(X_test.reshape(X_test.shape[0], -1))  # Flatten X_test for RF

# Inverse transform the predictions
rf_predicted_prices = target_scaler.inverse_transform(rf_predictions.reshape(-1, 1))

# Evaluate the model
rf_rmse = math.sqrt(mean_squared_error(target_scaler.inverse_transform(y_test.reshape(-1, 1)), rf_predicted_prices))
print(f"Random Forest RMSE: {rf_rmse}")

Random Forest RMSE: 0.06766974417551948


Support Vector Machine with simialr complexity

In [26]:
# Step 4.2: Train Support Vector Machine (SVM)
svm_model = SVR(kernel='rbf', C=100, epsilon=0.1)
svm_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)  # Flatten X_train for SVM

  y = column_or_1d(y, warn=True)


In [27]:
# Make predictions
svm_predictions = svm_model.predict(X_test.reshape(X_test.shape[0], -1))  # Flatten X_test for SVM

# Inverse transform the predictions
svm_predicted_prices = target_scaler.inverse_transform(svm_predictions.reshape(-1, 1))

# Evaluate the model
svm_rmse = math.sqrt(mean_squared_error(target_scaler.inverse_transform(y_test.reshape(-1, 1)), svm_predicted_prices))
print(f"SVM RMSE: {svm_rmse}")

SVM RMSE: 0.19396282936791898


Artificial Neural Netweorks with similar complexity

In [28]:
# Step 4.3: Train Artificial Neural Network (ANN)
ann_model = MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42)
ann_model.fit(X_train.reshape(X_train.shape[0], -1), y_train)  # Flatten X_train for ANN


  y = column_or_1d(y, warn=True)


In [29]:
# Make predictions
ann_predictions = ann_model.predict(X_test.reshape(X_test.shape[0], -1))  # Flatten X_test for ANN

# Inverse transform the predictions
ann_predicted_prices = target_scaler.inverse_transform(ann_predictions.reshape(-1, 1))

# Evaluate the model
ann_rmse = math.sqrt(mean_squared_error(target_scaler.inverse_transform(y_test.reshape(-1, 1)), ann_predicted_prices))
print(f"ANN RMSE: {ann_rmse}")

ANN RMSE: 0.06897950551738526


In [30]:
# Fetch the versions of installed libraries
import pkg_resources


# Libraries you want to include
required_libraries = ['pandas', 'numpy', 'matplotlib', 'seaborn', 'scikit-learn', 'tensorflow', 'yfinance']
# Fetch the Python version

# Fetch the installed version of each library
installed_versions = {pkg.key: pkg.version for pkg in pkg_resources.working_set if pkg.key in required_libraries}

# Write the specific libraries and versions to the requirements.txt file
with open("requirements.txt", "w") as f:
    for library, version in installed_versions.items():
        f.write(f"{library}=={version}\n")

print("Filtered requirements.txt file with selected libraries has been created.")



Filtered requirements.txt file with selected libraries has been created.


In [31]:
import sys
# Display Python version
sys.version

'3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]'

# Stock Price Prediction using LSTM

This repository contains a Jupyter Notebook that demonstrates how to predict stock prices using a Long Short-Term Memory (LSTM) neural network. The notebook fetches historical stock data, preprocesses it, engineers relevant features, builds and trains an LSTM model, and evaluates its performance. It also includes a comparison with other predictive models like Random Forest, Support Vector Machine (SVM), and Artificial Neural Network (ANN).

## Table of Contents

- [Project Description](#project-description)
- [Setup and Installation](#setup-and-installation)
- [Usage](#usage)
- [Project Structure](#project-structure)
- [Results](#results)
- [Contributing](#contributing)
- [License](#license)

## Project Description

This project aims to predict future stock prices based on historical data. It utilizes an LSTM model, which is well-suited for time series forecasting tasks. The process involves:

1.  Fetching historical stock data using the `yfinance` library.
2.  Preprocessing the data by handling missing values and normalizing the features.
3.  Feature engineering to create new relevant indicators like Daily Return, RSI, MACD, and Bollinger Bands.
4.  Building and training an LSTM model.
5.  Evaluating the LSTM model using appropriate metrics like RMSE.
6.  Comparing the LSTM model's performance with other models (Random Forest, SVM, ANN).

## Setup and Installation

To run this notebook, you need to have Python installed. You can then clone this repository and install the required libraries.

1.  **Clone the repository:**

In [32]:
    pip install -r requirements.txt



In [33]:
    pip install yfinance pandas scikit-learn tensorflow numpy matplotlib seaborn



In [34]:
    jupyter notebook stock_price_prediction.ipynb

SyntaxError: invalid syntax (ipython-input-332305198.py, line 1)

In [35]:
    jupyter lab stock_price_prediction.ipynb

SyntaxError: invalid syntax (ipython-input-1357665035.py, line 1)