<a href="https://colab.research.google.com/github/marcoapedroza/python-stock-analysis-ML-AI/blob/main/predictingStockPrices_AI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Predicting Stock Prices - Deep Learning Model

## Understand the problem statement and business case:
*   In this project, I will train a **deep neural network model** to predict future stock prices;
*   The AI model will be trained using **historical stock price** data along with the **volume** of transactions;
*   I will use a type of neural network known as a **Long Short-Term Memory (LSTM) network.**

- An **LSTM model** is a type of **Recurrent Neural Network (RNN)**;
- It has a **feedback loop** that is designed to take the temporal dimension into consideration;
- **Feedback loop:** a **hidden layer** whose output is fed back into itself as an input;
- **RNNs** allow us to work with sequences of **inputs, outputs, or both**.

### Import datasets and libraries:

In [None]:
import pandas as pd
import plotly.express as px
import seaborn as sns
from copy import copy
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import plotly.figure_factory as ff
import yfinance as yf
from tensorflow import keras
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Ten stocks (space-separated, as yfinance expects) plus the index ticker
tickers = 'ABEV3.SA B3SA3.SA BBDC3.SA BRKM5.SA CCRO3.SA ELET3.SA EMBR3.SA MGLU3.SA LAME4.SA PETR3.SA'
ticker = '^BVSP'

# Adjusted Close Stock Prices
bovespa = yf.download(ticker, start="2013-01-01")['Adj Close']
stock_prices = yf.download(tickers, start="2013-01-01")['Adj Close']

# Drop the rows in which every stock is NaN, then attach the index series by date
stock_prices = stock_prices.dropna(how='all').merge(bovespa, on='Date')

# Short labels are assigned by position, so this list has to follow the column
# order of the merged frame (note that LAME4 comes before MGLU3 here, unlike in `tickers`)
stock_prices.columns = ['ABEV3', 'B3SA3', 'BBDC3', 'BRKM5', 'CCRO3', 'ELET3', 'EMBR3', 'LAME4', 'MGLU3', 'PETR3', 'BOVESPA']
stock_prices

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  10 of 10 completed


Unnamed: 0_level_0,ABEV3,B3SA3,BBDC3,BRKM5,CCRO3,ELET3,EMBR3,LAME4,MGLU3,PETR3,BOVESPA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-02,12.468795,10.506537,11.001677,9.456411,13.490131,4.073360,13.784053,10.699188,0.362545,17.815228,62550.00000
2013-01-03,12.689861,10.506537,11.605797,9.657761,13.730900,4.224673,13.624968,10.699188,0.377499,18.479706,63312.00000
2013-01-04,12.480665,10.312937,11.577574,9.560557,13.971669,4.273094,13.344234,10.699188,0.375242,18.542559,62523.00000
2013-01-07,12.624581,10.260814,11.304679,9.435583,13.957504,4.230726,13.456529,10.699188,0.366495,18.300117,61933.00000
2013-01-08,12.609744,10.350169,11.267038,9.414756,13.950422,3.873626,12.885703,10.699188,0.363956,17.779306,61128.00000
...,...,...,...,...,...,...,...,...,...,...,...
2021-02-26,14.020000,54.299999,20.572170,31.260000,11.250000,32.410000,12.230000,24.910000,24.180000,22.150000,110035.00000
2021-03-01,13.770000,54.349998,19.902750,31.780001,10.920000,31.540001,12.210000,24.680000,24.549999,22.010000,110335.00000
2021-03-02,13.990000,56.009998,20.250000,30.510000,10.880000,31.920000,12.420000,24.250000,24.270000,21.910000,111540.00000
2021-03-03,14.170000,56.590000,20.660000,29.510000,11.030000,31.719999,12.430000,23.910000,25.120001,20.969999,111184.00000


In [None]:
# Checking null values
stock_prices.isnull().sum()

ABEV3      0
B3SA3      0
BBDC3      0
BRKM5      0
CCRO3      0
ELET3      0
EMBR3      0
LAME4      0
MGLU3      0
PETR3      0
BOVESPA    0
dtype: int64

In [None]:
stock_prices.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2018 entries, 2013-01-02 to 2021-03-04
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ABEV3    2018 non-null   float64
 1   B3SA3    2018 non-null   float64
 2   BBDC3    2018 non-null   float64
 3   BRKM5    2018 non-null   float64
 4   CCRO3    2018 non-null   float64
 5   ELET3    2018 non-null   float64
 6   EMBR3    2018 non-null   float64
 7   LAME4    2018 non-null   float64
 8   MGLU3    2018 non-null   float64
 9   PETR3    2018 non-null   float64
 10  BOVESPA  2018 non-null   float64
dtypes: float64(11)
memory usage: 189.2 KB


In [None]:
stock_prices.describe()

Unnamed: 0,ABEV3,B3SA3,BBDC3,BRKM5,CCRO3,ELET3,EMBR3,LAME4,MGLU3,PETR3,BOVESPA
count,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0
mean,15.383121,22.011867,16.988772,24.137012,12.910782,14.931697,18.185402,16.377329,4.303964,17.681822,70731.557684
std,2.420674,15.48459,5.495871,12.633479,2.000848,10.755015,4.80013,5.468622,6.809264,6.648659,21828.284796
min,10.694901,7.134462,8.493078,7.464851,6.946203,2.880558,6.03,8.767794,0.029213,5.541526,37497.0
25%,12.901473,9.715415,12.293642,12.247512,11.705986,4.831934,16.104088,12.594509,0.209049,12.939808,52442.5
50%,15.73772,16.564059,15.87484,22.024253,13.013364,13.092043,18.506442,15.358346,0.470104,15.851011,62840.5
75%,16.943798,29.620212,20.524171,32.287373,14.192291,21.412553,21.276763,18.445795,5.22533,22.303133,86945.5
max,22.276297,65.891556,31.260302,55.835045,19.331476,40.05455,29.471241,36.209534,27.421442,32.011372,125077.0


In [None]:
# Daily traded volume, fetched one ticker at a time and keyed by ticker symbol
ticker_list = ['ABEV3.SA', 'B3SA3.SA', 'BBDC3.SA', 'BRKM5.SA', 'CCRO3.SA', 'ELET3.SA', 'EMBR3.SA', 'LAME4.SA', 'MGLU3.SA', 'PETR3.SA', '^BVSP']
volume_by_ticker = {
  symbol: yf.Ticker(symbol).history(start="2013-01-01")['Volume']
  for symbol in ticker_list
}

# One column per ticker, relabelled by position (same order as ticker_list)
stock_volumes = pd.DataFrame(volume_by_ticker)
stock_volumes.columns = ['ABEV3', 'B3SA3', 'BBDC3', 'BRKM5', 'CCRO3', 'ELET3', 'EMBR3', 'LAME4', 'MGLU3', 'PETR3', 'BOVESPA']
stock_volumes

Unnamed: 0_level_0,ABEV3,B3SA3,BBDC3,BRKM5,CCRO3,ELET3,EMBR3,LAME4,MGLU3,PETR3,BOVESPA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-02,1126114.0,11064300.0,2602296.0,2804100.0,3288600.0,3071866.0,2206900.0,0.0,23622400.0,10258500.0,3739800.0
2013-01-03,1032105.0,8380900.0,4885249.0,2414300.0,2501600.0,3234228.0,2323400.0,0.0,30777600.0,7823900.0,3355800.0
2013-01-04,1488651.0,13990200.0,2992734.0,2541200.0,4681500.0,9874150.0,2179500.0,0.0,17756800.0,16028100.0,6233800.0
2013-01-07,1223124.0,7149400.0,2409758.0,1717400.0,3609400.0,4051739.0,1242600.0,0.0,23110400.0,10473900.0,3985800.0
2013-01-08,1103112.0,12690100.0,3090168.0,1309400.0,4516000.0,3794841.0,3511500.0,0.0,27945600.0,10505300.0,3840600.0
...,...,...,...,...,...,...,...,...,...,...,...
2021-02-26,33969400.0,29567900.0,10127100.0,4101200.0,19692800.0,8549300.0,17922400.0,13418000.0,45690100.0,47419800.0,14618500.0
2021-03-01,19850200.0,14192200.0,6306500.0,5013800.0,8929400.0,9211600.0,14282400.0,8992500.0,22131200.0,40852500.0,11239900.0
2021-03-02,42904000.0,35949300.0,10606200.0,4770600.0,17453500.0,10829700.0,16942600.0,14144900.0,35351400.0,52848600.0,14823200.0
2021-03-03,40193500.0,31255500.0,11874800.0,6160000.0,14031200.0,8120500.0,18422000.0,22545000.0,47195300.0,55664000.0,15144300.0


In [None]:
stock_volumes.isna().sum()

ABEV3      1
B3SA3      1
BBDC3      1
BRKM5      1
CCRO3      1
ELET3      1
EMBR3      1
LAME4      1
MGLU3      1
PETR3      1
BOVESPA    7
dtype: int64

In [None]:
# Keep only the dates on which every ticker has a volume value
# (dropna() with its default arguments removes a row if any column is NaN)
stock_volumes = stock_volumes.dropna()
stock_volumes

Unnamed: 0_level_0,ABEV3,B3SA3,BBDC3,BRKM5,CCRO3,ELET3,EMBR3,LAME4,MGLU3,PETR3,BOVESPA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2013-01-02,1126114.0,11064300.0,2602296.0,2804100.0,3288600.0,3071866.0,2206900.0,0.0,23622400.0,10258500.0,3739800.0
2013-01-03,1032105.0,8380900.0,4885249.0,2414300.0,2501600.0,3234228.0,2323400.0,0.0,30777600.0,7823900.0,3355800.0
2013-01-04,1488651.0,13990200.0,2992734.0,2541200.0,4681500.0,9874150.0,2179500.0,0.0,17756800.0,16028100.0,6233800.0
2013-01-07,1223124.0,7149400.0,2409758.0,1717400.0,3609400.0,4051739.0,1242600.0,0.0,23110400.0,10473900.0,3985800.0
2013-01-08,1103112.0,12690100.0,3090168.0,1309400.0,4516000.0,3794841.0,3511500.0,0.0,27945600.0,10505300.0,3840600.0
...,...,...,...,...,...,...,...,...,...,...,...
2021-02-26,33969400.0,29567900.0,10127100.0,4101200.0,19692800.0,8549300.0,17922400.0,13418000.0,45690100.0,47419800.0,14618500.0
2021-03-01,19850200.0,14192200.0,6306500.0,5013800.0,8929400.0,9211600.0,14282400.0,8992500.0,22131200.0,40852500.0,11239900.0
2021-03-02,42904000.0,35949300.0,10606200.0,4770600.0,17453500.0,10829700.0,16942600.0,14144900.0,35351400.0,52848600.0,14823200.0
2021-03-03,40193500.0,31255500.0,11874800.0,6160000.0,14031200.0,8120500.0,18422000.0,22545000.0,47195300.0,55664000.0,15144300.0


In [None]:
stock_volumes.describe()

Unnamed: 0,ABEV3,B3SA3,BBDC3,BRKM5,CCRO3,ELET3,EMBR3,LAME4,MGLU3,PETR3,BOVESPA
count,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0,2018.0
mean,15485070.0,11935930.0,3250898.0,2153238.0,6526745.0,3369251.0,4358928.0,4581355.0,40545340.0,14187080.0,4721757.0
std,14474560.0,9097548.0,2537588.0,2085193.0,5532530.0,3070226.0,6083609.0,3999642.0,31486740.0,13450560.0,2584210.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8658622.0,7421600.0,1874830.0,1225575.0,3840450.0,1661731.0,1768975.0,2282546.0,22018400.0,7563800.0,3194525.0
50%,12699400.0,10169500.0,2625311.0,1704650.0,5367000.0,2642903.0,2514100.0,3306794.0,33830600.0,11117200.0,3902400.0
75%,19538880.0,14040080.0,3960135.0,2494400.0,7744225.0,4081362.0,3902050.0,5594413.0,49814900.0,16549720.0,5188125.0
max,287695200.0,207270300.0,61121540.0,54813260.0,125070000.0,47532200.0,125473100.0,64140200.0,430464000.0,310419400.0,21768700.0


# Preparing the data before training the AI model:



*   Data set is divided into 70% for training and 30% for testing:
 * Training Set: used for model training;
 * Testing Set: used for testing trained model. It has never been seen by the trained model before.

In [None]:
# Function to concatenate the stock price, and volume in one dataframe
def concat_stock(price, volume, name):
  """Build a two-column frame for a single stock.

  Args:
    price: DataFrame of prices with one column per stock.
    volume: DataFrame of volumes with one column per stock.
    name: Column label of the stock to extract from both frames.

  Returns:
    DataFrame with a 'Close' column taken from `price[name]` and a
    'Volume' column taken from `volume[name]`.
  """
  columns = {
    'Close': price[name],
    'Volume': volume[name],
  }
  return pd.DataFrame(columns)

In [None]:
# Today, the target stock price will be tomorrow's price
# Function that returns a target data for AI/ML model
def target(data, n=1):
  """Add a 'Target' column holding the 'Close' value `n` rows ahead.

  Args:
    data: DataFrame with a 'Close' column. It is modified in place.
    n: Number of rows to look ahead (default 1, i.e. the next row).

  Returns:
    The same DataFrame object, now with a 'Target' column. The last `n`
    rows have no future value available, so their 'Target' is NaN.
  """
  # A negative shift moves the values backwards, so row t receives the close of row t + n
  data['Target'] = data['Close'].shift(-n)

  return data

In [None]:
# Testing the function and getting individual stock prices and volumes for CCRO3
grupoCCR = concat_stock(stock_prices, stock_volumes, 'CCRO3')
grupoCCR

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-02,13.490131,3288600.0
2013-01-03,13.730900,2501600.0
2013-01-04,13.971669,4681500.0
2013-01-07,13.957504,3609400.0
2013-01-08,13.950422,4516000.0
...,...,...
2021-02-26,11.250000,19692800.0
2021-03-01,10.920000,8929400.0
2021-03-02,10.880000,17453500.0
2021-03-03,11.030000,14031200.0


In [None]:
# Getting the close and volume data as training data (input) - PRICE AND VOLUME
trainingCCR = grupoCCR.values
trainingCCR

array([[1.34901314e+01, 3.28860000e+06],
       [1.37308998e+01, 2.50160000e+06],
       [1.39716692e+01, 4.68150000e+06],
       ...,
       [1.08800001e+01, 1.74535000e+07],
       [1.10299997e+01, 1.40312000e+07],
       [1.14899998e+01, 4.52120000e+06]])

In [None]:
# Normalize each feature to the [0, 1] range
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training portion only, so that the min/max of the test
# period do not leak into the training data. The window builder pairs row i-1
# (input) with row i (target) and the first 70% of those pairs are used for
# training, which touches raw rows 0..n_train_pairs inclusive.
train_fraction = 0.7  # keep in sync with the train/test split further down
n_train_pairs = int(train_fraction * (len(trainingCCR) - 1))

sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(trainingCCR[:n_train_pairs + 1])

# The whole series is transformed with the training-period statistics;
# test-period values may therefore fall outside [0, 1]
trainingCCR_set_scaled = sc.transform(trainingCCR)
trainingCCR_set_scaled

array([[0.52836366, 0.02629407],
       [0.54780356, 0.0200016 ],
       [0.56724353, 0.03743103],
       ...,
       [0.31761891, 0.13954983],
       [0.32973004, 0.11218676],
       [0.36687092, 0.03614935]])

In [None]:
# Building the samples: each input is the previous day's scaled close price
# (column 0 only) and each target is the scaled close price of the current day
sample_days = range(1, len(grupoCCR))

X = [trainingCCR_set_scaled[day-1:day, 0] for day in sample_days]
y = [trainingCCR_set_scaled[day, 0] for day in sample_days]

X[0:5]

[array([0.52836366]),
 array([0.54780356]),
 array([0.56724353]),
 array([0.56609984]),
 array([0.56552803])]

In [None]:
y[0:5]

[0.5478035558909363,
 0.5672435300187112,
 0.5660998390686,
 0.5655280320938793,
 0.5672435300187112]

In [None]:
# To convert the data into array format
X = np.asarray(X)
y = np.asarray(y)
print(f'X shape:{X.shape} \ny shape:{y.shape}')

X shape:(2017, 1) 
y shape:(2017,)


In [None]:
# Chronological split without shuffling: the first 70% of the samples are used
# for training and the remaining 30% for testing
split = int(0.7 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]
print(f'X_train shape:{X_train.shape} \ny_train shape:{y_train.shape}')

X_train shape:(1411, 1) 
y_train shape:(1411,)


In [None]:
print(f'X_test shape:{X_test.shape} \ny_test shape:{y_test.shape}')

X_test shape:(606, 1) 
y_test shape:(606,)


In [None]:
# Reshaping the inputs from 2D to 3D (samples, time steps, 1) to feed the LSTM
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)
X_train.shape, X_test.shape

((1411, 1, 1), (606, 1, 1))

In [None]:
# Creating the model: three stacked LSTM layers followed by a linear output unit
inputs = keras.layers.Input(shape = X_train.shape[1:3])

# Every LSTM layer returns its full output sequence, so the time-step axis is
# kept all the way to the output
x = inputs
for layer_number in range(3):
  x = keras.layers.LSTM(150, return_sequences=True)(x)

# Single linear unit producing one value per time step
outputs = keras.layers.Dense(1, activation='linear')(x)

model = keras.Model(inputs = inputs, outputs = outputs)
model.compile(optimizer = 'adam', loss = 'mse')
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 1, 1)]            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 1, 150)            91200     
_________________________________________________________________
lstm_4 (LSTM)                (None, 1, 150)            180600    
_________________________________________________________________
lstm_5 (LSTM)                (None, 1, 150)            180600    
_________________________________________________________________
dense_1 (Dense)              (None, 1, 1)              151       
Total params: 452,551
Trainable params: 452,551
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training the model: 20 epochs with a batch size of 32, with 20% of the
# training samples set aside for validation (validation_split=0.2)
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Making a prediction over the whole series (training and testing samples alike).
# X is still 2D at this point, so it is given the same 3D layout
# (samples, time steps, 1) that X_train / X_test were reshaped to for the LSTM.
predicted = model.predict(np.reshape(X, (X.shape[0], X.shape[1], 1)))

In [None]:
# Frame indexed by every date except the first one. .copy() makes it an
# independent frame, so assigning columns to it later does not write into a
# slice of grupoCCR.
df_predicted = grupoCCR[['Close']].iloc[1:].copy()


# trainingCCR_set_scaled - where the original prices were (column 0, scaled)
close = list(trainingCCR_set_scaled[:, 0])


# One predicted value per sample, taken from position [0][0] of each prediction
test_predicted = list(predicted[:, 0, 0])


In [None]:
df_predicted['Close'] = close[1:]
df_predicted['Predictions'] = test_predicted
df_predicted

Unnamed: 0_level_0,Close,Predictions
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2013-01-03,0.547804,0.530223
2013-01-04,0.567244,0.550071
2013-01-07,0.566100,0.570087
2013-01-08,0.565528,0.568905
2013-01-09,0.567244,0.568314
...,...,...
2021-02-26,0.347493,0.371113
2021-03-01,0.320849,0.353656
2021-03-02,0.317619,0.328877
2021-03-03,0.329730,0.325895


In [None]:
# Function to plot interactive plots using Plotly Express
def interactive_plot(df, title):
  """Show every column of `df` as its own trace on a single Plotly figure.

  Args:
    df: DataFrame whose index is used for the x axis and whose columns are
      drawn one trace each, named after the column.
    title: Title of the figure.
  """
  # Start from an empty line figure that only carries the title
  figure = px.line(title = title)

  for column_name in df.columns:
    figure.add_scatter(x = df.index, y = df[column_name], name = column_name)

  figure.show()

In [None]:
interactive_plot(df_predicted, 'Original Price vs. LSTM Predictions')