## Installing Libraries

In [389]:
!pip install pandas_datareader
!pip install plotly
!pip install fredapi
!pip install xgboost

Collecting xgboost
  Obtaining dependency information for xgboost from https://files.pythonhosted.org/packages/24/ec/ad387100fa3cc2b9b81af0829b5ecfe75ec5bb19dd7c19d4fea06fb81802/xgboost-2.0.3-py3-none-win_amd64.whl.metadata
  Downloading xgboost-2.0.3-py3-none-win_amd64.whl.metadata (2.0 kB)
Downloading xgboost-2.0.3-py3-none-win_amd64.whl (99.8 MB)
   ---------------------------------------- 0.0/99.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/99.8 MB 320.0 kB/s eta 0:05:12
   ---------------------------------------- 0.1/99.8 MB 907.3 kB/s eta 0:01:50
   ---------------------------------------- 0.5/99.8 MB 3.3 MB/s eta 0:00:30
    --------------------------------------- 1.5/99.8 MB 7.9 MB/s eta 0:00:13
   - -------------------------------------- 3.2/99.8 MB 14.8 MB/s eta 0:00:07
   -- ------------------------------------- 5.0/99.8 MB 17.7 MB/s eta 0:00:06
   -- ------------------------------------- 6.6/99.8 MB 20.9 MB/s eta 0:00:05
   --- -------------------------

## Importing Libraries

In [400]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from pandas_datareader import DataReader
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import yfinance as yf
import warnings
import statsmodels.api as sm
warnings.filterwarnings("ignore")
import plotly.figure_factory as ff
from sklearn.model_selection import train_test_split
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error, r2_score

## Data Extraction

In [582]:
start_date = datetime(2019, 1, 1)
end_date = datetime(2023, 12, 31)
stock_nvda = 'NVDA'
stock = yf.download(stock_nvda, start_date, end_date)
df = pd.DataFrame(stock)
df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-02,32.660000,34.619999,32.512501,34.055000,33.788895,50875200
2019-01-03,33.447498,33.790001,31.922501,31.997499,31.747473,70555200
2019-01-04,32.735001,34.432499,32.424999,34.047501,33.781448,58562000
2019-01-07,34.625000,36.222500,34.107498,35.849998,35.569862,70916000
2019-01-08,36.672501,36.695000,34.224998,34.957500,34.684338,78601600
...,...,...,...,...,...,...
2023-12-22,491.950012,493.829987,484.670013,488.299988,488.277069,25213900
2023-12-26,489.679993,496.000000,489.600006,492.790009,492.766907,24420000
2023-12-27,495.109985,496.799988,490.850006,494.170013,494.146820,23364800
2023-12-28,496.429993,498.839996,494.119995,495.220001,495.196777,24658700


In [184]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2019-01-02 to 2023-12-29
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       1258 non-null   float64
 1   High       1258 non-null   float64
 2   Low        1258 non-null   float64
 3   Close      1258 non-null   float64
 4   Adj Close  1258 non-null   float64
 5   Volume     1258 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 68.8 KB


In [583]:
summary_stats=df.describe()

In [584]:
header_values = summary_stats.columns.insert(0, "Statistics")
cell_values = [summary_stats.index] + [summary_stats[col].round(2).tolist() for col in summary_stats.columns]

# Create the table
fig = go.Figure(data=[go.Table(
    header=dict(values=header_values, fill_color='paleturquoise', align='left'),
    cells=dict(values=cell_values, fill_color='lavender', align='left'))
])

fig.update_layout(title='Descriptive Summary of NVDA Stock')
fig.show()

In [187]:
df = stock.reset_index()

# Display the DataFrame
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-01-02,32.660000,34.619999,32.512501,34.055000,33.788895,50875200
1,2019-01-03,33.447498,33.790001,31.922501,31.997499,31.747473,70555200
2,2019-01-04,32.735001,34.432499,32.424999,34.047501,33.781448,58562000
3,2019-01-07,34.625000,36.222500,34.107498,35.849998,35.569862,70916000
4,2019-01-08,36.672501,36.695000,34.224998,34.957500,34.684338,78601600
...,...,...,...,...,...,...,...
1253,2023-12-22,491.950012,493.829987,484.670013,488.299988,488.277069,25213900
1254,2023-12-26,489.679993,496.000000,489.600006,492.790009,492.766907,24420000
1255,2023-12-27,495.109985,496.799988,490.850006,494.170013,494.146820,23364800
1256,2023-12-28,496.429993,498.839996,494.119995,495.220001,495.196777,24658700


## Exploratory Data Analysis

## NVDA STOCK GRAPH

In [188]:
fig = go.Figure(data=[go.Candlestick(x=df['Date'],
                open=df['Open'],
                high=df['High'],
                low=df['Low'],
                close=df['Close'])])

fig.update_xaxes(
    rangeslider_visible=True,
    rangeselector=dict(
        buttons=list([
            dict(count=1, label="1m", step="month", stepmode="backward"),
            dict(count=6, label="6m", step="month", stepmode="backward"),
            dict(count=1, label="YTD", step="year", stepmode="todate"),
            dict(count=1, label="1y", step="year", stepmode="backward"),
            dict(step="all")
        ])
    )
)

fig.update_layout(title='Candlestick Graph of NVIDIA',
                  xaxis_title='Date',
                  yaxis_title='Price($)')

fig.show()


## NVDA STOCK VALUE TRENDS

In [189]:
# Create subplots
fig = make_subplots(rows=3, cols=2, subplot_titles=('Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'))


# Define plot types for each subplot
fig.add_trace(go.Scatter(x=df['Date'], y=df['Open'], name="Open"), row=1, col=1)
fig.add_trace(go.Scatter(x=df['Date'], y=df['High'], name="High"), row=1, col=2)
fig.add_trace(go.Scatter(x=df['Date'], y=df['Low'], name="Low"), row=2, col=1)
fig.add_trace(go.Scatter(x=df['Date'], y=df['Close'], name="Close"), row=2, col=2)
fig.add_trace(go.Scatter(x=df['Date'], y=df['Adj Close'], name="Adj Close"), row=3, col=1)
fig.add_trace(go.Scatter(x=df['Date'], y=df['Volume'], name="Volume"), row=3, col=2)

# Update layout
fig.update_layout(height=1200, width=1000, title_text="NVIDIA Stock Value Trend", xaxis_title='Date',
                  yaxis_title='Price($)', showlegend=True)

# Show figure
fig.show()

In [190]:
df_vol = df.copy()

## TOTAL VOLUME OF STOCKS TRADED BY YEAR, MONTH AND WEEKDAY

In [191]:
# Group by Year and sum the Volume
yearly_volume = df_vol.groupby(df_vol['Date'].dt.year)['Volume'].sum().reset_index()

# Group by Month and sum the Volume
monthly_volume = df_vol.groupby(df_vol['Date'].dt.month)['Volume'].sum().reset_index()

# Group by Weekday and sum the Volume
weekday_volume = df_vol.groupby(df_vol['Date'].dt.weekday)['Volume'].sum().reset_index()

# Create subplots
fig = make_subplots(rows=3, cols=1, subplot_titles=('Yearly Volume', 'Monthly Volume', 'Weekday Volume'))

# Add traces for each subplot
fig.add_trace(go.Scatter(x=yearly_volume['Date'], y=yearly_volume['Volume'], name="Yearly"), row=1, col=1)
fig.add_trace(go.Scatter(x=monthly_volume['Date'], y=monthly_volume['Volume'],name="Monthly"), row=2, col=1)
fig.add_trace(go.Scatter(x=weekday_volume['Date'], y=weekday_volume['Volume'], name="Weekday"), row=3, col=1)

# Update layout
fig.update_layout(
    height=1200, 
    width=700, 
    showlegend=True,
    title_text="Total Volume of Stocks Traded by Year, Month, and Weekday"
)

# Customize x-axis and y-axis titles
fig.update_xaxes(title_text="Year", row=1, col=1)
fig.update_xaxes(title_text="Month", row=2, col=1)
fig.update_xaxes(title_text="Weekday", row=3, col=1)
fig.update_yaxes(title_text="Total Stock Volumes", row=1, col=1)
fig.update_yaxes(title_text="Total Stock Volumes", row=2, col=1)
fig.update_yaxes(title_text="Total Stock Volumes", row=3, col=1)

# Show figure
fig.show()

## RETURNS OF NVDA 

In [192]:
# Calculate daily returns as a percentage
df['Returns'] = df['Adj Close'].pct_change().fillna(0)

# Create a Plotly graph
fig = go.Figure()

# Add trace for the returns
fig.add_trace(go.Scatter(x=df['Date'], y=df['Returns'], mode='lines', name='Daily Returns'))

# Update layout for a better look
fig.update_layout(
    title='Percentage Change of Adj Close Price of NVDA',
    xaxis_title='Date',
    yaxis_title='Returns',
    showlegend=True,
    template='ggplot2'
)

# Display the figure
fig.show()

## KERNEL DENSITY ESTIMATE OF NVDA

In [193]:
# Create a KDE plot
fig = ff.create_distplot([df['Adj Close'].dropna()], ['NVDA Adj Closing Prices'], show_hist=False, show_rug=False)
fig.data[0].update(fill='tozeroy')
fig.update_layout(title='Kernel Density Estimate of NVIDIA',
                  xaxis_title='Price ($)',
                  yaxis_title='Density',
                  template='ggplot2') 

# Display the figure
fig.show()

## NVIDIA AND ITS COMPETITORS COMPARISION & ANALYSIS

## CLOSE PRICE COMPARISION

In [194]:
# Define the stocks and fetch data
stocks = ['NVDA', 'AMD', 'INTC', 'QCOM']
data = yf.download(stocks, start="2019-01-01", end="2023-12-31")['Adj Close']

# Create a Plotly graph object figure
fig = go.Figure()

# Add a scatter plot trace for each stock
for stock in stocks:
    fig.add_trace(
        go.Scatter(
            x=data.index,
            y=data[stock],  
            name=stock
        )
    )

# Update layout for a better look
fig.update_layout(
    title='Competitors and Nvidia Stocks Adj Close Price',
    xaxis_title='Date',
    yaxis_title='Price($)',
    template='ggplot2'
)

# Show figure
fig.show()


[*********************100%%**********************]  4 of 4 completed


## RETURNS COMPARISION

In [195]:
# Calculate the daily percentage change
percentage_changes = data.pct_change() 

# Create a Plotly graph object figure
fig = go.Figure()

# Add a scatter plot trace for the percentage change of each stock
for stock in stocks:
    fig.add_trace(
        go.Scatter(
            x=percentage_changes.index,
            y=percentage_changes[stock],
            name=stock
        )
    )
A
# Update layout for a better look
fig.update_layout(
    title='Daily Percentage Change Comparison',
    xaxis_title='Date',
    yaxis_title='Daily Returns',
    template='ggplot2'
)

# Show figure
fig.show()

## STOCKS VOLUME COMPARISION

In [196]:
# Define the stocks and fetch data
stocks = ['NVDA', 'AMD', 'INTC', 'QCOM']
data = yf.download(stocks, start="2019-01-01", end="2023-12-31")['Volume']

# Create a Plotly graph object figure
fig = go.Figure()

# Add a scatter plot trace for each stock
for stock in stocks:
    fig.add_trace(
        go.Scatter(
            x=data.index,
            y=data[stock],  
            name=stock
        )
    )

# Update layout for a better look
fig.update_layout(
    title='Competitors and Nvidia Stocks Volume',
    xaxis_title='Date',
    yaxis_title='Volume',
    template='ggplot2'
)

# Show figure
fig.show()

[*********************100%%**********************]  4 of 4 completed


## KERNEL DENSITY ESTIMATES OF NVDA AND ITS COMPETITORS

In [197]:
# Create a figure for KDE
fig = ff.create_distplot([data[stock].dropna() for stock in stocks], stocks, show_hist=False, show_rug=False, curve_type='kde')
# Update layout for a better look
fig.update_layout(
    title='Kernel Density Estimates of Adj Closing Prices for NVDA, AMD, INTC, QCOM',
    xaxis_title='Price ($)',
    yaxis_title='Density',
    template='ggplot2'
)

# Show the figure
fig.show()

## CORRELATION MATRIX OF NVDA AND ITS COMPETITORS

In [198]:
# Calculate the correlation matrix
correlation_matrix = data.corr()

# Create a heatmap
fig = go.Figure(data=go.Heatmap(
                    z=correlation_matrix,
                    x=correlation_matrix.columns,
                    y=correlation_matrix.index,
                    colorscale='Blues'))

# Update layout for a better look
fig.update_layout(
    title='Correlation Matrix of Stocks Closing Prices',
    xaxis_title='Stocks',
    yaxis_title='Stocks',
    xaxis=dict(tickmode='array', tickvals=[i for i in range(len(stocks))], ticktext=stocks),
    yaxis=dict(tickmode='array', tickvals=[i for i in range(len(stocks))], ticktext=stocks)
)

# Show the figure
fig.show()


## CONSTRUCTING FEATURE DATABASE

In [200]:
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Returns
0,2019-01-02,32.660000,34.619999,32.512501,34.055000,33.788895,50875200,0.000000
1,2019-01-03,33.447498,33.790001,31.922501,31.997499,31.747473,70555200,-0.060417
2,2019-01-04,32.735001,34.432499,32.424999,34.047501,33.781448,58562000,0.064067
3,2019-01-07,34.625000,36.222500,34.107498,35.849998,35.569862,70916000,0.052941
4,2019-01-08,36.672501,36.695000,34.224998,34.957500,34.684338,78601600,-0.024895
...,...,...,...,...,...,...,...,...
1253,2023-12-22,491.950012,493.829987,484.670013,488.299988,488.277069,25213900,-0.003266
1254,2023-12-26,489.679993,496.000000,489.600006,492.790009,492.766907,24420000,0.009195
1255,2023-12-27,495.109985,496.799988,490.850006,494.170013,494.146820,23364800,0.002800
1256,2023-12-28,496.429993,498.839996,494.119995,495.220001,495.196777,24658700,0.002125


## FAMA FRENCH 5 FACTORS

In [201]:
data_ff5 = pd.read_csv('F-F_Research_Data_5_Factors_daily.csv')
print(data_ff5)

           Date  Mkt-RF   SMB   HML   RMW   CMA     RF
0      19630701   -0.67  0.02 -0.35  0.03  0.13  0.012
1      19630702    0.79 -0.28  0.28 -0.08 -0.21  0.012
2      19630703    0.63 -0.18 -0.10  0.13 -0.25  0.012
3      19630705    0.40  0.09 -0.28  0.07 -0.30  0.012
4      19630708   -0.63  0.07 -0.20 -0.27  0.06  0.012
...         ...     ...   ...   ...   ...   ...    ...
15245  20240125    0.46  0.20  0.56 -0.19  0.55  0.022
15246  20240126   -0.02  0.35 -0.27  0.22 -0.02  0.022
15247  20240129    0.85  0.89 -0.59 -1.13 -0.31  0.022
15248  20240130   -0.13 -0.96  0.84  1.00  0.13  0.022
15249  20240131   -1.74 -0.89 -0.30 -0.21  0.37  0.022

[15250 rows x 7 columns]


In [202]:
data_ff5['Date'] = pd.to_datetime(data_ff5['Date'], format='%Y%m%d')
df_ffs = data_ff5.set_index('Date')
print(data_ff5)

            Date  Mkt-RF   SMB   HML   RMW   CMA     RF
0     1963-07-01   -0.67  0.02 -0.35  0.03  0.13  0.012
1     1963-07-02    0.79 -0.28  0.28 -0.08 -0.21  0.012
2     1963-07-03    0.63 -0.18 -0.10  0.13 -0.25  0.012
3     1963-07-05    0.40  0.09 -0.28  0.07 -0.30  0.012
4     1963-07-08   -0.63  0.07 -0.20 -0.27  0.06  0.012
...          ...     ...   ...   ...   ...   ...    ...
15245 2024-01-25    0.46  0.20  0.56 -0.19  0.55  0.022
15246 2024-01-26   -0.02  0.35 -0.27  0.22 -0.02  0.022
15247 2024-01-29    0.85  0.89 -0.59 -1.13 -0.31  0.022
15248 2024-01-30   -0.13 -0.96  0.84  1.00  0.13  0.022
15249 2024-01-31   -1.74 -0.89 -0.30 -0.21  0.37  0.022

[15250 rows x 7 columns]


## ADS INDEX

In [203]:
# Read the Excel file
data_ads = pd.read_excel(r'ADS_Index_Most_Current_Vintage.xlsx')

data_ads.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)

data_ads['Date'] = pd.to_datetime(data_ads['Date'], format='%Y:%m:%d')

print(data_ads)

            Date  ADS_Index
0     1960-03-01  -0.569062
1     1960-03-02  -0.617724
2     1960-03-03  -0.663306
3     1960-03-04  -0.705824
4     1960-03-05  -0.745294
...          ...        ...
23387 2024-03-12  -0.046230
23388 2024-03-13  -0.044420
23389 2024-03-14  -0.042867
23390 2024-03-15  -0.041571
23391 2024-03-16  -0.040532

[23392 rows x 2 columns]


## FRED 

In [205]:
from fredapi import Fred
# Initialize the FRED API object with your API key
fred = Fred(api_key='1f35c35c33d017082e6e4af237880935')

# List of variables
varList = ['T10Y3M', 'OBMMIJUMBO30YF', 'DEXUSEU', 'DEXJPUS', 'DEXUSUK',
           'CBBTCUSD', 'CBETHUSD', 'T10YIE', 'DCOILBRENTEU', 'VIXCLS',
           'DAAA', 'DBAA', 'NIKKEI225', 'AMERIBOR', 'T5YIE',
           'BAMLH0A0HYM2','BAMLH0A0HYM2EY', 'DGS10', 'DGS1',
           'RIFSPPFAAD90NB', 'DCPN3M', 'DCPF1M', 'DCOILWTICO',
           'DHHNGSP', 'USRECD', 'USRECDM', 'USRECDP', 'SP500']

# Fetch data for each variable and store it in a DataFrame
df_fred = pd.DataFrame()
for var in varList:
    series = fred.get_series(var)
    series.name = var
    df_fred[var] = series

# Assuming df_fred is the DataFrame you provided
df_fred.index = pd.to_datetime(df_fred.index)
df_fred.reset_index(inplace=True)
df_fred.rename(columns={'index': 'Date'}, inplace=True)

# Display the updated DataFrame
print(df_fred)

            Date  T10Y3M  OBMMIJUMBO30YF  DEXUSEU  DEXJPUS  DEXUSUK  CBBTCUSD  \
0     1982-01-04    2.32             NaN      NaN   218.75   1.9260       NaN   
1     1982-01-05    2.24             NaN      NaN   219.70   1.9120       NaN   
2     1982-01-06    2.43             NaN      NaN   219.60   1.9253       NaN   
3     1982-01-07    2.46             NaN      NaN   222.00   1.9200       NaN   
4     1982-01-08    2.50             NaN      NaN   221.40   1.9200       NaN   
...          ...     ...             ...      ...      ...      ...       ...   
11027 2024-04-10   -0.90           7.240   1.0737   152.90   1.2544  70554.77   
11028 2024-04-11   -0.89           7.404   1.0722   153.19   1.2535  70001.36   
11029 2024-04-12   -0.95           7.191   1.0647   153.12   1.2457  67107.27   
11030 2024-04-15   -0.82           7.564      NaN      NaN      NaN  63359.28   
11031 2024-04-16   -0.78             NaN      NaN      NaN      NaN       NaN   

       CBETHUSD  T10YIE  DC

## DATA PRE-PROCESSING

In [206]:
data_part1 = pd.merge(data_ff5, data_ads, left_index=True, right_index=True, how='outer')
data_part2 = pd.merge(data_part1, df_fred, left_index=True, right_index=True, how='outer')

data_part2.fillna(method='ffill', inplace=True)

data_part2.reset_index(inplace=True)
data_part2.rename(columns={'index': 'Date'}, inplace=True)

data_part2_f = data_part2[(data_part2['Date_x'] >= '2019-01-01') & (data_part2['Date_x'] <= '2023-12-31')]
print(data_part2_f)

        Date     Date_x  Mkt-RF   SMB   HML   RMW   CMA     RF     Date_y  \
13971  13971 2019-01-02    0.23  0.74  1.11 -0.12  0.27  0.010 1998-06-01   
13972  13972 2019-01-03   -2.45  0.48  1.20 -0.22  0.89  0.010 1998-06-02   
13973  13973 2019-01-04    3.55  0.36 -0.70 -0.11 -0.59  0.010 1998-06-03   
13974  13974 2019-01-07    0.94  0.87 -0.75 -0.72 -0.45  0.010 1998-06-04   
13975  13975 2019-01-08    1.01  0.44 -0.63  0.31 -0.09  0.010 1998-06-05   
...      ...        ...     ...   ...   ...   ...   ...    ...        ...   
15224  15224 2023-12-22    0.21  0.61  0.09 -0.64  0.19  0.021 2001-11-05   
15225  15225 2023-12-26    0.48  0.81  0.46 -0.34 -0.15  0.021 2001-11-06   
15226  15226 2023-12-27    0.16  0.16  0.12 -0.31 -0.14  0.021 2001-11-07   
15227  15227 2023-12-28   -0.01 -0.38  0.03 -0.32  0.15  0.021 2001-11-08   
15228  15228 2023-12-29   -0.43 -1.13 -0.37  0.68 -0.07  0.021 2001-11-09   

       ADS_Index  ...  DGS1  RIFSPPFAAD90NB  DCPN3M  DCPF1M  DCOILWTICO  \


In [207]:
data_part2_f.to_csv('INFO7374_FeatureMart_with_NVDA.csv', index=False)

In [323]:
df_featuremart = pd.read_csv('INFO7374_FeatureMart_with_NVDA.csv')

In [324]:
df_featuremart

Unnamed: 0,Date,Date_x,Mkt-RF,SMB,HML,RMW,CMA,RF,Date_y,ADS_Index,...,DGS1,RIFSPPFAAD90NB,DCPN3M,DCPF1M,DCOILWTICO,DHHNGSP,USRECD,USRECDM,USRECDP,SP500
0,13971,2019-01-02,0.23,0.74,1.11,-0.12,0.27,0.010,1998-06-01,0.004238,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
1,13972,2019-01-03,-2.45,0.48,1.20,-0.22,0.89,0.010,1998-06-02,-0.032800,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
2,13973,2019-01-04,3.55,0.36,-0.70,-0.11,-0.59,0.010,1998-06-03,-0.069725,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
3,13974,2019-01-07,0.94,0.87,-0.75,-0.72,-0.45,0.010,1998-06-04,-0.106551,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
4,13975,2019-01-08,1.01,0.44,-0.63,0.31,-0.09,0.010,1998-06-05,-0.143292,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1253,15224,2023-12-22,0.21,0.61,0.09,-0.64,0.19,0.021,2001-11-05,-0.704450,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
1254,15225,2023-12-26,0.48,0.81,0.46,-0.34,-0.15,0.021,2001-11-06,-0.722956,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
1255,15226,2023-12-27,0.16,0.16,0.12,-0.31,-0.14,0.021,2001-11-07,-0.740778,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82
1256,15227,2023-12-28,-0.01,-0.38,0.03,-0.32,0.15,0.021,2001-11-08,-0.757922,...,5.16,5.34,5.34,5.3,87.24,1.83,0.0,0.0,0.0,5061.82


In [325]:
df_featuremart['Date_x'] = pd.to_datetime(df_featuremart['Date_x'])

columns_to_drop = ['Date', 'Date_y']
df_cleaned = df_featuremart.drop(columns=['Date','Date_y'])

df_cleaned = df_cleaned.loc[:, df_cleaned.nunique() != 1]

df_cleaned = df_cleaned.drop_duplicates()

df_cleaned

Unnamed: 0,Date_x,Mkt-RF,SMB,HML,RMW,CMA,RF,ADS_Index
0,2019-01-02,0.23,0.74,1.11,-0.12,0.27,0.010,0.004238
1,2019-01-03,-2.45,0.48,1.20,-0.22,0.89,0.010,-0.032800
2,2019-01-04,3.55,0.36,-0.70,-0.11,-0.59,0.010,-0.069725
3,2019-01-07,0.94,0.87,-0.75,-0.72,-0.45,0.010,-0.106551
4,2019-01-08,1.01,0.44,-0.63,0.31,-0.09,0.010,-0.143292
...,...,...,...,...,...,...,...,...
1253,2023-12-22,0.21,0.61,0.09,-0.64,0.19,0.021,-0.704450
1254,2023-12-26,0.48,0.81,0.46,-0.34,-0.15,0.021,-0.722956
1255,2023-12-27,0.16,0.16,0.12,-0.31,-0.14,0.021,-0.740778
1256,2023-12-28,-0.01,-0.38,0.03,-0.32,0.15,0.021,-0.757922


In [326]:
df_cleaned.index = pd.to_datetime(df_cleaned.index)
df_cleaned.set_index('Date_x', inplace=True)
df_cleaned

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA,RF,ADS_Index
Date_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-02,0.23,0.74,1.11,-0.12,0.27,0.010,0.004238
2019-01-03,-2.45,0.48,1.20,-0.22,0.89,0.010,-0.032800
2019-01-04,3.55,0.36,-0.70,-0.11,-0.59,0.010,-0.069725
2019-01-07,0.94,0.87,-0.75,-0.72,-0.45,0.010,-0.106551
2019-01-08,1.01,0.44,-0.63,0.31,-0.09,0.010,-0.143292
...,...,...,...,...,...,...,...
2023-12-22,0.21,0.61,0.09,-0.64,0.19,0.021,-0.704450
2023-12-26,0.48,0.81,0.46,-0.34,-0.15,0.021,-0.722956
2023-12-27,0.16,0.16,0.12,-0.31,-0.14,0.021,-0.740778
2023-12-28,-0.01,-0.38,0.03,-0.32,0.15,0.021,-0.757922


In [327]:
df_cleaned.to_csv('INFO7374_FeatureMart_Cleaned.csv')

In [328]:
# Fetch NVIDIA stock data
nvda_data = yf.download('NVDA', start='2019-01-01', end='2023-12-31')
nvda_data = nvda_data[['Adj Close']].pct_change().fillna(0)
nvda_data.rename(columns={'Adj Close': 'NVDA_Returns'}, inplace=True)

nvda_data.index = pd.to_datetime(nvda_data.index)

[*********************100%%**********************]  1 of 1 completed


In [329]:
nvda_data

Unnamed: 0_level_0,NVDA_Returns
Date,Unnamed: 1_level_1
2019-01-02,0.000000
2019-01-03,-0.060417
2019-01-04,0.064067
2019-01-07,0.052941
2019-01-08,-0.024895
...,...
2023-12-22,-0.003266
2023-12-26,0.009195
2023-12-27,0.002800
2023-12-28,0.002125


## LAG 

In [330]:
nvda_data['Lag_1'] = nvda_data['NVDA_Returns'].shift(1)
nvda_data['Lag_2'] = nvda_data['NVDA_Returns'].shift(2)
nvda_data['Lag_3'] = nvda_data['NVDA_Returns'].shift(3)

In [331]:
nvda_data

Unnamed: 0_level_0,NVDA_Returns,Lag_1,Lag_2,Lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-02,0.000000,,,
2019-01-03,-0.060417,0.000000,,
2019-01-04,0.064067,-0.060417,0.000000,
2019-01-07,0.052941,0.064067,-0.060417,0.000000
2019-01-08,-0.024895,0.052941,0.064067,-0.060417
...,...,...,...,...
2023-12-22,-0.003266,0.018270,-0.030098,-0.009445
2023-12-26,0.009195,-0.003266,0.018270,-0.030098
2023-12-27,0.002800,0.009195,-0.003266,0.018270
2023-12-28,0.002125,0.002800,0.009195,-0.003266


## MOMENTUM FACTOR

In [332]:
lookback_period = 3

# Calculate the momentum factor as the sum of returns over the lookback period
nvda_data['Momentum_factor'] = nvda_data['NVDA_Returns'].rolling(window=lookback_period).sum()

In [333]:
nvda_data

Unnamed: 0_level_0,NVDA_Returns,Lag_1,Lag_2,Lag_3,Momentum_factor
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-02,0.000000,,,,
2019-01-03,-0.060417,0.000000,,,
2019-01-04,0.064067,-0.060417,0.000000,,0.003650
2019-01-07,0.052941,0.064067,-0.060417,0.000000,0.056591
2019-01-08,-0.024895,0.052941,0.064067,-0.060417,0.092113
...,...,...,...,...,...
2023-12-22,-0.003266,0.018270,-0.030098,-0.009445,-0.015094
2023-12-26,0.009195,-0.003266,0.018270,-0.030098,0.024200
2023-12-27,0.002800,0.009195,-0.003266,0.018270,0.008730
2023-12-28,0.002125,0.002800,0.009195,-0.003266,0.014120


In [334]:
nvda_data = nvda_data.dropna()
nvda_data

Unnamed: 0_level_0,NVDA_Returns,Lag_1,Lag_2,Lag_3,Momentum_factor
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2019-01-07,0.052941,0.064067,-0.060417,0.000000,0.056591
2019-01-08,-0.024895,0.052941,0.064067,-0.060417,0.092113
2019-01-09,0.019667,-0.024895,0.052941,0.064067,0.047712
2019-01-10,0.018586,0.019667,-0.024895,0.052941,0.013358
2019-01-11,0.024788,0.018586,0.019667,-0.024895,0.063041
...,...,...,...,...,...
2023-12-22,-0.003266,0.018270,-0.030098,-0.009445,-0.015094
2023-12-26,0.009195,-0.003266,0.018270,-0.030098,0.024200
2023-12-27,0.002800,0.009195,-0.003266,0.018270,0.008730
2023-12-28,0.002125,0.002800,0.009195,-0.003266,0.014120


In [335]:
nvda_vol = yf.download('NVDA', start='2019-01-07', end='2023-12-31')

# Select only the Volume column
nvda_vol = nvda_vol[['Volume']]

# Ensure the index is a datetime index (this should already be the case with yfinance)
nvda_vol.index = pd.to_datetime(nvda_vol.index)
nvda_vol

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Volume
Date,Unnamed: 1_level_1
2019-01-07,70916000
2019-01-08,78601600
2019-01-09,61726000
2019-01-10,52315600
2019-01-11,87476400
...,...
2023-12-22,25213900
2023-12-26,24420000
2023-12-27,23364800
2023-12-28,24658700


In [336]:
merged_df = pd.merge(nvda_data, nvda_vol, left_index=True, right_index=True, how='inner')
merged_df

Unnamed: 0_level_0,NVDA_Returns,Lag_1,Lag_2,Lag_3,Momentum_factor,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2019-01-07,0.052941,0.064067,-0.060417,0.000000,0.056591,70916000
2019-01-08,-0.024895,0.052941,0.064067,-0.060417,0.092113,78601600
2019-01-09,0.019667,-0.024895,0.052941,0.064067,0.047712,61726000
2019-01-10,0.018586,0.019667,-0.024895,0.052941,0.013358,52315600
2019-01-11,0.024788,0.018586,0.019667,-0.024895,0.063041,87476400
...,...,...,...,...,...,...
2023-12-22,-0.003266,0.018270,-0.030098,-0.009445,-0.015094,25213900
2023-12-26,0.009195,-0.003266,0.018270,-0.030098,0.024200,24420000
2023-12-27,0.002800,0.009195,-0.003266,0.018270,0.008730,23364800
2023-12-28,0.002125,0.002800,0.009195,-0.003266,0.014120,24658700


## RETURNS, LAGS, MOMENTUM FACTOR, VOLUME ANALYSIS 

In [350]:
# Create a subplot with 2 rows and 3 columns
fig = make_subplots(rows=3, cols=2, subplot_titles=('NVDA_Returns', 'Lag_1', 'Lag_2', 'Lag_3', 'Momentum_factor', 'Volume'))

# Add scatter plots for each column
fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['NVDA_Returns'], mode='lines', name='Returns'), row=1, col=1)
fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['Lag_1'], mode='lines', name='Lag 1'), row=1, col=2)
fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['Lag_2'], mode='lines', name='Lag 2'), row=2, col=1)
fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['Lag_3'], mode='lines', name='Lag 3'), row=2, col=2)
fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['Momentum_factor'], mode='lines', name='Momentum_factor'), row=3, col=1)
fig.add_trace(go.Scatter(x=merged_df.index, y=merged_df['Volume'], mode='lines', name='Volume'), row=3, col=2)

y_labels = ['Returns', 'First Lag', 'Second Lag', 'Third Lag', 'Momentum', 'Trade Volume']

for i in range(6):
    fig.update_xaxes(title_text="Date", row=(i // 2) + 1, col=(i % 2) + 1)
    fig.update_yaxes(title_text=y_labels[i], row=(i // 2) + 1, col=(i % 2) + 1)

# Update layout for better visualization
fig.update_layout(height=800, width=1000, showlegend=True)

# Show the figure
fig.show()

In [251]:
merged_df.to_csv("nvda_features.csv")

In [253]:
start_date = "2019-01-07"

# Filter the DataFrame to start from the specified date
df_cleaned = df_cleaned[df_cleaned.index >= pd.to_datetime(start_date)]
df_cleaned

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RMW,CMA,RF,ADS_Index
Date_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-07,0.94,0.87,-0.75,-0.72,-0.45,0.010,-0.106551
2019-01-08,1.01,0.44,-0.63,0.31,-0.09,0.010,-0.143292
2019-01-09,0.56,0.50,0.10,0.08,-0.20,0.010,-0.179962
2019-01-10,0.42,0.00,-0.46,-0.06,-0.04,0.010,-0.216575
2019-01-11,-0.01,0.20,0.22,0.23,0.25,0.010,-0.251807
...,...,...,...,...,...,...,...
2023-12-22,0.21,0.61,0.09,-0.64,0.19,0.021,-0.704450
2023-12-26,0.48,0.81,0.46,-0.34,-0.15,0.021,-0.722956
2023-12-27,0.16,0.16,0.12,-0.31,-0.14,0.021,-0.740778
2023-12-28,-0.01,-0.38,0.03,-0.32,0.15,0.021,-0.757922


In [254]:
nvda_final_data = pd.merge(merged_df, df_cleaned, left_index=True, right_index=True, how='inner')
nvda_final_data

Unnamed: 0_level_0,NVDA_Returns,Lag_1,Lag_2,Lag_3,Momentum_factor,Volume,Mkt-RF,SMB,HML,RMW,CMA,RF,ADS_Index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-01-07,0.052941,0.064067,-0.060417,0.000000,0.056591,70916000,0.94,0.87,-0.75,-0.72,-0.45,0.010,-0.106551
2019-01-08,-0.024895,0.052941,0.064067,-0.060417,0.092113,78601600,1.01,0.44,-0.63,0.31,-0.09,0.010,-0.143292
2019-01-09,0.019667,-0.024895,0.052941,0.064067,0.047712,61726000,0.56,0.50,0.10,0.08,-0.20,0.010,-0.179962
2019-01-10,0.018586,0.019667,-0.024895,0.052941,0.013358,52315600,0.42,0.00,-0.46,-0.06,-0.04,0.010,-0.216575
2019-01-11,0.024788,0.018586,0.019667,-0.024895,0.063041,87476400,-0.01,0.20,0.22,0.23,0.25,0.010,-0.251807
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,-0.003266,0.018270,-0.030098,-0.009445,-0.015094,25213900,0.21,0.61,0.09,-0.64,0.19,0.021,-0.704450
2023-12-26,0.009195,-0.003266,0.018270,-0.030098,0.024200,24420000,0.48,0.81,0.46,-0.34,-0.15,0.021,-0.722956
2023-12-27,0.002800,0.009195,-0.003266,0.018270,0.008730,23364800,0.16,0.16,0.12,-0.31,-0.14,0.021,-0.740778
2023-12-28,0.002125,0.002800,0.009195,-0.003266,0.014120,24658700,-0.01,-0.38,0.03,-0.32,0.15,0.021,-0.757922


In [255]:
nvda_final_data.to_csv("nvda_final_dataset.csv")

In [351]:
nvda_final = pd.read_csv("nvda_final_dataset.csv")
nvda_final.set_index('Date', inplace=True)
nvda_final

Unnamed: 0_level_0,NVDA_Returns,Lag_1,Lag_2,Lag_3,Momentum_factor,Volume,Mkt-RF,SMB,HML,RMW,CMA,RF,ADS_Index
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2019-01-07,0.052941,0.064067,-0.060417,0.000000,0.056591,70916000,0.94,0.87,-0.75,-0.72,-0.45,0.010,-0.106551
2019-01-08,-0.024895,0.052941,0.064067,-0.060417,0.092113,78601600,1.01,0.44,-0.63,0.31,-0.09,0.010,-0.143292
2019-01-09,0.019667,-0.024895,0.052941,0.064067,0.047712,61726000,0.56,0.50,0.10,0.08,-0.20,0.010,-0.179962
2019-01-10,0.018586,0.019667,-0.024895,0.052941,0.013358,52315600,0.42,0.00,-0.46,-0.06,-0.04,0.010,-0.216575
2019-01-11,0.024788,0.018586,0.019667,-0.024895,0.063041,87476400,-0.01,0.20,0.22,0.23,0.25,0.010,-0.251807
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22,-0.003266,0.018270,-0.030098,-0.009445,-0.015094,25213900,0.21,0.61,0.09,-0.64,0.19,0.021,-0.704450
2023-12-26,0.009195,-0.003266,0.018270,-0.030098,0.024200,24420000,0.48,0.81,0.46,-0.34,-0.15,0.021,-0.722956
2023-12-27,0.002800,0.009195,-0.003266,0.018270,0.008730,23364800,0.16,0.16,0.12,-0.31,-0.14,0.021,-0.740778
2023-12-28,0.002125,0.002800,0.009195,-0.003266,0.014120,24658700,-0.01,-0.38,0.03,-0.32,0.15,0.021,-0.757922


## CAPM

In [359]:
import statsmodels.api as sm
import numpy as np
from sklearn.metrics import mean_squared_error


X = nvda_final[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'ADS_Index']]
y = nvda_final['NVDA_Returns']
X = sm.add_constant(X)
# Fit the model
model = sm.OLS(y, X).fit()

# Predict the NVDA_Adj_Close values
predictions = model.predict(X)

# Calculate RMSE
rmse_capm = np.sqrt(mean_squared_error(y, predictions))
print(f'RMSE: {rmse_capm}')
model = sm.OLS(y, X).fit()
print(model.summary())
r2_capm = model.rsquared

fig = go.Figure()

# Actual values
fig.add_trace(go.Scatter(x=y.index, y=y, mode='lines', name='Actual NVDA Returns'))

# Predicted values
fig.add_trace(go.Scatter(x=predictions.index, y=predictions, mode='lines', name='Predicted NVDA Returns', line=dict(dash='dash', color='red')))

# Update layout for better visualization
fig.update_layout(title='Actual vs. Predicted NVDA Returns',
                  xaxis_title='Date',
                  yaxis_title='NVDA Returns')

# Show the figure
fig.show()


RMSE: 0.019823989634438554
                            OLS Regression Results                            
Dep. Variable:           NVDA_Returns   R-squared:                       0.629
Model:                            OLS   Adj. R-squared:                  0.627
Method:                 Least Squares   F-statistic:                     352.1
Date:                Wed, 17 Apr 2024   Prob (F-statistic):          2.81e-264
Time:                        12:01:59   Log-Likelihood:                 3139.9
No. Observations:                1255   AIC:                            -6266.
Df Residuals:                    1248   BIC:                            -6230.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0016    

## ARIMA

In [360]:
# Split the data into training and testing sets
X = nvda_final[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'ADS_Index']]
y = nvda_final['NVDA_Returns']

In [361]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Define the model
model = SARIMAX(y, exog=X, order=(1,0,1))

# Fit the model
model_fit = model.fit(disp=False)

# Print the summary of the model
print(model_fit.summary())

# Predict the returns using the fitted model
predicted_returns = model_fit.predict(exog=X)

# Calculate the RMSE
rmse_arima = np.sqrt(mean_squared_error(y, predicted_returns))
print(f'RMSE: {rmse_arima}')

# Calculate R-squared
ss_res = np.sum((y - predicted_returns) ** 2)
ss_tot = np.sum((y - np.mean(y)) ** 2)
r2_arima = 1 - (ss_res / ss_tot)
print(f'R-squared: {r2_arima}')

# Create a Plotly figure
fig = go.Figure()

# Add actual returns trace
fig.add_trace(go.Scatter(x=y.index, y=y, mode='lines', name='Actual Returns'))

# Add predicted/fitted returns trace
fig.add_trace(go.Scatter(x=predicted_returns.index, y=predicted_returns, mode='lines', name='Fitted Returns', line=dict(color='blue', dash='dash')))

# Update layout for a better look
fig.update_layout(
    title='Augmented Autoregression Model Fitted vs Actual Returns',
    xaxis_title='Date',
    yaxis_title='Returns',
    template='ggplot2',
    legend_title_text='Legend'
)

# Show the interactive figure
fig.show()






                               SARIMAX Results                                
Dep. Variable:           NVDA_Returns   No. Observations:                 1255
Model:               SARIMAX(1, 0, 1)   Log Likelihood                3136.739
Date:                Wed, 17 Apr 2024   AIC                          -6255.478
Time:                        12:02:09   BIC                          -6209.263
Sample:                             0   HQIC                         -6238.108
                               - 1255                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Mkt-RF         0.0161      0.000     35.668      0.000       0.015       0.017
SMB           -0.0008      0.001     -0.928      0.354      -0.002       0.001
HML           -0.0063      0.001     -8.440      0.0

## FAMA-FRNCH THREE FACTOR MODEL

In [366]:
# Define the independent variables (factors) and add a constant term
X = nvda_final[['Mkt-RF', 'SMB', 'HML']]
X = sm.add_constant(X)

# Define the dependent variable (NVDA returns)
y = nvda_final['NVDA_Returns']

# Fit the model
model = sm.OLS(y, X).fit()

# Print the summary of the model
print(model.summary())

# Predict the returns using the model
predicted_returns = model.predict(X)

# Calculate the RMSE
r2_ff = model.rsquared
rmse_ff = np.sqrt(mean_squared_error(y, predicted_returns))
print(f'RMSE: {rmse_ff}')

# Create a Plotly figure
fig = go.Figure()

# Add actual returns trace
fig.add_trace(go.Scatter(
    x=nvda_final.index, 
    y=y, 
    mode='lines', 
    name='Actual Returns'
))

# Add fitted returns trace
fig.add_trace(go.Scatter(
    x=nvda_final.index, 
    y=model.fittedvalues, 
    mode='lines', 
    name='Fitted Returns',
    line=dict(color='blue')
))

# Update the layout for better visualization
fig.update_layout(
    title='Fama-French Three-Factor Model Fitted vs Actual Returns',
    xaxis_title='Date',
    yaxis_title='Returns',
    template='ggplot2',
    legend_title='Legend'
)

# Show the figure
fig.show()

                            OLS Regression Results                            
Dep. Variable:           NVDA_Returns   R-squared:                       0.618
Model:                            OLS   Adj. R-squared:                  0.617
Method:                 Least Squares   F-statistic:                     675.3
Date:                Wed, 17 Apr 2024   Prob (F-statistic):          5.64e-261
Time:                        12:08:38   Log-Likelihood:                 3122.6
No. Observations:                1255   AIC:                            -6237.
Df Residuals:                    1251   BIC:                            -6217.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0017      0.001      2.925      0.0

## COMPARISION OF RMSE AND R-SQUARED ACROSS MODELS

In [367]:
# RMSE values and model names
rmse_v = [rmse_capm, rmse_arima, rmse_ff]
r2_v = [r2_capm, r2_arima, r2_ff]
models = ['CAPM', 'ARIMA', 'FamaFrench']


# Create the DataFrame for plotting
data = pd.DataFrame({
    'Model': models * 2,
    'Metric Value': rmse_v + r2_v,
    'Metric': ['RMSE'] * len(models) + ['R-squared'] * len(models)
})

# Create a Plotly figure
fig = go.Figure()

# Add RMSE bars
fig.add_trace(go.Bar(
    x=data['Model'][:len(models)],
    y=data['Metric Value'][:len(models)],
    name='RMSE',
    marker_color='red'
))

# Add R-squared bars
fig.add_trace(go.Bar(
    x=data['Model'][len(models):],
    y=data['Metric Value'][len(models):],
    name='R-squared',
    marker_color='blue'
))

# Update the layout
fig.update_layout(
    title='Comparison of RMSE and R-squared Values Across Models',
    xaxis=dict(title='Model', tickangle=-45),
    yaxis=dict(title='Value'),
    barmode='group',  
    legend_title_text='Metric')

# Show the figure
fig.show()


## GARCH

In [578]:
Y = np.diff(np.log(df['Adj Close'].values))
print(Y)
T = Y.shape[0]
print(T)

[-0.06231906  0.06209866  0.05158693 ...  0.00279642  0.00212253
  0.        ]
1257


In [579]:
def garch(param, *args):
    # Initialize Params
    mu = param[0]
    omega = param[1]
    alpha = param[2]
    beta = param[3]
    GARCH_Dens, sigma2, F, v = {}, {}, {}, {}
    # intialize values
    sigma2[0] = np.var(Y)
    Likelihood = 0
    for t in range(1,T):
        sigma2[t] = omega + alpha*((Y[t-1]-mu)**2)+beta*(sigma2[t-1])
        F[t] = Y[t] - mu-np.sqrt(sigma2[t])*np.random.normal(0,1,1)
        v[t] = sigma2[t]
        GARCH_Dens[t] = (1/2)*np.log(2*np.pi)+(1/2)*np.log(v[t])+\
                    (1/2)*(F[t]/v[t])
        Likelihood += GARCH_Dens[t]

    return Likelihood

In [580]:
def garch_path(params, *args):
    mu = params[0]
    omega = params[1]
    alpha = params[2]
    beta = params[3]
    sigma2, path = {}, {}
    # initialzie volatility and path
    sigma2[0] = np.var(Y)
    path[0] = np.array([Y[0]])
    for t in range(1,T):
        sigma2[t] = omega + alpha*((Y[t-1]-mu)**2)+beta*(sigma2[t-1]);
        path[t] = mu+np.sqrt(sigma2[t])*np.random.normal(0,1,1)

    return path, sigma2

In [581]:
param0 = np.array([np.mean(Y), 5.746493354803076e-05, 0.01, 0.1])
results = minimize(garch, param0, method='BFGS', options={'xtol': 1e-8, 'disp': True})
param_star = results.x

path, vol = garch_path(param_star, Y, T)
#path_values_as_arrays = [np.array(value) for value in path.values()]
Y_GARCH = np.hstack(list(path.values()))


# Create a Plotly figure
fig = go.Figure()

original_data_trace = go.Scatter(
    x=timevec,
    y=Y,
    mode='lines',
    name='Original Data',
    line=dict(color='blue')
)
garch_model_trace = go.Scatter(
    x=timevec,
    y=Y_GARCH,
    mode='lines',
    name='GARCH Model',
    line=dict(color='red')
)

# Layout configuration
layout = go.Layout(
    title=f'Volatility Prediction for {stock_symbol}',
    xaxis=dict(title='Time'),
    yaxis=dict(title='Log Returns'),
    legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
)

# Define the figure
fig = go.Figure(data=[original_data_trace, garch_model_trace], layout=layout)

# Show figure
fig.show()

# Calculate RMSE
RMSE = np.sqrt(np.mean((Y_GARCH - Y)**2))
print('RMSE value for daily basis is: ', RMSE)

         Current function value: -7487.806200
         Iterations: 3
         Function evaluations: 145
         Gradient evaluations: 26


RMSE value for daily basis is:  0.03334488969189804
