# Data Preparation

In [26]:
import pandas as pd
import yfinance as yf
import datetime
from dateutil.relativedelta import relativedelta

import ta
from statistics import mean
from sklearn.preprocessing import MinMaxScaler

In [27]:
def SMA(df, n, close):
    return pd.Series(df[close]).rolling(n).mean()

In [28]:
def getdata(sym, sma_value, close):
    today = datetime.datetime.now()
    df=yf.download(sym, start=datetime.datetime(today.year-2, 1, 1),
                   end=datetime.datetime(today.year, today.month, today.day))
    pd.set_option('display.max_columns', None)

    df = df[['Open','High','Low',close,'Volume']]
    df = df.reset_index().rename(columns={'index':'Date'})
    df['ticker'] = sym

    for i in sma_value:
        variable_name = 'sma_' + str(i)
        df[variable_name] = SMA(df, i, close)

    return df

In [29]:
df = getdata('NVDA',[20, 50],'Close')

[*********************100%***********************]  1 of 1 completed


In [30]:
df['rsi'] = ta.momentum.rsi(df['Close'])

In [31]:
df['macd'] = ta.trend.macd(df['Close'])

In [32]:
# Calculate the Bollinger Bands
bb = ta.volatility.BollingerBands(close=df['Close'], window=20, window_dev=2)

# Retrieve the upper, middle, and lower bands
df['upper_band'] = bb.bollinger_hband()
df['middle_band'] = bb.bollinger_mavg()
df['lower_band'] = bb.bollinger_lband()

In [33]:
df['stoch_oscillator'] = ta.momentum.stoch(high=df['High'], low=df['Low'], close=df['Close'])

# Remove NA

In [35]:
df = df.dropna(subset=['sma_20','sma_50','stoch_oscillator'])

In [36]:
df['next_day_close'] = df['Close'].shift(-1)

In [37]:
df = df.dropna(subset=['next_day_close'])

In [38]:
df = df.drop(columns='ticker')

# Scale df

In [39]:
scaler = MinMaxScaler()
scaler.fit(df.drop(columns=['Date']))
scaled_data = scaler.transform(df.drop(columns=['Date']))
df_scaled = pd.DataFrame(scaled_data, columns=df.drop(columns=['Date']).columns)

In [40]:
df_reset_index = df.reset_index(drop=True).reset_index()

In [41]:
df_scaled = df_scaled.reset_index()

In [42]:
df_scaled.index

RangeIndex(start=0, stop=502, step=1)

In [43]:
df_scaled_full = pd.merge(left=df_reset_index[['index','Date']],right=df_scaled, on='index')

In [44]:
df_scaled_full = df_scaled_full.drop(columns='index')

In [46]:
df_scaled_full

Unnamed: 0,Date,Open,High,Low,Close,Volume,sma_20,sma_50,rsi,macd,upper_band,middle_band,lower_band,stoch_oscillator,next_day_close
0,2022-03-15 00:00:00-04:00,0.125310,0.131944,0.133382,0.144225,0.235514,0.163077,0.209900,0.336474,0.110745,0.157218,0.163077,0.175405,0.578067,0.162926
1,2022-03-16 00:00:00-04:00,0.148859,0.150143,0.156862,0.162926,0.365913,0.161623,0.207871,0.448473,0.134797,0.152077,0.161623,0.179425,0.958282,0.166241
2,2022-03-17 00:00:00-04:00,0.156225,0.153003,0.166178,0.166241,0.220939,0.160354,0.206239,0.466829,0.157060,0.146637,0.160354,0.184347,0.982300,0.186955
3,2022-03-18 00:00:00-04:00,0.164292,0.173163,0.175291,0.186955,0.409008,0.161769,0.205824,0.571558,0.191393,0.152352,0.161769,0.179379,0.980829,0.190405
4,2022-03-21 00:00:00-04:00,0.184585,0.179968,0.192337,0.190405,0.307994,0.164019,0.205304,0.587478,0.221173,0.159211,0.164019,0.174783,0.936034,0.187827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
497,2024-03-07 00:00:00-05:00,0.940832,0.945917,1.000000,1.000000,0.319907,0.949111,0.942886,0.976414,1.000000,0.942575,0.949111,0.959028,0.996768,0.936875
498,2024-03-08 00:00:00-05:00,1.000000,1.000000,0.960705,0.936875,0.701365,0.962124,0.956681,0.729753,0.991204,0.955669,0.962124,0.971862,0.682834,0.915339
499,2024-03-11 00:00:00-04:00,0.896527,0.899574,0.931006,0.915339,0.370958,0.972047,0.969793,0.658756,0.958367,0.964615,0.972047,0.983192,0.626397,0.990717
500,2024-03-12 00:00:00-04:00,0.915775,0.936497,0.956187,0.990717,0.363481,0.986353,0.985081,0.766502,0.983521,0.985348,0.986353,0.987902,0.763206,0.978132


# Company Fundamentals

In [58]:
import requests
import json

# replace the "demo" apikey below with your own key from https://www.alphavantage.co/support/#api-key
url = 'https://www.alphavantage.co/query?function=INCOME_STATEMENT&symbol=IBM&apikey=M482KPOEHSDW3G1M'
r = requests.get(url)
data = r.json()

df_normalse = pd.json_normalize(data)

In [59]:
df_quarterly= pd.json_normalize(df_normalse['quarterlyReports'])

In [64]:
df_normalise_2023_q4 = pd.json_normalize(df_quarterly[0])
df_normalise_2023_q3 = pd.json_normalize(df_quarterly[1])
df_normalise_2023_q2 = pd.json_normalize(df_quarterly[2])
df_normalise_2023_q1 = pd.json_normalize(df_quarterly[3])
df_normalise_2022_q4 = pd.json_normalize(df_quarterly[4])
df_normalise_2022_q3 = pd.json_normalize(df_quarterly[5])
df_normalise_2022_q2 = pd.json_normalize(df_quarterly[6])
df_normalise_2022_q1 = pd.json_normalize(df_quarterly[7])


In [65]:
df_comp_fund = pd.concat([df_normalise_2023_q4,df_normalise_2023_q3,df_normalise_2023_q2,df_normalise_2023_q1,
           df_normalise_2022_q4,df_normalise_2022_q3,df_normalise_2022_q2,df_normalise_2022_q1])
           

In [66]:
df_comp_fund = df_comp_fund.drop(columns='reportedCurrency')

In [67]:
scaler = MinMaxScaler()
scaler.fit(df_comp_fund.drop(columns=['fiscalDateEnding','investmentIncomeNet']))
scaled_data = scaler.transform(df_comp_fund.drop(columns=['fiscalDateEnding','investmentIncomeNet']))
df_comp_fund_scaled = pd.DataFrame(scaled_data, columns=df_comp_fund.drop(columns=['fiscalDateEnding','investmentIncomeNet']).columns)

In [68]:
df_comp_fund_reset_index = df_comp_fund.reset_index(drop=True).reset_index()

In [69]:
df_comp_fund_scaled = df_comp_fund_scaled.reset_index()

In [70]:
df_comp_fund= df_comp_fund_reset_index[['index','fiscalDateEnding']].merge(df_comp_fund_scaled, on='index')

In [71]:
df_comp_fund = df_comp_fund.drop(columns = 'index')

In [72]:
df_comp_fund['fiscalDateEnding'] = pd.to_datetime(df_comp_fund['fiscalDateEnding'])

# Merge Data

In [73]:
df_scaled_full['quarter'] = df_scaled_full['Date'].dt.to_period('Q')

In [74]:
df_comp_fund['quarter'] = df_comp_fund['fiscalDateEnding'].dt.to_period('Q')

In [75]:
df = df_scaled_full.merge(df_comp_fund, on='quarter', how='left')

In [76]:
#df = df_scaled_full

# Feature engineering

In [77]:
# Calculate the correlation coefficients between market indicators and stock prices
correlation_matrix = df.corr()
correlation_with_stock_prices = correlation_matrix['next_day_close'].dropna()
relevant_indicators = correlation_with_stock_prices[abs(correlation_with_stock_prices) > 0.5].index.tolist()

# Identify relevant indicators

In [78]:
relevant_indicators

['Open',
 'High',
 'Low',
 'Close',
 'sma_20',
 'sma_50',
 'rsi',
 'macd',
 'upper_band',
 'middle_band',
 'lower_band',
 'next_day_close',
 'researchAndDevelopment',
 'netInterestIncome',
 'interestIncome',
 'interestExpense',
 'depreciation',
 'interestAndDebtExpense']

In [79]:
import statsmodels.formula.api as sm


# Train Model

In [89]:
# Check for NaN values
nan_locations = df.isna().any()

# Get the columns with NaN values
columns_with_nan = nan_locations[nan_locations].index.tolist()

# Get the rows with NaN values
rows_with_nan = df[df.isna().any(axis=1)].index.tolist()


In [94]:
df = df.fillna(0)

In [95]:
df = df[relevant_indicators]

In [96]:
train_size = int(len(df) * 0.8)  # 80% for training
train_data = df[:train_size]
test_data = df[train_size:]

In [97]:
train_data_endog = train_data['next_day_close']
train_data_exog = train_data.drop(columns='next_day_close')

In [98]:
test_data_endog = test_data['next_day_close']
test_data_exog = test_data.drop(columns='next_day_close')

# ARIMA Model

In [99]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.arima.model import ARIMA
import numpy as np
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

# Specify the range of orders to try
p_values = range(3)  # example: try orders (0, 0, 0), (1, 0, 0), and (2, 0, 0)
d_values = range(2)  # example: try orders (1, 0, 0) and (1, 1, 0)
q_values = range(3)  # example: try orders (1, 0, 0), (1, 0, 1), and (1, 0, 2)

best_aic = np.inf
best_order = None

# Iterate over different orders and select the best model based on AIC
for p in p_values:
    for d in d_values:
        for q in q_values:
            try:
                model = ARIMA(endog=train_data_endog,exog=train_data_exog, order=(p, d, q))
                results = model.fit()
                aic = results.aic

                if aic < best_aic:
                    best_aic = aic
                    best_order = (p, d, q)
            except:
                continue

print("best_order= {}".format(best_order))

# Create the ARIMA model with selected indicators
model = ARIMA(train_data['next_day_close'], order=(p, d, q))
results = model.fit()

best_order= (0, 0, 0)


In [101]:
from sklearn.metrics import accuracy_score

# Forecast on the test data
forecast = results.get_forecast(steps=len(test_data), exog=test_data_exog)

# Get the predicted values
predicted_values = forecast.predicted_mean

# Calculate the price change from the predicted values
predicted_price_change = np.diff(predicted_values)

# Create a binary target variable indicating if the price change is positive (1) or not (0)
predicted_price_up = (predicted_price_change > 0).astype(int)

# Calculate the actual price change from the test data
actual_price_change = np.diff(test_data['next_day_close'])

# Create a binary target variable for the actual price change
actual_price_up = (actual_price_change > 0).astype(int)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(actual_price_up, predicted_price_up)

# Print the accuracy
print(f"Accuracy: {accuracy}")

Accuracy: 0.38


# SARIMA Model

In [102]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Specify the order and seasonal order of the SARIMA model
order = (0, 0, 0)  # (p, d, q)
seasonal_order = (1, 0, 0, 6)  # (P, D, Q, seasonal_periods)

# Create the SARIMA model
model = SARIMAX(endog=train_data_endog, exog=train_data_exog, order=order, seasonal_order=seasonal_order)

# Fit the SARIMA model
results = model.fit()

# Forecast on the test data
forecast = results.get_forecast(steps=len(test_data), exog=test_data_exog)

# Get the predicted mean values
predicted_values = forecast.predicted_mean

# Calculate the price change from the predicted values
predicted_price_change = np.diff(predicted_values)

# Create a binary target variable indicating if the price change is positive (1) or not (0)
predicted_price_up = (predicted_price_change > 0).astype(int)

# Calculate the actual price change from the test data
actual_price_change = np.diff(test_data['next_day_close'])

# Create a binary target variable for the actual price change
actual_price_up = (actual_price_change > 0).astype(int)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(actual_price_up, predicted_price_up)

# Print the accuracy
print(f"Accuracy: {accuracy}")

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =           19     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -3.11799D+00    |proj g|=  4.83744D-01


 This problem is unconstrained.



At iterate    5    f= -3.11801D+00    |proj g|=  2.36340D-01

At iterate   10    f= -3.11802D+00    |proj g|=  2.46204D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
   19     12     17      1     0     0   1.250D-02  -3.118D+00
  F =  -3.1180157226170260     

CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH             
Accuracy: 0.54
