In [54]:
import yfinance as yf
import pandas as pd 
import numpy as np
from scipy import stats
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.vector_ar.var_model import VAR

In [55]:
def get_stock_data(ticker, start_date, end_date):
    stock_data = yf.download(ticker, start=start_date, end=end_date)
    return stock_data

def remove_outliers(df):
    z = np.abs(stats.zscore(df))
    threshold = 3
    df = df[(z < threshold).all(axis=1)]
    return df

def engineer_features(df):
    # Calculate daily returns
    df['returns'] = df['Close'].pct_change()
    
    # Calculate the rolling mean and standard deviation
    df['rolling_mean'] = df['Close'].rolling(window=30).mean()
    df['rolling_std'] = df['Close'].rolling(window=30).std()
    
    # Calculate the Bollinger Bands
    df['upper_band'] = df['rolling_mean'] + 2 * df['rolling_std']
    df['lower_band'] = df['rolling_mean'] - 2 * df['rolling_std']
    
    # Calculate the relative strength index (RSI)
    delta = df['Close'].diff()
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)
    avg_gain = gain.rolling(window=14).mean()
    avg_loss = loss.rolling(window=14).mean()
    rs = avg_gain / avg_loss
    df.loc[:, 'rsi'] = 100 - (100 / (1 + rs))
    
    # Drop any missing values
    df.dropna(inplace=True, how='any')
    
    return df

def normalize_df(df):
    df_norm = (df - df.min()) / (df.max() - df.min())
    return df_norm

def stock_pipeline(ticker, start_date, end_date):
    # Get the stock data
    stock_data = get_stock_data(ticker, start_date, end_date)
    
    # Remove any outliers
    stock_data = remove_outliers(stock_data)
    
    # Engineer new features
    stock_data = engineer_features(stock_data)
    
    # Normalize the data
    stock_data_norm = normalize_df(stock_data)

    # Add lags
    
    return stock_data_norm

#from statsmodels.tsa.vector_ar.var_model import VAR

import statsmodels.tsa.stattools as ts

def find_best_features(df, target_column='Close', max_lags=5):
    best_features = []
    for column in df.columns:
        if column != target_column:
            p_values = []
            for lag in range(1, max_lags + 1):
                result = ts.grangercausalitytests(df[[column, target_column]], maxlag=lag)
                p_values.append(result[0]['ssr_ftest'][1])
            if all(p > 0.05 for p in p_values):
                best_features.append(column)
                continue
            else:
                best_features.append(column + "_" + str(p_values.index(min(p_values[i] for i in range(len(p_values)) if p_values[i] <= 0.05)) + 1))
    return df[best_features + [target_column]]



In [56]:
ticker = 'AAPL'
start = "2018-01-01"
end = "2022-12-31"

df = stock_pipeline(ticker, start, end)
df

[*********************100%***********************]  1 of 1 completed




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,returns,rolling_mean,rolling_std,upper_band,lower_band,rsi
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2018-02-16,0.047492,0.045044,0.048841,0.044046,0.039365,0.518781,0.429821,0.028014,0.074830,0.028599,0.035461,0.481545
2018-02-20,0.046963,0.044084,0.048229,0.043048,0.038414,0.415518,0.429319,0.027989,0.074794,0.028570,0.035442,0.516452
2018-02-21,0.048294,0.043844,0.047513,0.041706,0.037135,0.474065,0.424643,0.027918,0.074744,0.028494,0.035378,0.499471
2018-02-22,0.046537,0.043552,0.048736,0.044166,0.039480,0.366932,0.475954,0.027885,0.074649,0.028448,0.035362,0.574549
2018-02-23,0.049728,0.046467,0.051931,0.049328,0.044401,0.413565,0.511887,0.027916,0.074885,0.028517,0.035350,0.628099
...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-23,0.647025,0.653504,0.654397,0.654858,0.662616,0.118295,0.431568,0.769924,0.476753,0.792957,0.723441,0.249538
2022-12-27,0.650165,0.646577,0.647971,0.642264,0.650041,0.139759,0.387489,0.765099,0.505967,0.793351,0.713097,0.267872
2022-12-28,0.638492,0.643970,0.628065,0.614804,0.622624,0.207674,0.320627,0.759644,0.554665,0.796438,0.698512,0.248358
2022-12-29,0.627024,0.640198,0.641056,0.639373,0.647155,0.167436,0.555381,0.754633,0.570834,0.794458,0.690383,0.293150


In [57]:
df_best_features = find_best_features(df)
df_best_features


Granger Causality
number of lags (no zero) 1
ssr based F test:         F=2479.5241, p=0.0000  , df_denom=1198, df_num=1
ssr based chi2 test:   chi2=2485.7333, p=0.0000  , df=1
likelihood ratio test: chi2=1347.0251, p=0.0000  , df=1
parameter F test:         F=2479.5241, p=0.0000  , df_denom=1198, df_num=1


KeyError: 0