# In [None]:
import pandas as pd
from fredapi import Fred
import json
import yfinance as yf
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import logging
from sklearn.preprocessing import PolynomialFeatures,StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import GridSearchCV,train_test_split,TimeSeriesSplit
from sklearn.pipeline import Pipeline
import scipy.stats as stats

# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases dropped the old names; use the new name when the
# installed Matplotlib offers it and fall back to the legacy one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# The FRED API key is stored under 'fred_api_key' in a local JSON config file.
with open('configs.json','r',encoding='utf-8') as file:
    key = json.load(file)['fred_api_key']

# Module-level FRED client; GetData.get_macroeconomics reads it as a global.
fred = Fred(key)

class GetData:
    """Fetch-and-cache helper for one ticker's prices and for FRED series.

    Every dataset is cached as a CSV file inside ``self.folder`` ('data').
    A dataset is only requested from the network when its CSV file is not
    already on disk.
    """

    # Columns (and their order) written to the OHLC cache file.
    _OHLC_COLUMNS = ['Open','High','Low','Close','Volume']

    def __init__(self,ticker):
        """Remember the ticker symbol and make sure the cache folder exists.

        Args:
            ticker: symbol passed to ``yf.download`` and used in the cache
                file name.
        """
        self.folder = 'data'
        self.ticker = ticker
        self.logger = logging.getLogger(__name__)
        # exist_ok avoids the race between a separate exists() check and makedirs()
        os.makedirs(self.folder,exist_ok=True)

    def get_ticker(self):
        """Return monthly OHLCV data for ``self.ticker``.

        Reads ``<folder>/<ticker>_OHLC.csv`` when it exists; otherwise
        downloads the maximum available monthly history with ``yf.download``
        (``auto_adjust=True``) and writes that file.

        Returns:
            A DataFrame with the columns Open, High, Low, Close, Volume, or
            ``None`` when the download fails or contains no rows.
        """
        path = f'{self.folder}/{self.ticker}_OHLC.csv'
        if os.path.exists(path):
            return pd.read_csv(path,index_col=0,parse_dates=True)

        try:
            ticker_data = yf.download(self.ticker,period='max',interval='1mo',auto_adjust=True,progress=False)
            if ticker_data is None or ticker_data.empty:
                # Never cache an empty frame: the stale file would hide the
                # failure on every later run.
                self.logger.warning('No price data returned for %s',self.ticker)
                return None
            if isinstance(ticker_data.columns,pd.MultiIndex):
                # NOTE(review): newer yfinance releases reportedly return
                # (field, ticker) MultiIndex columns - confirm that level 0
                # holds the field names for the installed version.
                ticker_data.columns = ticker_data.columns.get_level_values(0)
            # Select by name rather than relabelling by position, so a
            # different column order cannot silently mislabel the prices.
            ticker_data = ticker_data[self._OHLC_COLUMNS].copy()
            ticker_data.columns = list(self._OHLC_COLUMNS)
            ticker_data.to_csv(path)
        except Exception as e:
            # Best effort: any download/format/write problem is logged and
            # reported to the caller as None.
            self.logger.warning(e)
            return None

        return ticker_data

    def get_macroeconomics(self,features):
        """Return the requested FRED series joined on their common index.

        Each series is read from ``<folder>/<name>.csv`` when that file
        exists; otherwise it is fetched through the module-level ``fred``
        client and written to that file. A series that cannot be fetched or
        saved is logged and left out of the result.

        Args:
            features: iterable of FRED series identifiers.

        Returns:
            DataFrame with one column per loaded series, restricted to index
            values present in every series (inner join).

        Raises:
            ValueError: if no series could be loaded at all.
        """
        merge = []
        for i in features:
            path = f'{self.folder}/{i}.csv'
            if os.path.exists(path):
                merge.append(pd.read_csv(path,index_col=0,parse_dates=True))
                continue
            try:
                df = fred.get_series(i).rename(i)
                df.to_csv(path)
            except Exception as e:
                self.logger.warning(e)
            else:
                merge.append(df)

        if not merge:
            # pd.concat would fail with an opaque message on an empty list
            raise ValueError('none of the requested FRED series could be loaded')

        return pd.concat(merge,join='inner',axis=1)
    
def model(xdata,ydata,tstsize=0.65):
    """Grid-search a scaled polynomial Ridge regression and score it on a hold-out set.

    The rows are split in their given order (``shuffle=False``): the leading
    part is used for the grid search, the trailing ``tstsize`` share is held
    out for testing. The search runs a StandardScaler -> PolynomialFeatures ->
    Ridge pipeline over the parameter grid below with a 4-split
    ``TimeSeriesSplit``. Best parameters and scores are printed.

    Args:
        xdata: feature matrix, passed to ``train_test_split`` unchanged.
        ydata: target values; converted to a ``(n, 1)`` NumPy array.
        tstsize: ``test_size`` argument for ``train_test_split``.

    Returns:
        ``(yhat, ytest)``: predictions for the held-out rows and the matching
        actual values, or ``(None, None)`` when the inputs cannot be converted
        or split.
    """
    try:
        x = xdata
        y = np.array(ydata).reshape(-1,1)

        xtrain,xtest,ytrain,ytest = train_test_split(x,y,random_state=0,test_size=tstsize,shuffle=False)

    except Exception as e:
        print(e)
        return None,None

    pipe_inputs = [('scaler',StandardScaler()),('poly',PolynomialFeatures(include_bias=False)),('ridge',Ridge())]

    params = {
        'scaler__with_mean':[True,False],
        'scaler__with_std':[True,False],
        'poly__degree':[1,2,3,4],
        'ridge__alpha':[0.1,1,10,100]
        }

    pipe = Pipeline(pipe_inputs)

    grid = GridSearchCV(pipe,params,cv=TimeSeriesSplit(n_splits=4))

    grid.fit(xtrain,ytrain)

    print('\nModel Info')
    print('='*35)
    print(f'best params: {grid.best_params_}\n')
    # best_score_ is the mean cross-validated score of the best parameter
    # set, not a score on the training rows, so it is labelled as CV.
    print(f'best Rsquared CV Score: {grid.best_score_:.2f}')

    # grid.predict delegates to the refitted best estimator
    yhat = grid.predict(xtest)
    print(f'best Rsquared Test Score: {r2_score(y_pred=yhat,y_true=ytest):.2f}\n')
    print(f'MSE: {mean_squared_error(y_pred=yhat,y_true=ytest):.2f}\n')

    return yhat,ytest


def plot_eval(estimate,actual,tickername):
    """Draw a 2x2 diagnostic figure for the predictions and save it as a PNG.

    Panels: KDE of estimated vs. actual values, a residual plot, both series
    plotted against their row position, and a normal Q-Q plot of the
    estimates. The figure is saved to ``plots/econs_<tickername>.png`` at
    300 dpi (the ``plots`` folder is created when missing) and then shown.

    Args:
        estimate: array of predicted values; must provide ``.flatten()``.
        actual: array of observed values; must provide ``.flatten()``.
        tickername: used in the price-chart title and the output file name.
    """
    est = estimate.flatten()
    act = actual.flatten()

    fig,ax = plt.subplots(2,2,figsize=(14,10))
    fig.suptitle('Evaluation')
    kde_ax,resid_ax,price_ax,qq_ax = ax.ravel()

    # Each panel gets its own x-label: only the price chart is indexed by
    # row position, so a blanket 'Index(n)' label would mislabel the others.
    sns.kdeplot(x=est,label='Est.',ax=kde_ax)
    sns.kdeplot(x=act,label='Act.',ax=kde_ax)
    kde_ax.set_xlabel('Price(USD)')
    kde_ax.set_ylabel('Density')
    kde_ax.set_title('KDE')
    kde_ax.legend()

    sns.residplot(x=est,y=act,ax=resid_ax)
    resid_ax.set_xlabel('Est. Price(USD)')
    resid_ax.set_title('Residuals')

    price_ax.plot(np.arange(len(est)),est,label='Est.')
    price_ax.plot(np.arange(len(act)),act,label='Act.')
    price_ax.set_title(f'{tickername} Price Chart')
    price_ax.set_xlabel('Index(n)')
    price_ax.set_ylabel('Price(USD)')
    price_ax.legend()

    stats.probplot(est, dist="norm", plot=qq_ax)
    qq_ax.set_title("Q–Q")
    qq_ax.set_xlabel('Theoretical quantiles')
    qq_ax.set_ylabel('Price(USD)')

    os.makedirs('plots',exist_ok=True)

    plt.tight_layout()
    plt.savefig(f'plots/econs_{tickername}.png',dpi=300)
    fig.show()

def feature_selection(fdata,tdata,*,threshold=0.4,target='Close'):
    """Keep the feature columns that correlate strongly enough with the target.

    ``fdata`` and ``tdata`` are inner-joined on their index and the pairwise
    correlation matrix of the result is computed. A column of ``fdata`` is
    kept when the absolute value of its correlation with ``target`` is at
    least ``threshold``; columns whose correlation is NaN are dropped.

    Args:
        fdata: DataFrame of candidate features.
        tdata: Series or DataFrame that provides the ``target`` column.
        threshold: minimum absolute correlation required to keep a column.
        target: name of the target column in the joined data.

    Returns:
        ``fdata`` restricted to the selected columns (all rows kept).
    """
    df = pd.concat([fdata,tdata],join='inner',axis=1)
    target_corr = df.corr()[target].drop(target)
    # Only names that exist in fdata can be returned; this skips any extra
    # columns that tdata may carry besides the target.
    selection = [
        name for name,value in target_corr.items()
        if name in fdata.columns and abs(value) >= threshold
    ]
    return fdata[selection]
        
        
# Script entry point: load SPY prices and FRED series, align their date
# ranges, select correlated features, fit the model and plot the evaluation.
if __name__ == '__main__':
    ticker = 'SPY'
    logging.basicConfig(level=logging.WARNING)
    
    obj = GetData(ticker)
    # get_ticker() returns None when the download fails, so the 'Close'
    # column may only be selected after that check.
    ohlc_df = obj.get_ticker()
    ticker_df = ohlc_df['Close'] if ohlc_df is not None else None
    
    if ticker_df is not None:
        feature_df = obj.get_macroeconomics([
            'CPIAUCSL',
            'PPIACO',
            'PCEPI',
            'UMCSENT',
            'PAYEMS',
            'FEDFUNDS',
            'PCEPILFE',
            'UNRATE'
        ])
        
        # Trim both datasets to the range they share: features start at the
        # first price date, prices stop at the last feature date.
        startdate = ticker_df.index[0]
        enddate = feature_df.index[-1] 
        
        feature_df = feature_df.loc[(feature_df.index>=startdate)]
        ticker_df = ticker_df.loc[ticker_df.index<=enddate]
        
        print('Index Comparison')
        print('='*35)
        print('Features:',feature_df.index[0],'–',feature_df.index[-1])
        print('Ticker:',ticker_df.index[0],'–',ticker_df.index[-1])
        
        feature_df = feature_selection(feature_df,ticker_df)
        
        # The model pairs rows by position, so both inputs need the same length.
        if len(feature_df) != len(ticker_df):
            print('Lenght mismatch between features and ticker')
            execute = False
        else:
            execute = True
            
        if execute:
            est,act = model(feature_df,ticker_df,0.35)
            # model() signals failure with (None, None); the tuple itself is
            # never None, so its elements have to be checked.
            if est is not None and act is not None:
                plot_eval(est,act,ticker)