In [3]:
import yfinance as yf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import ta
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.datasets import load_diabetes

In [4]:
def get_sma(close,windows):
    """Build one simple-moving-average column per look-back window.

    Parameters
    ----------
    close : pandas.Series
        Closing prices handed to ``ta.trend.SMAIndicator``.
    windows : iterable
        Look-back lengths; each one yields a column named ``SMA_<window>``.

    Returns
    -------
    pandas.DataFrame
        One column per window, in the order given. Empty when ``windows``
        is empty.
    """
    sma_columns = {
        f'SMA_{period}': ta.trend.SMAIndicator(close, period).sma_indicator()
        for period in windows
    }
    return pd.DataFrame(sma_columns)

In [5]:
def get_atr(low,high,close,windows):
    """Build one average-true-range column per look-back window.

    Note the argument order: this wrapper takes ``low`` before ``high``,
    while the values are forwarded to ``ta.volatility.AverageTrueRange``
    as ``(high, low, close, window)``.

    Parameters
    ----------
    low, high, close : pandas.Series
        Price series forwarded to the indicator.
    windows : iterable
        Look-back lengths; each one yields a column named ``ATR_<window>``.

    Returns
    -------
    pandas.DataFrame
        One column per window, in the order given. Empty when ``windows``
        is empty.
    """
    atr_columns = {
        f'ATR_{period}': ta.volatility.AverageTrueRange(
            high, low, close, period
        ).average_true_range()
        for period in windows
    }
    return pd.DataFrame(atr_columns)

In [6]:
def get_rsi(close,windows):
    """Build one relative-strength-index column per look-back window.

    Parameters
    ----------
    close : pandas.Series
        Closing prices handed to ``ta.momentum.RSIIndicator``.
    windows : iterable
        Look-back lengths; each one yields a column named ``RSI_<window>``.

    Returns
    -------
    pandas.DataFrame
        One column per window, in the order given. Empty when ``windows``
        is empty.
    """
    rsi_columns = {
        f'RSI_{period}': ta.momentum.RSIIndicator(close, period).rsi()
        for period in windows
    }
    return pd.DataFrame(rsi_columns)

In [7]:
def get_mfi(low,high,close,volume,windows):
    """Build one money-flow-index column per look-back window.

    Note the argument order: this wrapper takes ``low`` before ``high``,
    while the values are forwarded to ``ta.volume.MFIIndicator`` as
    ``(high, low, close, volume, window)``.

    Parameters
    ----------
    low, high, close, volume : pandas.Series
        Price and volume series forwarded to the indicator.
    windows : iterable
        Look-back lengths; each one yields a column named ``MFI_<window>``.

    Returns
    -------
    pandas.DataFrame
        One column per window, in the order given. Empty when ``windows``
        is empty.
    """
    mfi_columns = {
        f'MFI_{period}': ta.volume.MFIIndicator(
            high, low, close, volume, period
        ).money_flow_index()
        for period in windows
    }
    return pd.DataFrame(mfi_columns)

In [16]:
def get_data(start,end,TI_windows):
    """Download daily gold-futures bars, engineer features and keep the useful ones.

    Steps:
      1. Download daily ``GC=F`` bars from Yahoo Finance between ``start``
         and ``end``.
      2. Add the lagged closes ``t-1`` .. ``t-9`` and MFI / ATR / SMA / RSI
         columns for every window in ``TI_windows``.
      3. Drop every row that contains a missing value.
      4. Keep only the features chosen by BOTH selectors: a random forest
         (importance > 0.001) and a grid-searched Lasso (non-zero coefficient).

    Parameters
    ----------
    start, end
        Date bounds passed unchanged to ``yf.download``.
    TI_windows : iterable
        Look-back windows passed to the technical-indicator helpers.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns plus ``'Close'``.

    Raises
    ------
    ValueError
        If the download is empty or no complete rows remain after dropping
        missing values.
    """
    print("DOWNLOADING THE DATA FROM YAHOO FINANCE ....")
    gold = yf.download('GC=F',interval='1d',start=start, end=end)

    # Recent yfinance releases can return (field, ticker) MultiIndex columns
    # even for a single ticker; keep only the field level so that
    # gold['Close'] is a Series rather than a one-column DataFrame.
    if isinstance(gold.columns, pd.MultiIndex):
        gold.columns = gold.columns.get_level_values(0)

    # 'Adj Close' is not always present (e.g. when prices come back already
    # adjusted), so drop it only if it exists instead of raising KeyError.
    gold = gold.drop(columns=['Adj Close'], errors='ignore')

    if gold.empty:
        raise ValueError(
            "No price data was downloaded for 'GC=F' between "
            + str(start) + " and " + str(end)
        )

    # Lagged closing prices t-1 .. t-9
    for lag in range(1,10):
        gold['t-'+str(lag)] = gold['Close'].shift(lag)

    ## getting technical indicators
    print("ADDING TECHNICAL INDICATORS ...")

    ## Volume indicators
    mfi_df = get_mfi(gold['Low'],gold['High'],gold['Close'],gold['Volume'],TI_windows)
    ## Volatility indicators
    atr_df = get_atr(gold['Low'],gold['High'],gold['Close'],TI_windows)
    ## Trend indicators
    sma_df = get_sma(gold['Close'],TI_windows)
    ## Momentum indicators
    rsi_df = get_rsi(gold['Close'],TI_windows)

    gold = pd.concat([gold, sma_df,atr_df,rsi_df,mfi_df], axis=1)

    ## SELECT THE BEST FEATURES
    gold = gold.dropna()
    if gold.empty:
        raise ValueError(
            "No complete rows remain after dropping missing values; "
            "widen the date range or use shorter TI_windows"
        )

    print("SELECTING THE BEST FEATURES ....")
    # NOTE(review): X still holds the same row's Open/High/Low/Volume and
    # indicators computed from the same row's Close, so the target is partly
    # visible in the features -- confirm this is intended.
    X = gold.drop(columns=['Close'])
    y = gold['Close']
    features = X.columns

    # Selector 1: random-forest importance above a small threshold
    model = RandomForestRegressor(n_estimators=500, random_state=42)
    model.fit(X, y)
    fs_rf = features[model.feature_importances_ > 0.001]

    # Selector 2: Lasso on min-max scaled features, alpha chosen by grid search
    pipeline = Pipeline([
                         ('scaler',MinMaxScaler()),
                         ('model',Lasso(tol=0.01))
    ])

    # NOTE(review): cv=5 does not respect time order; for chronological data
    # sklearn.model_selection.TimeSeriesSplit is usually the safer choice.
    # Left unchanged here so the selected feature set stays the same.
    search = GridSearchCV(pipeline,
                          {'model__alpha':np.arange(0.1,10,0.1)},
                          cv = 5, scoring="neg_mean_squared_error",verbose=0
                          )
    search.fit(X,y)

    coefficients = search.best_estimator_.named_steps['model'].coef_
    importance = np.abs(coefficients)
    fs_lass = np.array(features)[importance > 0]

    # Keep only the features both selectors agree on
    selected_features = np.intersect1d(fs_rf,fs_lass)
    final_data = gold[selected_features].copy()
    final_data['Close'] = gold['Close']
    return final_data