In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Main ticker: its close price is the prediction target and the base of the
# "<TICKER>_Close" column name used during feature engineering.
TICKER = 'AMZN'
# Date range handed to the download helpers.
START_DATE = "2017-01-01"
END_DATE = "2022-12-31"
# Row offsets used to build the lagged-close features (lag_1 ... lag_252).
LOOKBACK_LAGS = [1, 2, 3, 5, 7, 14, 21, 30, 120, 180, 252]
# Fraction of rows, taken in order from the start, used for training; the rest is the test set.
TRAIN_TEST_SPLIT_RATIO = 0.8
# External series whose close prices are added as extra features.
# NOTE(review): 'SHOP' is presumably Shopify stock - confirm it is intended here.
ExternalTickers = ['^GSPC', '^VIX', '^TNX', 'SHOP'] # S&P 500, VIX (Volatility Index), 10-Year Treasury Yield, and SHOP

In [None]:
def fetch_data(ticker, start, end):
    """Download OHLCV history for a single ticker via ``yf.download``.

    Args:
        ticker: Symbol passed to ``yf.download``.
        start: Start date passed through to ``yf.download``.
        end: End date passed through to ``yf.download``.

    Returns:
        DataFrame with the columns Open, High, Low, Close and Volume,
        forward-filled and indexed by a DatetimeIndex.

    Raises:
        ValueError: If the download produced no rows.
    """
    print(f"Fetching data for {ticker} from {start} to {end}...")
    raw = yf.download(ticker, start=start, end=end)

    # Fail here with a clear message instead of reporting success and
    # crashing much later on an empty feature table.
    if raw is None or raw.empty:
        raise ValueError(f"No data returned for {ticker} between {start} and {end}.")

    # Flatten columns if MultiIndex (which happens sometimes with yfinance)
    if isinstance(raw.columns, pd.MultiIndex):
        raw.columns = raw.columns.get_level_values(0)

    prices = raw[['Open', 'High', 'Low', 'Close', 'Volume']].ffill()
    prices.index = pd.to_datetime(prices.index)

    print("Data fetched successfully.")
    return prices



def fetch_multiple_tickers(tickers, start, end):
    """Download the close price of several tickers, best effort.

    Each ticker is downloaded independently with ``yf.download``. A ticker
    that raises, returns no rows, or returns only missing close values is
    reported and skipped, so one bad symbol does not abort the others.

    Args:
        tickers: Iterable of symbols to download.
        start: Start date passed through to ``yf.download``.
        end: End date passed through to ``yf.download``.

    Returns:
        Dict mapping each successfully fetched ticker to a one-column
        DataFrame named ``"<ticker>_Close"``, forward-filled and indexed by a
        DatetimeIndex.
    """
    print(f"Fetching data for {len(tickers)} external tickers...")
    data_dict = {}
    for ticker in tickers:
        try:
            raw = yf.download(ticker, start=start, end=end)
            # An empty frame must not count as a success: merged downstream it
            # becomes an all-NaN column and dropna() would then remove every row.
            if raw is None or raw.empty:
                print(f"✗ {ticker} - Error: no data returned")
                continue
            close = raw[['Close']].ffill()
            if close.isna().all().all():
                print(f"✗ {ticker} - Error: close prices are all missing")
                continue
            close.columns = [f'{ticker}_Close']
            close.index = pd.to_datetime(close.index)
            data_dict[ticker] = close
            print(f"✓ {ticker}")
        except Exception as e:
            # Deliberate best effort: report the failure and keep going.
            print(f"✗ {ticker} - Error: {str(e)}")
    print(f"Successfully fetched {len(data_dict)}/{len(tickers)} tickers")
    return data_dict

In [None]:
def compute_rsi(series, window=14):
    """Relative Strength Index based on simple rolling means of gains and losses.

    Args:
        series: Price series.
        window: Number of consecutive price changes averaged per value.

    Returns:
        Series of RSI values in the range 0-100. The first ``window`` entries
        are NaN because ``window`` real price changes are not yet available.
    """
    delta = series.diff()
    # clip() keeps the undefined first difference as NaN, so the first RSI
    # value is built from `window` real changes rather than window-1 changes
    # plus a made-up zero.
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)

    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()
    # A tiny stand-in for a zero average loss avoids division by zero.
    rs = avg_gain / avg_loss.replace(0, 0.000001)
    return 100 - (100 / (1 + rs))

def compute_bollinger_bands(series, window=20, num_std=2):
    """Upper and lower bands around a rolling mean.

    Args:
        series: Price series.
        window: Rolling window length for the mean and standard deviation.
        num_std: How many rolling standard deviations to offset each band.

    Returns:
        Tuple ``(upper_band, lower_band)`` of Series.
    """
    roller = series.rolling(window)
    center = roller.mean()
    band_offset = roller.std() * num_std
    upper_band = center + band_offset
    lower_band = center - band_offset
    return upper_band, lower_band

def compute_macd(series, short=12, long=26, signal=9):
    """MACD line, signal line and histogram from exponential moving averages.

    Args:
        series: Price series.
        short: Span of the fast EMA.
        long: Span of the slow EMA.
        signal: Span of the EMA applied to the MACD line.

    Returns:
        Tuple ``(macd, signal_line, histogram)`` where the histogram is the
        MACD line minus the signal line.
    """
    def _ema(values, span):
        # Recursive (adjust=False) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, short) - _ema(series, long)
    signal_line = _ema(macd_line, signal)
    histogram = macd_line - signal_line
    return macd_line, signal_line, histogram

def create_calendar_features(df):
    """Build calendar columns from the frame's datetime index.

    Args:
        df: DataFrame whose index exposes ``dayofweek``, ``month`` and
            ``quarter`` (for example a DatetimeIndex).

    Returns:
        DataFrame sharing ``df``'s index with the columns ``day_of_week``,
        ``month`` and ``quarter``, in that order.
    """
    dates = df.index
    # Output column name -> index attribute it is read from.
    attribute_by_column = (
        ('day_of_week', 'dayofweek'),
        ('month', 'month'),
        ('quarter', 'quarter'),
    )
    columns = {name: getattr(dates, attr) for name, attr in attribute_by_column}
    return pd.DataFrame(columns, index=dates)

In [None]:

def create_features_with_external(df, lags, external_tickers):
    """Assemble the model table: calendar, lag, indicator and external features.

    The main close series is read from the ``"<TICKER>_Close"`` column, where
    ``TICKER`` is the module-level constant. The ``Target`` column is that
    series shifted back by one row (the next row's close).

    Args:
        df: Combined frame holding the main close column and, optionally,
            ``"<ticker>_Close"`` columns for external tickers.
        lags: Row offsets used for the ``lag_<n>`` columns.
        external_tickers: Tickers whose close columns are used when present
            in ``df``; missing ones are silently skipped.

    Returns:
        DataFrame of features plus ``Target``, with every row containing a
        missing value removed.
    """
    print("\nCreating features...")
    close = df[f'{TICKER}_Close']

    features = create_calendar_features(df)

    # Next row's close is what the model learns to predict.
    features['Target'] = close.shift(-1)

    # Lagged closes.
    for lag in lags:
        features[f'lag_{lag}'] = close.shift(lag)

    # Technical indicators on the main close series.
    for span in (5, 10):
        features[f'ma{span}'] = close.rolling(span).mean()
    features['rsi14'] = compute_rsi(close)
    features['macd'], features['macd_signal'], _ = compute_macd(close)
    features['upper_bb'], features['lower_bb'] = compute_bollinger_bands(close)

    # Level, 5-row mean and price ratio for each available external series.
    for ticker in external_tickers:
        source_col = f'{ticker}_Close'
        if source_col not in df.columns:
            continue
        external_close = df[source_col]
        features[f'{ticker}_close'] = external_close
        features[f'{ticker}_ma5'] = external_close.rolling(5).mean()
        features[f'{ticker}_rel'] = close / external_close

    features = features.dropna()
    print(f"Created {len(features.columns)} features.")
    return features


In [None]:

def train_and_evaluate(data):
    """Fit a random forest on the leading rows and score it on the trailing rows.

    The split is positional: the first ``TRAIN_TEST_SPLIT_RATIO`` share of
    rows trains the model and the remaining rows form the test set.

    Args:
        data: Feature table containing a ``Target`` column.

    Returns:
        Tuple ``(model, X_test, y_test, preds)``.
    """
    print("\nTraining model...")
    target = data['Target']
    predictors = data.drop(columns='Target')

    cutoff = int(len(data) * TRAIN_TEST_SPLIT_RATIO)
    X_train, y_train = predictors.iloc[:cutoff], target.iloc[:cutoff]
    X_test, y_test = predictors.iloc[cutoff:], target.iloc[cutoff:]

    forest_params = dict(
        n_estimators=1000,
        max_depth=30,
        random_state=42,
        n_jobs=-1,
    )
    model = RandomForestRegressor(**forest_params)
    model.fit(X_train, y_train)

    preds = model.predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    mse = mean_squared_error(y_test, preds)
    r2 = r2_score(y_test, preds)

    print("\nModel Performance:")
    print(f"MAE: {mae:.2f}")
    print(f"MSE: {mse:.2f}")
    print(f"R²: {r2:.4f}")

    return model, X_test, y_test, preds

In [None]:

def plot_results(y_test, y_pred):
    """Draw actual and predicted prices against the test index.

    Args:
        y_test: Series of actual values; its index supplies the x axis.
        y_pred: Predicted values aligned position-by-position with ``y_test``.
    """
    dates = y_test.index
    fig = plt.figure(figsize=(14, 6))
    ax = fig.gca()
    ax.plot(dates, y_test, label='Actual', color='blue')
    ax.plot(dates, y_pred, label='Predicted', color='orange', linestyle='--')
    ax.set_title(f"{TICKER} Price Predictions", fontsize=16)
    ax.set_xlabel("Date")
    ax.set_ylabel("Price ($)")
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    plt.show()

def plot_importances(model, features, n=20):
    """Horizontal bar chart of the ``n`` largest feature importances.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: Feature names, in the same order as the importances.
        n: Number of top features to show.
    """
    ranked = pd.Series(model.feature_importances_, index=features).sort_values()
    top_features = ranked.tail(n)

    plt.figure(figsize=(10, 6))
    top_features.plot.barh()
    plt.title(f"Top {n} Important Features")
    plt.tight_layout()
    plt.show()

def merge_external_data(main_data, external_data):
    """Attach external series to the main frame, aligned to the main index.

    External columns are forward-filled across the union of all dates and
    then restricted to the rows of ``main_data``. No row is added for a date
    that exists only in an external series, and a date missing from an
    external series takes that series' most recent earlier value, so
    row-based lags and rolling windows on the main data are not distorted.

    Args:
        main_data: DataFrame for the main ticker; its index defines the rows
            of the result.
        external_data: Dict of ticker -> DataFrame of external columns.

    Returns:
        New DataFrame with ``main_data``'s columns followed by the external
        columns, indexed like ``main_data``.
    """
    print("Merging main data with external data...")
    external_frames = list(external_data.values())
    if not external_frames:
        combined = main_data.copy()
    else:
        external = pd.concat(external_frames, axis=1).sort_index()
        # Fill over the union of dates first so a main-data date can pick up
        # the last value from an earlier, external-only date.
        all_dates = external.index.union(main_data.index)
        external = external.reindex(all_dates).ffill().reindex(main_data.index)
        combined = pd.concat([main_data, external], axis=1)
    print("Data merged successfully.")
    return combined

In [None]:

print("=== Data Collection ===")
main_data = fetch_data(TICKER, START_DATE, END_DATE)
print("Main data columns:", main_data.columns.tolist())  # Debugging: check columns

# fetch_data already returns flat column names, so no MultiIndex handling is
# needed here (the former check could never trigger and picked the ticker level).
# Prefix the main columns with the ticker to match the external "<ticker>_Close" format.
main_data = main_data.add_prefix(f"{TICKER}_")

external_data = fetch_multiple_tickers(ExternalTickers, START_DATE, END_DATE)

combined_data = merge_external_data(main_data, external_data)

print("Combined columns:", combined_data.columns.tolist())  # Debugging: check columns

print("\n=== Feature Engineering ===")
featured_data = create_features_with_external(combined_data, LOOKBACK_LAGS, ExternalTickers)

print("\n=== Model Training ===")
model, X_test, y_test, preds = train_and_evaluate(featured_data)

print("\n=== Visualization ===")
plot_results(y_test, preds)
plot_importances(model, X_test.columns)

print("\nFeature List:")
for position, col in enumerate(X_test.columns, start=1):
    print(f"{position}. {col}")