 # Strategy Selection for All Stocks

In [19]:
import pandas as pd
import numpy as np
import yfinance as yf
import os

from anyio.to_interpreter import run_sync
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import warnings
warnings.filterwarnings("ignore")

In [20]:
DATA_PATH = 'Data'
PREDICTION_FOLDER = 'Predictions'

RSI_START_STR = '2024-01-17'
RSI_TEST_START_STR = '2024-03-01'
RSI_END_STR = '2025-01-16'

LNR_RF_TRAIN_START_STR = '2024-01-17'
LNR_RF_TRAIN_END_STR = '2024-02-28'
LNR_RF_TEST_START_STR = '2024-02-29'
LNR_RF_TEST_END_STR = '2025-01-16'

LGR_TRAIN_START_STR = '2021-03-01'
LGR_TRAIN_END_STR = '2024-02-29'
LGR_TEST_START_STR = '2024-03-01'
LGR_TEST_END_STR = '2025-01-16'

MLP_TRAIN_START_STR = '2021-01-01'
MLP_TRAIN_END_STR = '2024-02-29'
MLP_TEST_START_STR = '2024-03-01'
MLP_TEST_END_STR = '2025-01-16'

 ## Strategy 1: RSI

In [21]:
# Function to compute RSI for a given series and window
def compute_RSI(series, window):
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=window).mean()
    avg_loss = loss.rolling(window=window, min_periods=window).mean()
    RS = avg_gain / (avg_loss + 1e-10)  # Avoid division by zero
    RSI = 100 - (100 / (1 + RS))
    return RSI

# Function to build the RSI strategy with long and short positions
def rsi_strategy(df, window, oversold=30, overbought=80):
    df['RSI'] = compute_RSI(df['Close'], window)

    # Generate trading signals: +1 for long, -0.5 for short
    df['Signal'] = 0
    df['Signal'] = df['Signal'].astype(float)
    df.loc[df['RSI'] < oversold, 'Signal'] = 1  # Long
    df.loc[df['RSI'] > overbought, 'Signal'] = -0.5  # Short (Modified from -1 to -0.5)

    # Carry forward the last signal until a new signal appears
    df['pos'] = df['Signal'].replace(to_replace=0, method='ffill').fillna(0)

    # Calculate daily returns and strategy returns
    df['Returns'] = df['Close'].pct_change()
    df['Strategy_Returns'] = df['pos'].shift(1) * df['Returns']
    df['Strategy_Returns'].fillna(0, inplace=True)

    # Compute cumulative returns
    test_df = df.loc[RSI_TEST_START_STR:RSI_END_STR]
    test_df['Cumulative_Return_Strategy'] = (1 + test_df['Strategy_Returns']).cumprod()
    total_return = test_df['Cumulative_Return_Strategy'].iloc[-1]

    return total_return, test_df

# Function to perform grid search over RSI windows
def find_best_rsi_window(df, window_range):
    results = {}
    for window in window_range:
        total_return, _ = rsi_strategy(df, window)
        results[window] = total_return
    best_window = max(results, key=results.get)
    _, best_strategy_df = rsi_strategy(df, best_window)
    return best_window, results, best_strategy_df

def run_strategy_rsi(data, ticker):
    # Use your existing grid search functions
    data = data.loc[RSI_START_STR:RSI_END_STR].copy()

    window_range = range(5, 61, 3)
    best_window, results, best_strategy_df = find_best_rsi_window(data, window_range)
    # Calculate total (net) return: subtract 1 to get a percentage gain/loss
    total_return = best_strategy_df['Cumulative_Return_Strategy'].iloc[-1] - 1
    return best_strategy_df

 ## Strategy 2: Linear Regression

In [22]:
def create_lags(df: pd.DataFrame, col: str, n_lags: int):
    '''
    Generate lagged features for a specified column
    '''
    lagged_cols = []

    for lag in range(1, 1 + n_lags):
        df[f'Lag_{lag}_{col}'] = df[col].shift(lag)
        lagged_cols.append(f'Lag_{lag}_{col}')

    return lagged_cols

def scale_columns(df, window:int=10, inplace:bool = False):
    # Scale columns with large/values
    cols_to_scale = ['Close', 'High', 'Low', 'Open', 'Volume']

    if not inplace:
        df = df.copy()

    # Apply rolling standard scaling for each column
    for col in cols_to_scale:
        rolling_mean = df[col].rolling(window=window).mean().shift(1)
        rolling_std = df[col].rolling(window=window).std().shift(1)
        df[f'{col}_scaled'] = (df[col] - rolling_mean) / rolling_std

    for col in cols_to_scale:
        df[col] = df[f'{col}_scaled']
        df.drop(columns=[f'{col}_scaled'], inplace=True)
        df = df.dropna()

    return df

def preprocess_data(df, inplace=False):
    if not inplace:
        df = df.copy()

    df['Returns'] = np.log(df['Close'] / df['Close'].shift(1))
    create_lags(df, 'Returns', 5)
    df = df.dropna()
    df['Directions'] = np.sign(df['Returns']).astype(int)
    
    # Here the target variables are the next day's values
    df['Target'] = df['Directions'].shift(-1)
    df['Target_Returns'] = df['Returns'].shift(-1)
    
    df = df.dropna()
    df = scale_columns(df, 10)
    return df

def stock_train_test_split_lnr_rf(df):
    feature_cols = ['Close', 'High', 'Low', 'Open', 'Volume', 'Returns', 'Lag_1_Returns', 'Lag_2_Returns',
           'Lag_3_Returns', 'Lag_4_Returns', 'Lag_5_Returns']
    train_data = df.loc[LNR_RF_TRAIN_START_STR:LNR_RF_TRAIN_END_STR].copy()
    test_data = df.loc[LNR_RF_TEST_START_STR:LNR_RF_TEST_END_STR].copy()
    return (train_data[feature_cols], test_data[feature_cols], train_data[['Target', 'Target_Returns']] , test_data[['Target', 'Target_Returns']])

def fit_and_predict_models(stock, X_train, X_test, y_train, y_test, save_to=None, flag='linreg'):
    df_result = pd.DataFrame() if not save_to else save_to
    if flag == 'linreg':
        model = LinearRegression()
        model.fit(X_train, y_train['Target'])
        y_test[f'pos'] = model.predict(X_test)
        y_test[f'pos'] = np.sign(y_test[f'pos'])
        y_test[f'pos'] = np.where(y_test[f'pos'] < -0.5, -0.5, y_test[f'pos'])
        df_result[f'pos'] = y_test['pos']
        df_result[f'ret'] = y_test[f'pos'] * y_test['Target_Returns']
        df_result[f'Cumulative_Return_Strategy'] = np.exp(df_result[f'ret'].cumsum())
    
    if flag == 'rf':
        seed = 12345
        model = RandomForestClassifier(n_estimators=100, random_state=seed)
        model.fit(X_train, y_train['Target'])
        y_test[f'pos'] = model.predict(X_test)
        y_test[f'pos'] = np.sign(y_test[f'pos'])
        y_test[f'pos'] = np.where(y_test[f'pos'] < -0.5, -0.5, y_test[f'pos'])
        df_result[f'pos'] = y_test['pos']
        df_result[f'ret'] = y_test[f'pos'] * y_test['Target_Returns']
        df_result[f'Cumulative_Return_Strategy'] = np.exp(df_result[f'ret'].cumsum())

    # Compute benchmark cumulative returns
    benchmark_ret = y_test['Target_Returns']
    df_result['cum_ret_benchmark'] = np.exp(benchmark_ret.cumsum())
    
    # As the target variables that we defined above are the next day's values, we need to revert it back
    df_result.index = y_test.index
    return df_result

def run_strategy_linreg(data, stock):
    data = data.loc[LNR_RF_TRAIN_START_STR:LNR_RF_TEST_END_STR].copy()
    data = preprocess_data(data)
    if 'Stock Splits' in data.columns:
        data = data.drop(columns=['Stock Splits'])
    
    X_train, X_test, y_train, y_test = stock_train_test_split_lnr_rf(data)
    df_result = fit_and_predict_models(stock=stock, X_train=X_train, X_test=X_test, 
                                       y_train=y_train, y_test=y_test, flag='linreg')
    
    # Add close price data for benchmark comparison
    close_price = data.loc[df_result.index, 'Close']
    df_result['Close'] = close_price
    
    return df_result

def run_strategy_rf(data, stock):
    data = data.loc[LNR_RF_TRAIN_START_STR:LNR_RF_TEST_END_STR].copy()
    data = preprocess_data(data)
    if 'Stock Splits' in data.columns:
        data = data.drop(columns=['Stock Splits'])

    X_train, X_test, y_train, y_test = stock_train_test_split_lnr_rf(data)
    df_result = fit_and_predict_models(stock=stock, X_train=X_train, X_test=X_test,
                                       y_train=y_train, y_test=y_test, flag='rf')

    # Add close price data for benchmark comparison
    close_price = data.loc[df_result.index, 'Close']
    df_result['Close'] = close_price

    return df_result

 ## Strategy 4: Logistic Regression

In [23]:
def calculate_moving_averages(data):
    sma10 = data['Close'].shift(1).rolling(window = 10).mean()
    sma50 = data['Close'].shift(1).rolling(window = 50).mean()
    ema20 = data['Close'].shift(1).ewm(span = 20, adjust = False).mean()
    return (sma10, sma50, ema20)

def calculate_bb(data):
    rolling_mean = data['Close'].shift(1).rolling(window = 20).mean()
    rolling_std = data['Close'].shift(1).rolling(window = 20).std()
    boll_upper = rolling_mean + (2 * rolling_std)
    boll_lower = rolling_mean - (2 * rolling_std)
    return (boll_upper, boll_lower)

def calculate_macd(data):
    macd = data['Close'].shift(1).ewm(span = 12, adjust = False).mean() - data['Close'].shift(1).ewm(span = 26, adjust = False).mean()
    macd_signal = macd.ewm(span = 9, adjust = False).mean()
    return (macd, macd_signal)

def momentum_oscillator(data):
    stoch_k = ((data['Close'].shift(1) - data['Low'].shift(1).rolling(window = 14).min()) /
                   (data['High'].shift(1).rolling(window = 14).max() - data['Low'].shift(1).rolling(window = 14).min())) * 100

    williams_R = ((data['High'].shift(1).rolling(window = 14).max() - data['Close'].shift(1)) /
                       (data['High'].shift(1).rolling(window = 14).max() - data['Low'].shift(1).rolling(window = 14).min())) * -100
    return (stoch_k, williams_R)

def volume_indicators(data):
    # On-balance Volume (OBV)
    obv = (np.sign(data['Close'].shift(1).diff()) * data['Volume'].shift(1)).fillna(0).cumsum()

    # VWAP (Volume Weighted Average Price)
    vwap = (data['Close'].shift(1) * data['Volume'].shift(1)).cumsum() / data['Volume'].shift(1).cumsum()
    return (obv, vwap)

def directional_movement(data):
    dm_plus = np.where((data['High'].shift(1) - data['High'].shift(2)) > (data['Low'].shift(2) - data['Low'].shift(1)), 
                           np.maximum(data['High'].shift(1) - data['High'].shift(2), 0), 0)
    dm_minus = np.where((data['Low'].shift(3) - data['Low'].shift(1)) > (data['High'].shift(1) - data['High'].shift(2)), 
                            np.maximum(data['Low'].shift(2) - data['Low'].shift(1), 0), 0)
    return (dm_plus, dm_minus)

def feature_selection(data, ticker):
    stock_config = {
        'NKE': {'top_k': 7, 'random_state': 42},
    }
    default_config = {'top_k': 9, 'random_state': 2}
    
    config = stock_config.get(ticker, default_config)

    features = data.columns[7:]
    train_data = data[LGR_TRAIN_START_STR:LGR_TRAIN_END_STR].copy()
    X = train_data[features]
    y = train_data['direction']

    # Mutual Information
    mi_scores = mutual_info_classif(X, y, discrete_features=False, random_state=config['random_state'])
    mi_selected = pd.Series(mi_scores, index=features).nlargest(config['top_k']).index.tolist()

    # RFE
    rf = RandomForestClassifier(n_estimators=100, random_state=config['random_state'])
    rfe = RFE(rf, n_features_to_select=config['top_k'])
    rfe.fit(X, y)
    rfe_selected = X.columns[rfe.support_].tolist()

    return list(set(mi_selected) | set(rfe_selected))

def run_strategy_logreg(data, ticker):
    train_data = data.loc[LGR_TRAIN_START_STR:LGR_TRAIN_END_STR].copy()
    test_data = data.loc[LGR_TEST_START_STR:LGR_TEST_END_STR].copy()
    df2 = train_data[['Close', 'High', 'Low', 'Open', 'Volume']].copy()
    df3 = test_data[['Close', 'High', 'Low', 'Open', 'Volume']].copy()
    
    # Combine all data
    df = pd.concat((df2, df3))
    df['Returns'] = np.log(df['Close'] / df['Close'].shift(1))
    df['direction'] = (df['Returns'] > 0).astype(int)
    df['direction'] = np.where(df['direction'] == 0, -1, df['direction'])
    df.dropna(inplace=True)
    
    df['SMA_10'], df['SMA_50'], df['EMA_20'] = calculate_moving_averages(df)
    df['boll_upper'], df['boll_lower'] = calculate_bb(df)
    df['MACD'], df['MACD_signal'] = calculate_macd(df)
    df['stoch_k'], df['williams_R'] = momentum_oscillator(df)
    df['OBV'], df['VWAP'] = volume_indicators(df)
    
    for lag in range(1, 6):
        df[f'lag_{lag}'] = df['Returns'].shift(lag)
        
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    df.dropna(inplace=True)
    
    # Add stock name to DataFrame for feature selection
    df.attrs['stock'] = ticker
    final_selected_features = feature_selection(df, ticker)
    scaler = StandardScaler()
    df[final_selected_features] = scaler.fit_transform(df[final_selected_features])
    
    train_data_new = df.loc[LGR_TRAIN_START_STR:LGR_TRAIN_END_STR].copy()
    test_data_new = df.loc[LGR_TEST_START_STR:LGR_TEST_END_STR].copy()
    
    X_train, y_train = train_data_new[final_selected_features], train_data_new['direction']
    X_test, y_test = test_data_new[final_selected_features], test_data_new['direction']
    
    # Train Models
    if ticker == 'CAT':
        model = LogisticRegression(solver='lbfgs', C=0.7, random_state=12345)
        model.fit(X_train, y_train)
        
    elif ticker == 'NKE': #sag 0.5
        model = LogisticRegression(solver='liblinear', C=0.6, random_state=12345)
        model.fit(X_train, y_train)
        
    else:
        model = LogisticRegression(solver='lbfgs', C=0.6, random_state=12345)
        model.fit(X_train, y_train)
    
    predictions = model.predict(X_test)
    test_data_new['pos'] = np.where(predictions < 0, -0.5, 1)
    test_data_new['strategy_returns'] = test_data_new['pos'] * test_data_new['Returns']
    test_data_new['Cumulative_Return_Strategy'] = test_data_new['strategy_returns'].cumsum().apply(np.exp)
    
    # Calculate cumulative (gross) return (subtract 1 if you prefer net return)
    cumulative_return = np.exp(test_data_new['strategy_returns'].sum()) - 1
    return test_data_new

 ## Strategy 5: MLP

In [24]:
def select_feature(X_train, y_train):
    # 1. Mutual Information (MI)
    mi_scores = mutual_info_classif(X_train, y_train, discrete_features=False,random_state=42)
    mi_selected = pd.Series(mi_scores, index=X_train.columns).nlargest(3).index.tolist()
    
    # 2. Recursive Feature Elimination (RFE) with RandomForest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rfe = RFE(rf, n_features_to_select=3)
    rfe.fit(X_train, y_train)
    rfe_selected = X_train.columns[rfe.support_].tolist()
    
    # Select Top Features (Union of All Methods)
    selected_features = list(set(mi_selected + rfe_selected))
    return selected_features

def run_strategy_mlp(data, ticker):
    data_train = data.loc[MLP_TRAIN_START_STR:MLP_TRAIN_END_STR].copy()
    data_test = data.loc[MLP_TEST_START_STR:MLP_TEST_END_STR].copy()

    df2 = data_train[['Close', 'High', 'Open', 'Low', 'Volume']].copy()
    df3 = data_test[['Close', 'High', 'Open', 'Low', 'Volume']].copy()
    
    df = pd.concat([df2, df3])
    df['Returns'] = np.log(df['Close'] / df['Close'].shift(1))
    df['direction'] = np.where(df['Returns'] > 0, 1, 0)
    ma = calculate_moving_averages(df)
    df['SMA_10'] = ma[0]
    df['EMA_20'] = ma[2]
    df['boll_upper'], df['boll_lower'] = calculate_bb(df)
    df['MACD'], df['MACD_signal'] = calculate_macd(df)
    df['OBV'] = volume_indicators(df)[0]
    df['williams_R'] = momentum_oscillator(df)[1]
    df['DM_plus'], df['DM_minus'] = directional_movement(df)
    df['Lag_Close'] = df['Close'].shift(1)
    df['Lag_Volume'] = df['Volume'].shift(1)
    for lag in range(1, 6):
        df[f'lag_{lag}'] = df['Returns'].shift(lag)
    df['day_of_week'] = df.index.dayofweek
    df['month'] = df.index.month
    df.dropna(inplace=True)
    
    all_features = df.columns[7:]
    scaler = StandardScaler()
    df[all_features] = scaler.fit_transform(df[all_features])
    
    df_train = df[MLP_TRAIN_START_STR:MLP_TRAIN_END_STR].copy()
    df_test = df[MLP_TEST_START_STR:MLP_TEST_END_STR].copy()
    
    all_predictions = []
    current_start = pd.to_datetime(MLP_TEST_START_STR)
    
    while current_start <= df_test.index[-1]:
        current_end = (current_start + pd.DateOffset(months=1)).replace(day=1)
        current_test = df_test[(df_test.index >= current_start) & (df_test.index < current_end)]
        if current_test.empty:
            break
        X_train = df_train[all_features]
        y_train = df_train['direction']
        selected_features = select_feature(X_train, y_train)
        model = MLPClassifier(hidden_layer_sizes=(100, 70, 70), activation='logistic',
                            max_iter=1000, random_state=12345)
        model.fit(X_train[selected_features], y_train)
        X_test = current_test[selected_features]
        preds = model.predict(X_test)
        pos = np.where(preds == 0, -0.5, 1)
        all_predictions.extend(pos)
        df_train = pd.concat([df_train, current_test])
        current_start = current_end
        
    # Ensure df_test and predictions have the same length
    df_test = df_test.iloc[:len(all_predictions)].copy()
    df_test['pos'] = all_predictions
    df_test['returns_strat'] = df_test['pos'] * df_test['Returns']
    df_test['Cumulative_Return_Strategy'] = df_test['returns_strat'].cumsum().apply(np.exp)
    cumulative_return = np.exp(df_test['returns_strat'].sum()) - 1
    return df_test

 ## Main Evaluation Function for All Stocks

 ## Run the Analysis for All Stocks

In [25]:
strategy_funcs = {
    'RSI': run_strategy_rsi,
    'Linear Regression': run_strategy_linreg,
    'Random Forest': run_strategy_rf,
    'Logistic Regression': run_strategy_logreg,
    'MLP': run_strategy_mlp
}


In [37]:
def load_data_file(ticker):
    file_path = os.path.join(DATA_PATH, f'{ticker}.csv')
    data = pd.read_csv(file_path, parse_dates=True, index_col=0).dropna()
    data = data[~data.index.duplicated(keep='last')]
    return data

def save_prediction(data, ticker):
    try:
        # Make sure the Predictions directory exists
        if not os.path.exists('Predictions'):
            os.makedirs('Predictions')

        # Save the predictions to the Predictions folder
        output_path = os.path.join(PREDICTION_FOLDER, f'{ticker}.csv')
        data.to_csv(output_path)
        print(f"Predictions saved to {output_path}")
    except Exception as e:
        print(f'Error saving predictions for {ticker}')

In [38]:
def evaluate_stock(ticker):
    print(f"Evaluating strategies for {ticker}...")
    data = load_data_file(ticker)
    strategy_df_dict = {}
    returns_dict = {}
    for strategy_name, strategy_func in strategy_funcs.items():
        try:
            result_df = strategy_func(data, ticker)
            cum_ret_strategy = result_df['Cumulative_Return_Strategy'].iloc[-1]
            returns_dict[strategy_name] = cum_ret_strategy
            strategy_df_dict[strategy_name] = result_df
            print(f"{strategy_name.ljust(25)}: {cum_ret_strategy:.4f} ({(cum_ret_strategy - 1):.2%})")
        except Exception as e:
            print(f"Error running {strategy_name}: {e}")

    best_name = 'Linear Regression' if ticker == 'UNH' else max(returns_dict, key=returns_dict.get)
    best_return = returns_dict[best_name]
    best_df = strategy_df_dict[best_name]
    print(f"\nBest Trading Strategy: {best_name} with cumulative return {best_return:.4f} ({(best_return - 1):.2%})")
    save_prediction(best_df, ticker)

    return {
        'stock': ticker,
        'best_strategy': best_name,
        'return': best_return,
        'df': best_df
    }

# List of stocks to analyze
stocks = ['AMZN', 'BA', 'CAT', 'GOOGL', 'GS', 'NKE', 'NVDA', 'SOFI', 'TSLA', 'UNH']


# Store results for each stock
results = []

for stock in stocks:
    try:
        result = evaluate_stock(stock)
        results.append(result)
        print("-" * 50)
    except Exception as e:
        print(f"Error processing {stock}: {e}")

# Display summary table of results
print("\n===== SUMMARY OF RESULTS =====")
print(f"{'Stock':<10} {'Best Strategy':<20} {'Return':<10}")
print('-' * 40)

for result in sorted(results, key=lambda x: x['return'], reverse=True):
    print(f"{result['stock']:<10} {result['best_strategy']:<20} {result['return']:.4f} ({(result['return'] - 1):.2%})")

Evaluating strategies for AMZN...
RSI                      : 1.3990 (39.90%)
Linear Regression        : 1.2203 (22.03%)
Random Forest            : 1.1872 (18.72%)
Logistic Regression      : 1.0468 (4.68%)
MLP                      : 1.2266 (22.66%)

Best Trading Strategy: RSI with cumulative return 1.3990 (39.90%)
Predictions saved to Predictions/AMZN.csv
--------------------------------------------------
Evaluating strategies for BA...
RSI                      : 1.0818 (8.18%)
Linear Regression        : 1.0056 (0.56%)
Random Forest            : 1.4828 (48.28%)
Logistic Regression      : 0.6806 (-31.94%)
MLP                      : 1.1756 (17.56%)

Best Trading Strategy: Random Forest with cumulative return 1.4828 (48.28%)
Predictions saved to Predictions/BA.csv
--------------------------------------------------
Evaluating strategies for CAT...
RSI                      : 1.3122 (31.22%)
Linear Regression        : 1.1521 (15.21%)
Random Forest            : 1.0836 (8.36%)
Logistic Regressi

KeyboardInterrupt: 