In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score

In [18]:
# Load the dataset
stocks_data = pd.read_csv("../data/final_merged_data.csv", parse_dates=["Date"])
stocks_data = stocks_data.iloc[:, 1:]

# Data Preparation

In [20]:
# get names of feature columns
features = stocks_data.drop(columns=['Date', 'Ticker', 'Risk_Premium', 'Risk_Premium_lag_1']).columns
target = 'Risk_Premium'

# impute missing values
imputer = SimpleImputer(strategy='mean')
stocks_data[features] = imputer.fit_transform(stocks_data[features])

# ticker names
tickers = stocks_data['Ticker'].unique()

In [21]:
def create_sequences(X, y, n_steps=12):
    """
    Create sliding window sequences for RNN models.
    
    :param X: Feature DataFrame
    :param y: Target series
    :param n_steps: Number of timesteps for each input sequence
    :return: Arrays of input sequences (X_seq) and corresponding targets (y_seq)
    """
    X_seq, y_seq = [], []
    for i in range(len(X) - n_steps):
        X_seq.append(X.iloc[i:i + n_steps].values)  # Get sequence of features
        y_seq.append(y.iloc[i + n_steps])          # Get corresponding target value
    
    return np.array(X_seq), np.array(y_seq)

# Dictionaries to store sequences and scalers per ticker
X_tickers = {}
y_tickers = {}
scalers = {}

# Loop through each ticker first
for ticker in tickers:
    # Select rows for the current ticker
    ticker_data = stocks_data[stocks_data['Ticker'] == ticker]

    # Separate features (X) and target variable (y)
    X_df = ticker_data[features]  # Select feature columns
    y_df = ticker_data[['Risk_Premium']]  # Select target column

    # Create separate scalers for X and y
    scaler_x = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Fit & transform the features and target separately
    X_scaled = scaler_x.fit_transform(X_df)
    y_scaled = scaler_y.fit_transform(y_df)

    # Convert back to DataFrame for sequence creation
    X_scaled_df = pd.DataFrame(X_scaled, columns=X_df.columns, index=X_df.index)
    y_scaled_df = pd.DataFrame(y_scaled, columns=y_df.columns, index=y_df.index)

    # Create sequences
    X_seq, y_seq = create_sequences(X_scaled_df, y_scaled_df)

    # --- Split the sequences into training and testing sets ---
    # Ensure there are at least 160 training and 100 test samples
    X_train = X_seq[:-100]
    y_train = y_seq[:-100]
    X_test = X_seq[-100:]
    y_test = y_seq[-100:]

    # Store results in dictionaries
    X_tickers[ticker] = {'train': X_train, 'test': X_test}
    y_tickers[ticker] = {'train': y_train, 'test': y_test}
    
    # Store scalers for later inverse transformation
    scalers[ticker] = {'X_scaler': scaler_x, 'y_scaler': scaler_y}


# Model Training & Evaluation per Ticker

In [22]:
def create_lstm_model(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))  # Explicit Input layer
    model.add(LSTM(50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(1))  # Single output for risk premium
    model.compile(optimizer="adam", loss="mean_squared_error")
    return model

In [23]:
# Dictionary to store evaluation results for each ticker
results = {}

for ticker in X_tickers:
    X_train = X_tickers[ticker]['train']  # Shape: (num_train_samples, sequence_length, num_features)
    y_train = y_tickers[ticker]['train']
    X_test = X_tickers[ticker]['test']    # Shape: (100, sequence_length, num_features)
    y_test = y_tickers[ticker]['test']
    
    # Get scalers for the ticker
    scaler_y = scalers[ticker]['y_scaler']

    # Define the input shape based on the training set
    input_shape = X_train.shape[1:]  # (sequence_length, num_features)
    
    # Create and train the model
    model = create_lstm_model(input_shape)
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)
    
    # Predict on the test set
    y_pred_scaled = model.predict(X_test).flatten().reshape(-1, 1)

    # Inverse transform predictions and true values
    y_pred = scaler_y.inverse_transform(y_pred_scaled).flatten()
    y_test_original = scaler_y.inverse_transform(y_test).flatten()

    # Compute the R-squared value
    r2 = r2_score(y_test_original, y_pred)
    
    # Store results
    results[ticker] = {
        'r_squared': r2,
        'y_test': y_test_original,
        'y_pred': y_pred
    }
    
    print(f"Ticker: {ticker} | Test Samples: {X_test.shape[0]} | R-squared: {r2:.4f}")


[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 95ms/step
Ticker: AAPL | Test Samples: 100 | R-squared: -1.1284
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
Ticker: ACIW | Test Samples: 100 | R-squared: -0.1318
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step
Ticker: ADBE | Test Samples: 100 | R-squared: -1.4246
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Ticker: ADI | Test Samples: 100 | R-squared: -0.7136
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Ticker: ADP | Test Samples: 100 | R-squared: -0.2394
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step
Ticker: ADSK | Test Samples: 100 | R-squared: -0.4563
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Ticker: ADTN | Test Samples: 100 | R-squared: -1.2356
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Ticker: AEHR | Test Samples: 100 | R-squared: -0

In [None]:
# create a summary DataFrame of R-squared values
results_df = pd.DataFrame({
    'Ticker': list(results.keys()),
    'R_squared': [results[ticker]['r_squared'] for ticker in results],
    'y_test': [results[ticker]['y_test'] for ticker in results],
    'y_pred': [results[ticker]['y_pred'] for ticker in results]
})

In [29]:
# Save the results to a CSV file
results_df.to_csv("../data/model_results/lstm_results.csv", index=False)

# Portfolio Construction

In [28]:
# Calculate cumulative returns for a portfolio comprising top 10% stocks each month
fixed_investment = 1000  # Amount to invest each month

# Create a DataFrame for cumulative returns
investment_results = []  # To store cumulative returns
cumulative_return = 0.0  # Start with zero cumulative return

# Loop through each month in the test period
for month in range(100):
    # Select the stocks for the current month
    current_month_stocks = results_df.copy()
    current_month_stocks['predicted_return'] = current_month_stocks['Ticker'].apply(lambda ticker: results[ticker]['y_pred'][month])
    current_month_stocks['actual_return'] = current_month_stocks['Ticker'].apply(lambda ticker: results[ticker]['y_test'][month])

    if not current_month_stocks.empty:
        # Select the top 10% stocks based on predicted returns
        top_10_percent_threshold = current_month_stocks['predicted_return'].quantile(0.90)
        current_top_stocks = current_month_stocks[current_month_stocks['predicted_return'] >= top_10_percent_threshold]

        # Use the risk premium as the monthly return
        monthly_returns = current_top_stocks['actual_return'].values
        
        # Calculate the average risk premium of the selected top stocks
        if len(monthly_returns) > 0:
            average_monthly_return = monthly_returns.mean()  # Average risk premium
            
            # Calculate total return from the fixed investment
            cumulative_return += average_monthly_return * fixed_investment

            # Store the result for this month
            investment_results.append({
                'month': month,
                'cumulative_return': cumulative_return
            })

# Convert results to DataFrame
investment_df = pd.DataFrame(investment_results)

# Display cumulative returns
print(investment_df)

    month  cumulative_return
0       0          53.855206
1       1          85.219464
2       2         195.984762
3       3         288.340293
4       4         297.375297
..    ...                ...
95     95        1427.536883
96     96        1399.320129
97     97        1502.325265
98     98        1437.949355
99     99        1447.096736

[100 rows x 2 columns]
