# Multivariate LSTM for Predicting EPS (Earnings per Share) over Company Fundamentals

- In this problem, we will focus on predicting Earnings Per Share (EPS) by jointly modeling historical fundamentals where fundamentals for multiple companies in are stored the "fundamentals.csv" file for each year.
- We will try different number of latent dimensions of LSTM: [5, 10, 30] as well as learning rate and number of epochs.
- We will perform hyperparemeter tuning based on the regression evaluation metric, Mean Absolute Percentage Error (MAPE).

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
from keras.models import Sequential
from keras.layers import Input, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import itertools
import os
from tqdm import tqdm

In [2]:
data_directory = r"C:\Users\sb013698\Desktop\github\Machine Learning in Finance\Datasets"

In [3]:
# Load dataset
data = pd.read_csv(os.path.join(data_directory, 'fundamentals.csv'))
print(data.shape)

(1781, 79)


In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Ticker Symbol,Period Ending,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,Capital Surplus,Cash Ratio,...,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,For Year,Earnings Per Share,Estimated Shares Outstanding
0,0,AAL,2012-12-31,3068000000.0,-222000000.0,-1961000000.0,23.0,-1888000000.0,4695000000.0,53.0,...,7072000000.0,9011000000.0,-7987000000.0,24891000000.0,16904000000.0,24855000000.0,-367000000.0,2012.0,-5.6,335000000.0
1,1,AAL,2013-12-31,4975000000.0,-93000000.0,-2723000000.0,67.0,-3114000000.0,10592000000.0,75.0,...,14323000000.0,13806000000.0,-2731000000.0,45009000000.0,42278000000.0,26743000000.0,0.0,2013.0,-11.25,163022200.0
2,2,AAL,2014-12-31,4668000000.0,-160000000.0,-150000000.0,143.0,-5311000000.0,15135000000.0,60.0,...,11750000000.0,13404000000.0,2021000000.0,41204000000.0,43225000000.0,42650000000.0,0.0,2014.0,4.02,716915400.0
3,3,AAL,2015-12-31,5102000000.0,352000000.0,-708000000.0,135.0,-6151000000.0,11591000000.0,51.0,...,9985000000.0,13605000000.0,5635000000.0,42780000000.0,48415000000.0,40990000000.0,0.0,2015.0,11.39,668129900.0
4,4,AAP,2012-12-29,2409453000.0,-89482000.0,600000.0,32.0,-271182000.0,520215000.0,23.0,...,3184200000.0,2559638000.0,1210694000.0,3403120000.0,4613814000.0,6205003000.0,-27095000.0,2012.0,5.29,73283550.0


In [5]:
# Remove missing rows
clean_df = data.dropna()

# Compute the correlation matrix
correlation_with_eps = clean_df.select_dtypes(include=[np.number]).corr()["Earnings Per Share"]

# Filter the columns that have an absolute correlation greater than 10%
filtered_columns = correlation_with_eps[correlation_with_eps.abs() > 0.1]
clean_df = clean_df[filtered_columns.index.to_list()]
print(clean_df.shape)

(1299, 31)


In [6]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1299 entries, 0 to 1779
Data columns (total 31 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   Accounts Payable                              1299 non-null   float64
 1   Cost of Revenue                               1299 non-null   float64
 2   Depreciation                                  1299 non-null   float64
 3   Earnings Before Interest and Tax              1299 non-null   float64
 4   Earnings Before Tax                           1299 non-null   float64
 5   Gross Profit                                  1299 non-null   float64
 6   Income Tax                                    1299 non-null   float64
 7   Investments                                   1299 non-null   float64
 8   Liabilities                                   1299 non-null   float64
 9   Long-Term Investments                         1299 non-null   float6

In [7]:
# Normalize data
new_df = clean_df.copy()
# Specify the column to be excluded from scaling
column_not_scaled = 'Earnings Per Share'
# Select only the numeric columns (exclude any non-numeric ones)
numeric_columns = new_df.select_dtypes(include=[np.number]).columns.tolist()
# Exclude the target column ('Earnings Per Share') from scaling
columns_to_scale = [col for col in numeric_columns if col != column_not_scaled]
# Extract the subset of the numeric columns to scale
subset_to_scale = new_df[columns_to_scale]
# Initialize and fit the scaler on the subset
scaler = StandardScaler()
scaled_subset = scaler.fit_transform(subset_to_scale)
# Replace the original values with scaled values in the DataFrame
new_df[columns_to_scale] = scaled_subset

# Define features and target & re-order columns
# Define the target column
target = ["Earnings Per Share"]
# Get the column names as a list
features = [col for col in new_df.columns if col not in target]
# Combine target and features into new column order
# Make sure the target is the first column
new_column_order = target + features
new_df = new_df[new_column_order]
print(new_df.shape)

(1299, 31)


In [8]:
new_df.head()

Unnamed: 0,Earnings Per Share,Accounts Payable,Cost of Revenue,Depreciation,Earnings Before Interest and Tax,Earnings Before Tax,Gross Profit,Income Tax,Investments,Liabilities,...,Other Equity,Other Operating Items,Pre-Tax Margin,Profit Margin,Retained Earnings,Sale and Purchase of Stock,Total Current Assets,Total Current Liabilities,Total Revenue,Treasury Stock
0,-5.6,-0.086035,-0.099624,-0.065782,-0.714485,-0.793056,0.489305,-0.637361,0.182437,0.312047,...,-0.843813,0.091423,-0.315517,-0.274099,-0.67803,0.290847,-0.070032,0.336829,0.078146,0.251187
1,-11.25,0.169587,-0.084166,-0.057914,-0.634982,-0.749141,0.586905,-0.522659,-0.342454,-0.323817,...,-0.483568,0.095052,-0.406789,-0.333069,-0.745522,0.290847,0.430957,0.815652,0.120217,0.274049
2,4.02,0.128436,0.052609,0.075425,0.246706,0.144395,1.393538,-0.174953,0.709446,-1.034223,...,-1.443842,0.295535,-0.406789,-0.333069,-0.64491,-0.051547,0.253182,0.775508,0.474674,0.274049
3,11.39,0.186611,-0.081877,0.135469,0.473835,0.377059,1.597871,-1.884679,0.230796,-0.681265,...,-1.509583,0.326832,-0.269881,0.37456,-0.375093,-0.960909,0.131234,0.79558,0.437684,0.274049
4,5.29,-0.174309,-0.319369,-0.401802,-0.312756,-0.284464,-0.313901,-0.223095,0.074423,0.270125,...,0.289618,-0.291852,-0.315517,-0.392038,-0.30352,0.284794,-0.338649,-0.307397,-0.337434,0.272361


In [9]:
# Create sequences
sequences = list()
targets = list()

# Define a sequence length
sequence_length = 30

for i in range(len(new_df) - sequence_length):
    
    seq_features = new_df.iloc[i:(i+sequence_length)].values
    seq_target = new_df.iloc[(i+sequence_length), 0]
    
    sequences.append(seq_features)
    targets.append(seq_target)

# Convert sequences and targets to numpy arrays
X = np.array(sequences, dtype=np.float32)
y = np.array(targets, dtype=np.float32)

print(f"Sequence shape: {X.shape}")
print(f"Target shape: {y.shape}")

Sequence shape: (1269, 30, 31)
Target shape: (1269,)


In [10]:
# Create function to build LSTM models with specific latent dimensions
def build_model(latent_dim, input_shape, lr):
    model = Sequential()
    model.add(
        LSTM(
            units=latent_dim, 
            input_shape=input_shape, 
            activation='tanh', 
            return_sequences=False,
        )
    )
    model.add(Dense(32, activation='tanh'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='linear'))  # Predict EPS for the next time step
    model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
    return model

### Hyperparameter Tuning

In [12]:
# Split the data into training, validation, and test sets
train_size = 0.8
val_size = 0.1
test_size = 0.1

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=(val_size+test_size), random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=(test_size/(val_size+test_size)), random_state=42,
)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
print(f"Test shape: {X_test.shape}")

Train shape: (1015, 30, 31)
Validation shape: (127, 30, 31)
Test shape: (127, 30, 31)


In [13]:
# Create scenarios for hyperparameter tuning
latent_dimensions = [5, 10, 30]
lr_list = [1e-4, 5e-4, 1e-3]
epoch_list = [50, 100, 150, 200]

combinations = list(itertools.product(
    latent_dimensions, lr_list, epoch_list,
))

print(f"Number of combinations: {len(combinations)}")
print(combinations[:5])

Number of combinations: 36
[(5, 0.0001, 50), (5, 0.0001, 100), (5, 0.0001, 150), (5, 0.0001, 200), (5, 0.0005, 50)]


In [14]:
# Check if the results file already exists
results_file = os.path.join(data_directory, "hp_results_eps.csv")

# Initialize an empty dataframe if the file doesn't exist
if os.path.exists(results_file):
    # Load existing results if file exists
    results_df = pd.read_csv(results_file)
else:
    # Create an empty df to store results
    results_df = pd.DataFrame(columns=["Scenario", "Latent Dimension", "Learning Rate", "Number of Epochs", "MAPE"])

scenario_id = 1

for combo in tqdm(combinations):

    # Define parameter values from combinations
    latent_dim, lr, epoch = combo

    # Define and train the LSTM model using given parameter values
    model = build_model(latent_dim, (X_train.shape[1], X_train.shape[2]), lr)  # input_shape=(timesteps, features)
    model.fit(X_train, y_train, epochs=epoch, batch_size=64, verbose=0)

    # Make predictions over the validation set and evaluate model's performance using MAPE
    preds = model.predict(X_val, verbose=0)
    mape = mean_absolute_percentage_error(preds, y_val)

    # Store sceanario results in a dictionary
    result = {
        "Scenario": f"S{scenario_id}",
        "Latent Dimension": latent_dim,
        "Learning Rate": lr,
        "Number of Epochs": epoch,
        "MAPE": mape,
    }

    scenario_id += 1

    # Convert the result to a DataFrame
    scenario_result = pd.DataFrame([result])
    
    # Drop columns that are completely empty or contain all NaN values
    scenario_result = scenario_result.dropna(axis=1, how='all')
    
    # Concatenate the new result with the existing DataFrame
    results_df = pd.concat([results_df, scenario_result], ignore_index=True)
    
    # Save results to CSV after each scenario
    results_df.to_csv(results_file, index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 36/36 [31:14<00:00, 52.06s/it]


### Model Evaluation

In [15]:
# Obtain best hp values
hp_results = pd.read_csv(results_file)
hp_results.sort_values(by="MAPE", ascending=True)[:10]

Unnamed: 0,Scenario,Latent Dimension,Learning Rate,Number of Epochs,MAPE
12,S13,10,0.0001,50,0.699085
2,S3,5,0.0001,150,0.749119
13,S14,10,0.0001,100,0.80994
1,S2,5,0.0001,100,0.816036
24,S25,30,0.0001,50,0.866264
14,S15,10,0.0001,150,0.909409
6,S7,5,0.0005,150,1.050857
15,S16,10,0.0001,200,1.082592
0,S1,5,0.0001,50,1.104181
4,S5,5,0.0005,50,1.152777


In [16]:
# Evaluate the model's performance on the test set using optimal hp values
# Split the data into training and test sets
# This time, use 90% and 10% of data for training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42,
)

# Use optimal HP settings to evaluate the model's overall performance
best_model = build_model(
    latent_dim=10, 
    input_shape=(X_train.shape[1], X_train.shape[2]), 
    lr=1e-4,
)
best_model.fit(X_train, y_train, epochs=50, batch_size=64, verbose=0)

# Make predictions over the test set
preds = best_model.predict(X_test, verbose=1)
test_mape = mean_absolute_percentage_error(preds, y_test)

print(f"Test MAPE: {test_mape:.3f}")

Test MAPE: 0.890


# END