# Description

This notebook creates a deep neural network model (MLP) to predict future returns based on fundamental factors.

I got the data from the SimFin free tier. I believe they give about 5 years of historical data for US
stocks and it is not quite recent

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
# Import the main functionality from the SimFin Python API.
import simfin as sf
# Import names used for easy access to SimFin's data-columns.
from simfin.names import *

In [2]:
# SimFin data-directory.
sf.set_data_dir('~/.simfin/data/')
# SimFin load API key or use free data.
sf.load_api_key(path='~/.simfin/api-key.txt', default_key='free')

In [3]:
# Seaborn set plotting style.
sns.set_style("whitegrid")

## Load Data

In [4]:
%%time
# Data for USA.
market = 'us'
# TTM Income Statements.
df_income_ttm = sf.load_income(variant='ttm', market=market)
df_income_qrt = sf.load_income(variant='quarterly', market=market)
# Quarterly Income Statements.
df_balance_ttm = sf.load_balance(variant='ttm', market=market)
df_balance_qrt = sf.load_balance(variant='quarterly', market=market)
# Quarterly Balance Sheets.
df_cashflow_ttm = sf.load_cashflow(variant='ttm', market=market)
df_cashflow_qrt = sf.load_cashflow(variant='quarterly', market=market)
# Daily Share-Prices.
df_prices = sf.load_shareprices(variant='daily', market=market)

Dataset "us-income-ttm" on disk (7 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
Dataset "us-income-quarterly" on disk (6 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
Dataset "us-balance-ttm" on disk (7 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
Dataset "us-balance-quarterly" on disk (6 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
Dataset "us-cashflow-ttm" on disk (7 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
Dataset "us-cashflow-quarterly" on disk (5 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
Dataset "us-shareprices-daily" on disk (7 days old).
- Loading from disk ... 

  df = pd.read_csv(path, sep=';', header=0,


Done!
CPU times: user 11.2 s, sys: 797 ms, total: 12 s
Wall time: 12.3 s


### Filter tickers
First filter out all tickers that are not in all datasets

Next lets get rid of all tickers that have a price in the datasets below some small value.
It might be better to limit by market cap but this might work well enough

In [5]:
df_prices = df_prices.rename(columns={ADJ_CLOSE: 'AdjClose'})

In [7]:
df_prices = df_prices.query('AdjClose > 0.25')
df_prices.shape

(5208144, 9)

In [13]:

p_ticks = set(df_prices.index.get_level_values(0))
i1_ticks = set(df_income_ttm.index.get_level_values(0))
i2_ticks = set(df_income_qrt.index.get_level_values(0))
b1_ticks = set(df_balance_ttm.index.get_level_values(0))
b2_ticks = set(df_balance_qrt.index.get_level_values(0))
c1_ticks = set(df_cashflow_ttm.index.get_level_values(0))
c2_ticks = set(df_cashflow_qrt.index.get_level_values(0))

all_ticks = p_ticks.intersection(i1_ticks, i2_ticks, b1_ticks, b2_ticks, c1_ticks, c2_ticks)
tickers = list(all_ticks)
len(tickers)

3640

In [14]:
df_income_ttm = df_income_ttm.loc[tickers].copy()
df_income_qrt = df_income_qrt.loc[tickers].copy()
df_balance_ttm = df_balance_ttm.loc[tickers].copy()
df_balance_qrt = df_balance_qrt.loc[tickers].copy()
df_cashflow_ttm = df_cashflow_ttm.loc[tickers].copy()
df_cashflow_qrt = df_cashflow_qrt.loc[tickers].copy()
df_prices = df_prices.loc[tickers].copy()

## Calculate Signals

In [15]:
df_fin_signals = sf.fin_signals(df_prices=df_prices,
                                df_income_ttm=df_income_ttm,
                                df_balance_ttm=df_balance_ttm,
                                df_cashflow_ttm=df_cashflow_ttm,
                                fill_method='ffill')


df_val_signals = sf.val_signals(df_prices=df_prices,
                                df_income_ttm=df_income_ttm,
                                df_balance_ttm=df_balance_ttm,
                                df_cashflow_ttm=df_cashflow_ttm)

df_growth_signals = \
    sf.growth_signals(df_prices=df_prices,
                      df_income_ttm=df_income_ttm,
                      df_income_qrt=df_income_qrt,
                      df_balance_ttm=df_balance_ttm,
                      df_balance_qrt=df_balance_qrt,
                      df_cashflow_ttm=df_cashflow_ttm,
                      df_cashflow_qrt=df_cashflow_qrt)



  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

## Create DF of factors we want

In [16]:
df_factors = pd.DataFrame(index=df_prices.index)
df_factors['AdjClose'] = df_prices['AdjClose']
df_factors["Return 3"] = sf.rel_change(df=df_prices['AdjClose'], freq='bdays', bdays=63, future=True)
df_factors["Return 6"] = sf.rel_change(df=df_prices['AdjClose'], freq='bdays', bdays=126, future=True)
df_factors["Return 12"] = sf.rel_change(df=df_prices['AdjClose'], freq='bdays', bdays=252, future=True)
df_factors.describe()

Unnamed: 0,AdjClose,Return 3,Return 6,Return 12
count,3715422.0,3487405.0,3261052.0,2813830.0
mean,237.8708,0.03186336,0.06737002,0.164715
std,8041.123,0.4408853,0.670864,1.229736
min,0.26,-0.999,-0.9994836,-0.9989633
25%,9.24,-0.147291,-0.2133484,-0.307919
50%,23.07,0.001013171,0.001128987,-0.0004062358
75%,58.47,0.1431353,0.2121212,0.3361234
max,1587600.0,53.51724,87.65306,124.6889


In [17]:
df_growth_wins = sf.winsorize(df=df_growth_signals, quantile=0.01)
df_fin_wins = sf.winsorize(df=df_fin_signals, quantile=0.01)
df_value_wins = sf.winsorize(df=df_val_signals, quantile=0.01)

In [18]:
df_factors[EARNINGS_GROWTH_QOQ] =  df_growth_wins[EARNINGS_GROWTH_QOQ]
df_factors[SALES_GROWTH_QOQ] =  df_growth_wins[SALES_GROWTH_QOQ]

df_factors[CURRENT_RATIO] =  df_fin_wins[CURRENT_RATIO]
df_factors[DEBT_RATIO] =  df_fin_wins[DEBT_RATIO]
df_factors[GROSS_PROFIT_MARGIN] =  df_fin_wins[GROSS_PROFIT_MARGIN]
df_factors[NET_PROFIT_MARGIN] =  df_fin_wins[NET_PROFIT_MARGIN]
df_factors[BUYBACK_RATIO] =  df_fin_wins[BUYBACK_RATIO]
df_factors[ROA] =  df_fin_wins[ROA]
df_factors[ROE] =  df_fin_wins[ROE]

df_factors[MARKET_CAP] =  df_value_wins[MARKET_CAP]
df_factors[PE] =  df_value_wins[PE]
df_factors[PFCF] =  df_value_wins[PFCF]
df_factors[PSALES] =  df_value_wins[PSALES]
df_factors[PRICE_BOOK] =  df_value_wins[PRICE_BOOK]

## Massage the data to be more ML friendly

In [22]:
df_factors.describe()

Unnamed: 0,AdjClose,Return 3,Return 6,Return 12,Earnings Growth QOQ,Sales Growth QOQ,Current Ratio,Debt Ratio,Gross Profit Margin,Net Profit Margin,Share Buyback / FCF,Return on Assets,Return on Equity,Market-Cap,P/E,P/FCF,P/Sales,Price to Book Value
count,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0,1726555.0
mean,56.32307,0.02457287,0.05158134,0.1236829,-0.1202936,0.04111684,2.508991,0.332051,0.4268286,-0.168188,0.3165777,-0.01815578,-0.01694145,21.37117,10.94936,12.18271,4.434769,4.431461
std,65.05443,0.2428971,0.3653501,0.5750646,1.801426,0.2164378,2.070461,0.2154548,0.2377891,0.7010802,1.055455,0.1602342,0.4583977,2.074324,46.02368,41.88652,6.110786,6.006821
min,2.34,-0.4290585,-0.5610708,-0.722185,-4.808897,-0.4214709,0.4998063,0.01721741,0.04706318,-3.047107,-2.157863,-0.523523,-1.431009,17.23895,-115.0737,-94.74474,0.1703045,-4.30367
25%,12.39,-0.1286549,-0.190003,-0.2545829,-0.7058116,-0.05564435,1.185285,0.1594612,0.2334284,-0.07734627,0.0,-0.04542085,-0.1071314,19.92572,-5.135855,-3.243053,0.8027585,1.286382
50%,31.27,0.01164144,0.01541522,0.02855713,-0.09585704,0.02552617,1.818491,0.3159765,0.394937,0.03706226,0.04191196,0.0254122,0.06906786,21.4876,12.56294,10.91749,1.971963,2.530204
75%,72.89,0.154446,0.2327434,0.3678213,0.3427562,0.1138775,2.961384,0.4673797,0.5997297,0.1059749,0.5106196,0.06894598,0.1815572,22.89672,28.57545,25.42574,5.06745,5.237665
max,259.67,0.606187,0.9830666,1.721371,4.941176,0.6425321,9.126078,0.8238911,0.9069016,0.327414,3.591735,0.1891279,0.8641251,25.03662,125.042,127.9845,25.84951,24.58208


In [19]:
df_factors = df_factors.dropna()

In [20]:
df_factors[MARKET_CAP] = np.log(df_factors[MARKET_CAP])

In [21]:
df_factors = sf.winsorize(df=df_factors, quantile=0.04)

In [49]:
# at some point we lost a bunch of tickers, not sure where but for now lets get a new set
tickers = list(set(df_factors.index.get_level_values(0)))

In [50]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [51]:
train_tickers, test_tickers = train_test_split(tickers, train_size=0.8, random_state=42)
len(train_tickers), len(test_tickers), len(tickers)

(1932, 483, 2415)

In [55]:
ts = pd.Timestamp('2019-01-02')

In [56]:
ts

Timestamp('2019-01-02 00:00:00')

In [77]:
ts = ts - pd.offsets.BusinessDay(n=20)
ts

Timestamp('2022-07-27 00:00:00')

In [82]:
ts < pd.Timestamp('2022-07-28')

True

## Create the train and test datasets

In [84]:
x_cols = [EARNINGS_GROWTH_QOQ, 
          SALES_GROWTH_QOQ,
         CURRENT_RATIO,
         DEBT_RATIO,
         GROSS_PROFIT_MARGIN,
         NET_PROFIT_MARGIN,
         BUYBACK_RATIO,
         ROA,
         ROE,
         MARKET_CAP,
         PE,
         PFCF,
         PSALES,
         PRICE_BOOK]

y_col = "Return 3"

In [87]:
def create_dataset(df, tickers, x_cols, y_col):
    df_subset = df.loc[tickers].copy()
    df_xs = df_subset[x_cols]
    df_ys = df_subset[y_col]
    return df_xs, df_ys

In [None]:
X_train, y_train = create_dataset(df_factors, train_tickers, x_cols, y_col)
X_test, y_test = create_dataset(df_factors, test_tickers, x_cols, y_col)

## Scale the data

In [94]:
def scale_signal(df):
    """Apply the signal-scaler to the given DataFrame."""

    # Apply the scaler. This returns a numpy array.
    array = signal_scaler.transform(df)
    
    # Create a DataFrame with the correct column-names and index.
    df_scaled = pd.DataFrame(data=array,
                             columns=df.columns,
                             index=df.index)
    return df_scaled

In [93]:
signal_scaler = StandardScaler()
signal_scaler.fit(X_train)

In [95]:
X_train = scale_signal(X_train)
X_test = scale_signal(X_test)

## Build the Neural Network

In [108]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

In [228]:
X_train_ten = torch.tensor(X_train.values, dtype=torch.float32)
y_train_ten = torch.tensor(y_train.values, dtype=torch.float32)
y_train_ten = torch.unsqueeze(y_train_ten, 1)
X_test_ten = torch.tensor(X_test.values, dtype=torch.float32)
y_test_ten = torch.tensor(y_test.values, dtype=torch.float32)
y_test_ten = torch.unsqueeze(y_test_ten, 1)

In [229]:
y_train_ten.shape

torch.Size([1395412, 1])

In [230]:
# Custom Dataset
class MyDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [231]:
train_ds = MyDataset(X_train_ten, y_train_ten)
test_ds = MyDataset(X_test_ten, y_test_ten)

In [232]:
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=512, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=512, shuffle=False)

In [239]:
class MLP(nn.Module):
  '''
    Multilayer Perceptron.
  '''
  def __init__(self):
    super().__init__()
    self.layers = nn.Sequential(
      nn.Linear(14, 32),
      nn.ReLU(),
      nn.Linear(32, 16),
      nn.ReLU(),
      nn.Linear(16, 1)
    )


  def forward(self, x):
    '''Forward pass'''
    return self.layers(x)

In [240]:
torch.manual_seed(42)

<torch._C.Generator at 0x3168b5cf0>

In [241]:
# Initialize the MLP
mlp = MLP()
# Define the loss function and optimizer
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3)

In [242]:
# Run the training loop
for epoch in range(0, 10): # 5 epochs at maximum
    
    # Print epoch
    print(f'Starting epoch {epoch+1}')
    
    # Set current loss value
    current_loss = 0.0
    current_test_loss = 0.0
    
    mlp.train()
    # Iterate over the DataLoader for training data
    for i, data in enumerate(train_loader, 0):
      
        # Get inputs
        inputs, targets = data
      
        # Zero the gradients
        optimizer.zero_grad()
      
        # Perform forward pass
        outputs = mlp(inputs)

        #print(f'out shape {outputs.shape}')
        #print(f'targets shape {targets.shape}')
      
        # Compute loss
        loss = loss_function(outputs, targets)
      
        # Perform backward pass
        loss.backward()
      
        # Perform optimization
        optimizer.step()
      
        # Print statistics
        current_loss += loss.item()

    train_loss = current_loss / i
    print(f'{epoch=}, train loss {train_loss}')

    current_test_loss = 0.0
    mlp.eval()
    for i, data in enumerate(test_loader, 0):
        # Get inputs
        inputs, targets = data

        # Perform forward pass
        outputs = mlp(inputs)

        # Compute loss
        loss = loss_function(outputs, targets)
        current_test_loss += loss.item()

    test_loss = current_test_loss / i
    print(f'{epoch=}, test loss {test_loss}')

    


      
# Process is complete.
print('Training process has finished.')

Starting epoch 1
epoch=0, train loss 0.05737535695827335
epoch=0, test loss 0.05711236883289305
Starting epoch 2
epoch=1, train loss 0.05611503255066522
epoch=1, test loss 0.05745520958772085
Starting epoch 3
epoch=2, train loss 0.05544866596750163
epoch=2, test loss 0.0583243990366004
Starting epoch 4
epoch=3, train loss 0.05504066009723812
epoch=3, test loss 0.05843496419872869
Starting epoch 5
epoch=4, train loss 0.054782804756536393
epoch=4, test loss 0.05878835539665108
Starting epoch 6
epoch=5, train loss 0.05463810332051111
epoch=5, test loss 0.05900845287119733
Starting epoch 7
epoch=6, train loss 0.0544955742058404
epoch=6, test loss 0.0590932720907367
Starting epoch 8
epoch=7, train loss 0.05438000734929645
epoch=7, test loss 0.05926096222245195
Starting epoch 9
epoch=8, train loss 0.05426521712623605
epoch=8, test loss 0.0592813022894606
Starting epoch 10
epoch=9, train loss 0.05418656124160924
epoch=9, test loss 0.05926060552951006
Training process has finished.


# Conclusion
