In [1]:
# Dependencies
import pandas as pd
import datetime
from dateutil.relativedelta import relativedelta

from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Define a function to add "n" months to a date
def add_months (start_date, n_months):
    """Return the month-end date that lies ``n_months`` months away from ``start_date``.

    Parameters
    ----------
    start_date : str
        Date formatted as "YYYY-MM-DD". Only the year (characters 0-3) and the
        month (characters 5-6) are read; the day-of-month is ignored.
    n_months : int
        Number of months to shift by. May be positive, zero or negative.

    Returns
    -------
    str
        The last calendar day of the target month, formatted as "YYYY-MM-DD".

    Raises
    ------
    ValueError
        If the year/month fields of ``start_date`` are not integers, or the
        resulting date falls outside the range supported by ``datetime``.
    """
    year = int(start_date[0:4])
    month = int(start_date[5:7])

    # Work with an absolute, zero-based month index so year roll-over is plain
    # integer arithmetic. divmod floors, so negative offsets borrow from the
    # year correctly (the previous logic only handled overflow past December).
    month_index = year * 12 + (month - 1) + n_months

    # The last day of the target month is one day before the first day of the
    # month that follows it; this handles month lengths and leap years.
    next_year, next_month0 = divmod(month_index + 1, 12)
    first_of_next_month = datetime.datetime(next_year, next_month0 + 1, 1)
    new_date = first_of_next_month - datetime.timedelta(days=1)
    return new_date.strftime("%Y-%m-%d")

In [3]:
# Connect to the SQLite database
# The path is relative, so it is resolved against the notebook's working directory.
database_name = "data/CompanyData.sqlite"
# echo=False keeps SQLAlchemy from logging every SQL statement it emits.
engine = create_engine(f"sqlite:///{database_name}", echo=False)

In [4]:
# Read the full MasterData_ML table and keep only the rows that have no missing values
master_query = "SELECT * FROM MasterData_ML"
master_df = pd.read_sql_query(master_query, engine).dropna()

# Preview the first few rows
master_df.head()

Unnamed: 0,monthend_date,ticker,price,wealth_index,end_wealth_index,total_return,cash,st_debt,lt_debt,equity,...,ebit_ev,name,sector,quantile_total_return,price_1_month_ago,price_3_months_ago,price_12_months_ago,trailing_1_month_return,trailing_3_month_return,trailing_12_month_return
0,2016-04-30,A,40.92,39.693,44.514,12.145718,1931000000.0,80000000.0,1653000000.0,4045000000.0,...,0.041902,Agilent Technologies Inc,Health Care,1,39.85,37.65,41.37,2.685069,8.685259,-1.087745
1,2016-05-31,A,45.89,44.514,43.1432,-3.079481,1931000000.0,80000000.0,1653000000.0,4045000000.0,...,0.037304,Agilent Technologies Inc,Health Care,3,40.92,37.35,41.19,12.14565,22.864793,11.410537
2,2016-06-30,A,44.36,43.1432,46.7904,8.453708,2139000000.0,235000000.0,1654000000.0,4162000000.0,...,0.040637,Agilent Technologies Inc,Health Care,1,45.89,39.85,38.58,-3.33406,11.31744,14.981856
3,2016-07-31,A,48.11,46.7904,45.6913,-2.348986,2139000000.0,235000000.0,1654000000.0,4162000000.0,...,0.037419,Agilent Technologies Inc,Health Care,3,44.36,40.92,40.95,8.453562,17.57087,17.484737
4,2016-08-31,A,46.98,45.6913,45.9121,0.483243,2139000000.0,235000000.0,1654000000.0,4162000000.0,...,0.038333,Agilent Technologies Inc,Health Care,2,48.11,45.89,36.31,-2.348784,2.375245,29.385844


In [5]:
# Print the list of available dates
# Shows, for every distinct monthend_date, how many rows of master_df carry that date.
master_df.groupby('monthend_date').size()

monthend_date
2016-03-31    429
2016-04-30    467
2016-05-31    471
2016-06-30    472
2016-07-31    472
2016-08-31    472
2016-09-30    472
2016-10-31    472
2016-11-30    473
2016-12-31    476
2017-01-31    476
2017-02-28    476
2017-03-31    477
2017-04-30    477
2017-05-31    477
2017-06-30    477
2017-07-31    477
2017-08-31    477
2017-09-30    477
2017-10-31    477
2017-11-30    477
2017-12-31    478
2018-01-31    478
2018-02-28    477
2018-03-31    478
2018-04-30    479
2018-05-31    479
2018-06-30    481
2018-07-31    481
2018-08-31    482
2018-09-30    483
2018-10-31    483
2018-11-30    483
2018-12-31    483
2019-01-31    482
2019-02-28    482
2019-03-31    483
2019-04-30    484
2019-05-31    483
2019-06-30    484
2019-07-31    484
2019-08-31    484
dtype: int64

### Find the list of tickers, split into train & test

In [6]:
# Initialize some items
# "mc" stands for "Monte Carlo"
# NOTE: despite the "_df" suffix, Series.unique() returns an array of the distinct tickers, not a DataFrame.
tickers_df = master_df['ticker'].unique()
# Holds the Monte Carlo results; it is populated by the simulation cell.
mc_results_df = pd.DataFrame()

In [7]:
# Monte Carlo simulation of a rolling two-year train/test study.
#
# For every Monte Carlo run the two-year window is rolled forward one month at
# a time across the data range.  For every window the tickers are randomly split
# into train/test sets, a small dense network is trained to predict the
# return tertile ("quantile_total_return"), and the realised returns of the
# predicted groups on the window's last date are recorded.

# Simulation settings (loop-invariant, so they are defined once up front)
n_mc_runs = 30        # number of Monte Carlo repetitions
window_months = 23    # first_date + 23 months spans a 24-month (2-year) window
n_epochs = 40         # training epochs per model
n_nodes = 100         # units in each hidden layer
n_groups = 3          # number of return groups; labels are expected to be 1..n_groups

# The range of the data
start_date = '2016-04-30'
end_date = '2019-08-31'

# Feature columns fed to the model
cols = ['earnings_yield','book_yield','revenue_ev','ebit_ev','net_debt_capital', \
       'trailing_1_month_return','trailing_3_month_return','trailing_12_month_return']

# One dict per (Monte Carlo run, window).  Collecting plain dicts and building
# the DataFrame once at the end avoids the quadratic cost of growing a frame
# row by row, and avoids DataFrame.append, which pandas 2.0 removed.
mc_rows = []

for mc_run in range (1, n_mc_runs + 1):
    print (f"\nMonte Carlo run number {mc_run}")

    first_date = start_date
    last_date = add_months (first_date, window_months)

    # Dates are "YYYY-MM-DD" strings, so string comparison orders them chronologically
    while last_date <= end_date:

        # We are going to test 2 years' worth of data, analyze results, then roll forward one month
        now = datetime.datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print (f"\nFirst_date is {first_date} and last_date is {last_date}. Current time is {current_time}.")

        # Split the tickers into "train" and "test" sets (a fresh random split for every window)
        tickers_train, tickers_test  = train_test_split(tickers_df)

        # Now, split the master_df into train & test, based on the split of the tickers
        master_train_df = master_df[master_df['ticker'].isin(tickers_train)]
        master_test_df = master_df[master_df['ticker'].isin(tickers_test)]

        # The train dataset has to be one month behind the test dataset, as would happen IRL
        first_date_m1 = add_months(first_date, -1)
        last_date_m1 = add_months(last_date, -1)
        train_dates = master_train_df['monthend_date']
        master_train_2yrs_df = master_train_df[(first_date_m1 <= train_dates) & (train_dates <= last_date_m1)]

        test_dates = master_test_df['monthend_date']
        master_test_2yrs_df = master_test_df[(first_date <= test_dates) & (test_dates <= last_date)]

        X_master_train = master_train_2yrs_df[cols]
        X_master_test = master_test_2yrs_df[cols]
        y_master_train = master_train_2yrs_df["quantile_total_return"]
        y_master_test = master_test_2yrs_df["quantile_total_return"]

        # I intentionally did not "scale" the data, since it ruins the model
        # (the "_scaled" names are kept for continuity; the values are the raw features)
        X_train_scaled = X_master_train.to_numpy()
        X_test_scaled = X_master_test.to_numpy()

        # Step 1: Label-encode data set
        label_encoder = LabelEncoder()
        label_encoder.fit(y_master_train)
        encoded_y_train = label_encoder.transform(y_master_train)
        encoded_y_test = label_encoder.transform(y_master_test)

        # Step 2: Convert encoded labels to one-hot-encoding
        # num_classes pins the width to the softmax layer even if a class is absent from one split
        y_train_categorical = to_categorical(encoded_y_train, num_classes=n_groups)
        y_test_categorical = to_categorical(encoded_y_test, num_classes=n_groups)

        # Create model and add layers
        model = Sequential()
        model.add(Dense(units=n_nodes, activation='relu', input_dim=len(cols)))
        model.add(Dense(units=n_nodes, activation='relu'))
        model.add(Dense(units=n_nodes, activation='relu'))
        model.add(Dense(units=n_groups, activation='softmax'))

        # Compile and fit the model
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])

        model.fit(
            X_train_scaled,
            y_train_categorical,
            epochs=n_epochs,
            shuffle=True,
            verbose=0)

        # Quantify the Trained Model
        model_loss, model_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=2)

        # Make Predictions
        # The arg-max over the softmax outputs is the predicted class index; this is what
        # Sequential.predict_classes returned before it was removed from TensorFlow.
        encoded_predictions = model.predict(X_test_scaled, verbose=0).argmax(axis=1)
        prediction_labels = label_encoder.inverse_transform(encoded_predictions)
        actual_labels = list(y_master_test)
        model_results_df = pd.DataFrame({'predicted':prediction_labels, 'actual':actual_labels, \
                                         'monthend_date':master_test_2yrs_df['monthend_date'], \
                                         'ticker':master_test_2yrs_df['ticker'], \
                                         'total_return':master_test_2yrs_df['total_return']})

        # Narrow down the model_results_df to the last_date
        last_date_df = model_results_df[model_results_df['monthend_date']==last_date]

        # Calculate the average stock return
        avg_return = last_date_df['total_return'].mean()

        # Calculate group 1, 2, 3 returns
        returns_by_group = last_date_df['total_return'].groupby(last_date_df['predicted'])
        g_returns = returns_by_group.mean()
        g_count = returns_by_group.count()

        # Since there is a chance that the model will not return any companies in one of the three tertiles, we have to have
        # this code.  If there is no return for a particular tertile, set it equal to the average of all stocks
        # so that the relative return will be zero.
        # Slot 0 is unused so that the list index equals the group label (1..n_groups).
        group_return = [0] * (n_groups + 1)
        group_count = [0] * (n_groups + 1)

        for label, count in g_count.items():
            group_return[int(label)] = g_returns[label]
            group_count[int(label)] = int(count)

        # Test the count rather than the return: a group whose mean return is
        # genuinely 0.0 must not be overwritten with the overall average.
        for i in range (1, n_groups + 1):
            if (group_count[i] == 0):
                group_return[i] = avg_return

        # Record the results for this window
        mc_rows.append({"mc_run": mc_run, "monthend_date": last_date, "avg_return": avg_return,
                        "g1_return": group_return[1], "g2_return": group_return[2], "g3_return": group_return[3],
                        "n_stocks": len(last_date_df.index), "g1_count": group_count[1], "g2_count": group_count[2],
                        "g3_count": group_count[3], "model_loss": model_loss, "model_accuracy": model_accuracy})

        # Increment the first_date and last_date
        first_date = add_months (first_date, 1)
        last_date = add_months (first_date, window_months)

    #End of "While last_date <= end_date"

# Finished with the monte carlo simulations
# Building the frame from scratch keeps this cell idempotent: re-running it does not duplicate rows.
mc_results_df = pd.DataFrame(mc_rows)

# Replace any existing Monte Carlo table.  if_exists='replace' drops the old table when it is
# present and simply creates it when it is not, so the first run no longer fails on a missing table.
mc_results_df.to_sql(name='monte_carlo', con=engine, if_exists='replace')

print (f"\nFinished!")


Monte Carlo run number 1

First_date is 2016-04-30 and last_date is 2018-03-31. Current time is 00:13:40.
2837/1 - 1s - loss: 1.1245 - accuracy: 0.3602

First_date is 2016-05-31 and last_date is 2018-04-30. Current time is 00:14:43.
2900/1 - 1s - loss: 2.3355 - accuracy: 0.3483

First_date is 2016-06-30 and last_date is 2018-05-31. Current time is 00:15:32.
2863/1 - 0s - loss: 1.0830 - accuracy: 0.3664

First_date is 2016-07-31 and last_date is 2018-06-30. Current time is 00:16:23.
2858/1 - 1s - loss: 1.3054 - accuracy: 0.3586

First_date is 2016-08-31 and last_date is 2018-07-31. Current time is 00:17:14.
2856/1 - 1s - loss: 1.1356 - accuracy: 0.3673

First_date is 2016-09-30 and last_date is 2018-08-31. Current time is 00:18:10.
2900/1 - 1s - loss: 1.1139 - accuracy: 0.3793

First_date is 2016-10-31 and last_date is 2018-09-30. Current time is 00:19:03.
2865/1 - 1s - loss: 1.1975 - accuracy: 0.3647

First_date is 2016-11-30 and last_date is 2018-10-31. Current time is 00:19:58.
2882


First_date is 2017-02-28 and last_date is 2019-01-31. Current time is 01:05:41.
2846/1 - 0s - loss: 1.1345 - accuracy: 0.3538

First_date is 2017-03-31 and last_date is 2019-02-28. Current time is 01:06:29.
2902/1 - 0s - loss: 1.3916 - accuracy: 0.3442

First_date is 2017-04-30 and last_date is 2019-03-31. Current time is 01:07:14.
2893/1 - 1s - loss: 1.1036 - accuracy: 0.3387

First_date is 2017-05-31 and last_date is 2019-04-30. Current time is 01:08:00.
2896/1 - 1s - loss: 1.1171 - accuracy: 0.3622

First_date is 2017-06-30 and last_date is 2019-05-31. Current time is 01:08:48.
2904/1 - 0s - loss: 1.1564 - accuracy: 0.3750

First_date is 2017-07-31 and last_date is 2019-06-30. Current time is 01:09:35.
2899/1 - 0s - loss: 1.1492 - accuracy: 0.3522

First_date is 2017-08-31 and last_date is 2019-07-31. Current time is 01:10:23.
2875/1 - 1s - loss: 1.1498 - accuracy: 0.3430

First_date is 2017-09-30 and last_date is 2019-08-31. Current time is 01:11:11.
2885/1 - 0s - loss: 1.0489 - a


First_date is 2016-06-30 and last_date is 2018-05-31. Current time is 01:57:16.
2836/1 - 0s - loss: 1.1856 - accuracy: 0.3667

First_date is 2016-07-31 and last_date is 2018-06-30. Current time is 01:58:04.
2862/1 - 1s - loss: 1.1210 - accuracy: 0.3644

First_date is 2016-08-31 and last_date is 2018-07-31. Current time is 01:58:51.
2925/1 - 0s - loss: 1.0941 - accuracy: 0.3819

First_date is 2016-09-30 and last_date is 2018-08-31. Current time is 01:59:41.
2846/1 - 1s - loss: 1.0875 - accuracy: 0.3753

First_date is 2016-10-31 and last_date is 2018-09-30. Current time is 02:00:29.
2908/1 - 1s - loss: 1.8890 - accuracy: 0.3528

First_date is 2016-11-30 and last_date is 2018-10-31. Current time is 02:01:18.
2877/1 - 0s - loss: 1.1116 - accuracy: 0.3629

First_date is 2016-12-31 and last_date is 2018-11-30. Current time is 02:02:06.
2889/1 - 1s - loss: 1.0951 - accuracy: 0.3486

First_date is 2017-01-31 and last_date is 2018-12-31. Current time is 02:02:54.
2885/1 - 0s - loss: 1.2720 - a


First_date is 2017-04-30 and last_date is 2019-03-31. Current time is 02:50:39.
2880/1 - 0s - loss: 1.1254 - accuracy: 0.3528

First_date is 2017-05-31 and last_date is 2019-04-30. Current time is 02:51:30.
2859/1 - 1s - loss: 1.0974 - accuracy: 0.3554

First_date is 2017-06-30 and last_date is 2019-05-31. Current time is 02:52:20.
2914/1 - 1s - loss: 1.1136 - accuracy: 0.3682

First_date is 2017-07-31 and last_date is 2019-06-30. Current time is 02:53:10.
2888/1 - 1s - loss: 1.0768 - accuracy: 0.3636

First_date is 2017-08-31 and last_date is 2019-07-31. Current time is 02:54:00.
2901/1 - 1s - loss: 1.1824 - accuracy: 0.3768

First_date is 2017-09-30 and last_date is 2019-08-31. Current time is 02:54:50.
2916/1 - 1s - loss: 1.2585 - accuracy: 0.3536

Monte Carlo run number 12

First_date is 2016-04-30 and last_date is 2018-03-31. Current time is 02:55:42.
2844/1 - 1s - loss: 1.0920 - accuracy: 0.3558

First_date is 2016-05-31 and last_date is 2018-04-30. Current time is 02:56:33.
288


First_date is 2016-08-31 and last_date is 2018-07-31. Current time is 03:45:38.
2799/1 - 0s - loss: 1.1079 - accuracy: 0.3783

First_date is 2016-09-30 and last_date is 2018-08-31. Current time is 03:46:32.
2920/1 - 1s - loss: 1.0991 - accuracy: 0.3877

First_date is 2016-10-31 and last_date is 2018-09-30. Current time is 03:47:24.
2880/1 - 1s - loss: 1.1564 - accuracy: 0.3587

First_date is 2016-11-30 and last_date is 2018-10-31. Current time is 03:48:17.
2887/1 - 1s - loss: 1.1671 - accuracy: 0.3627

First_date is 2016-12-31 and last_date is 2018-11-30. Current time is 03:49:08.
2865/1 - 1s - loss: 1.0801 - accuracy: 0.3693

First_date is 2017-01-31 and last_date is 2018-12-31. Current time is 03:50:01.
2877/1 - 1s - loss: 1.1203 - accuracy: 0.3709

First_date is 2017-02-28 and last_date is 2019-01-31. Current time is 03:50:53.
2857/1 - 1s - loss: 1.0874 - accuracy: 0.3581

First_date is 2017-03-31 and last_date is 2019-02-28. Current time is 03:51:49.
2898/1 - 1s - loss: 1.1043 - a


First_date is 2017-06-30 and last_date is 2019-05-31. Current time is 04:44:00.
2855/1 - 1s - loss: 1.1225 - accuracy: 0.3597

First_date is 2017-07-31 and last_date is 2019-06-30. Current time is 04:44:55.
2917/1 - 1s - loss: 1.3160 - accuracy: 0.3514

First_date is 2017-08-31 and last_date is 2019-07-31. Current time is 04:45:49.
2891/1 - 1s - loss: 1.3116 - accuracy: 0.3490

First_date is 2017-09-30 and last_date is 2019-08-31. Current time is 04:46:45.
2927/1 - 1s - loss: 1.1002 - accuracy: 0.3820

Monte Carlo run number 19

First_date is 2016-04-30 and last_date is 2018-03-31. Current time is 04:47:44.
2867/1 - 1s - loss: 1.1199 - accuracy: 0.3715

First_date is 2016-05-31 and last_date is 2018-04-30. Current time is 04:48:39.
2842/1 - 1s - loss: 1.0663 - accuracy: 0.3825

First_date is 2016-06-30 and last_date is 2018-05-31. Current time is 04:49:35.
2882/1 - 1s - loss: 1.0454 - accuracy: 0.3727

First_date is 2016-07-31 and last_date is 2018-06-30. Current time is 04:50:30.
288


First_date is 2016-10-31 and last_date is 2018-09-30. Current time is 05:45:27.
2867/1 - 1s - loss: 1.1043 - accuracy: 0.3544

First_date is 2016-11-30 and last_date is 2018-10-31. Current time is 05:46:27.
2863/1 - 1s - loss: 1.0802 - accuracy: 0.3643

First_date is 2016-12-31 and last_date is 2018-11-30. Current time is 05:47:27.
2862/1 - 1s - loss: 1.1092 - accuracy: 0.3567

First_date is 2017-01-31 and last_date is 2018-12-31. Current time is 05:48:28.
2896/1 - 1s - loss: 1.0930 - accuracy: 0.3709

First_date is 2017-02-28 and last_date is 2019-01-31. Current time is 05:49:29.
2875/1 - 1s - loss: 1.1545 - accuracy: 0.3628

First_date is 2017-03-31 and last_date is 2019-02-28. Current time is 05:50:32.
2886/1 - 1s - loss: 1.0956 - accuracy: 0.3666

First_date is 2017-04-30 and last_date is 2019-03-31. Current time is 05:51:38.
2876/1 - 1s - loss: 1.1279 - accuracy: 0.3620

First_date is 2017-05-31 and last_date is 2019-04-30. Current time is 05:52:45.
2877/1 - 1s - loss: 1.0849 - a


First_date is 2017-08-31 and last_date is 2019-07-31. Current time is 06:57:32.
2857/1 - 1s - loss: 1.1049 - accuracy: 0.3567

First_date is 2017-09-30 and last_date is 2019-08-31. Current time is 06:58:45.
2868/1 - 1s - loss: 1.1031 - accuracy: 0.3741

Monte Carlo run number 26

First_date is 2016-04-30 and last_date is 2018-03-31. Current time is 06:59:57.
2790/1 - 1s - loss: 1.1238 - accuracy: 0.3627

First_date is 2016-05-31 and last_date is 2018-04-30. Current time is 07:01:09.
2825/1 - 1s - loss: 1.1932 - accuracy: 0.3788

First_date is 2016-06-30 and last_date is 2018-05-31. Current time is 07:02:23.
2858/1 - 1s - loss: 1.1017 - accuracy: 0.3258

First_date is 2016-07-31 and last_date is 2018-06-30. Current time is 07:03:37.
2848/1 - 1s - loss: 1.0885 - accuracy: 0.3585

First_date is 2016-08-31 and last_date is 2018-07-31. Current time is 07:04:47.
2921/1 - 1s - loss: 1.0894 - accuracy: 0.3584

First_date is 2016-09-30 and last_date is 2018-08-31. Current time is 07:05:56.
289


First_date is 2016-12-31 and last_date is 2018-11-30. Current time is 08:14:43.
2846/1 - 1s - loss: 1.1176 - accuracy: 0.3493

First_date is 2017-01-31 and last_date is 2018-12-31. Current time is 08:15:57.
2873/1 - 1s - loss: 1.0817 - accuracy: 0.3536

First_date is 2017-02-28 and last_date is 2019-01-31. Current time is 08:17:13.
2874/1 - 1s - loss: 1.1179 - accuracy: 0.3521

First_date is 2017-03-31 and last_date is 2019-02-28. Current time is 08:18:30.
2904/1 - 1s - loss: 1.1045 - accuracy: 0.3619

First_date is 2017-04-30 and last_date is 2019-03-31. Current time is 08:19:44.
2868/1 - 1s - loss: 1.1290 - accuracy: 0.3595

First_date is 2017-05-31 and last_date is 2019-04-30. Current time is 08:20:59.
2844/1 - 1s - loss: 1.1125 - accuracy: 0.3516

First_date is 2017-06-30 and last_date is 2019-05-31. Current time is 08:22:13.
2858/1 - 1s - loss: 1.0501 - accuracy: 0.3527

First_date is 2017-07-31 and last_date is 2019-06-30. Current time is 08:23:29.
2888/1 - 1s - loss: 1.1098 - a