In [1]:
# Dependencies
import pandas as pd
import math
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

In [2]:
# Build a SQLAlchemy engine for the project's SQLite file.
# The path is relative, so it is resolved against the current working directory.
database_name = "data/CompanyData.sqlite"
connection_url = f"sqlite:///{database_name}"
engine = create_engine(connection_url, echo=False)

In [3]:
# Read every row of the monte_carlo table into a DataFrame
monte_carlo_query = "SELECT * FROM monte_carlo"
df = pd.read_sql_query(sql=monte_carlo_query, con=engine)

# Preview the first few rows
df.head()

Unnamed: 0,index,mc_run,monthend_date,avg_return,g1_return,g2_return,g3_return,n_stocks,g1_count,g2_count,g3_count,model_loss,model_accuracy
0,0,1,2018-03-31,0.068611,0.967868,-0.468328,0.371416,120,32,66,22,1.123912,0.36024
1,1,1,2018-04-30,1.504863,-6.803393,1.561338,5.202178,122,3,114,5,3.561095,0.348276
2,2,1,2018-05-31,1.125,1.514812,0.051406,2.304121,120,28,53,39,1.110806,0.366399
3,3,1,2018-06-30,3.835372,4.48923,3.821118,2.286639,121,60,36,25,1.515231,0.358642
4,4,1,2018-07-31,2.049188,3.114543,1.024453,3.098762,120,37,61,22,1.1153,0.367297


In [4]:
# Mean of the model_loss and model_accuracy columns over every row of df
avg_loss = df["model_loss"].mean()
avg_accuracy = df["model_accuracy"].mean()
print(f"The average model loss was {avg_loss} and the average model accuracy was {avg_accuracy}.")

The average model loss was 1.225797259984777 and the average model accuracy was 0.36566835823986266.


In [5]:
# To make subsequent coding easier, reshape the wide dataframe into a long one:
# one row per (mc_run, monthend_date, quantile), with the return and the stock
# count each collapsed into a single column.
#
# The frame is built with one pd.concat instead of appending row by row:
# DataFrame.append no longer exists in pandas 2.x, and growing a frame inside
# a loop copies it on every iteration.

# (quantile label, return column, stock-count column).
# Quantile 0 is the whole group of stocks; 1-3 are the tertiles.
quantile_columns = [
    (0, "avg_return", "n_stocks"),
    (1, "g1_return", "g1_count"),
    (2, "g2_return", "g2_count"),
    (3, "g3_return", "g3_count"),
]

# Use a clean 0..n-1 index so the stable sort below can restore the source row order
source_df = df.reset_index(drop=True)

quantile_frames = [
    pd.DataFrame({
        "mc_run": source_df["mc_run"],
        "monthend_date": source_df["monthend_date"],
        "quantile": quantile,
        "return": source_df[return_col],
        "n_stocks": source_df[count_col],
    })
    for quantile, return_col, count_col in quantile_columns
]

# Stack the four frames, then interleave them so each source row contributes
# its quantiles 0, 1, 2, 3 consecutively. mergesort is stable, so the quantile
# order from the concat is kept within each source row.
monthly_detail_df = (
    pd.concat(quantile_frames)
    .sort_index(kind="mergesort")
    .reset_index(drop=True)
)

# The new dataset
monthly_detail_df.head(8)


Unnamed: 0,mc_run,monthend_date,quantile,return,n_stocks
0,1,2018-03-31,0,0.068611,120
1,1,2018-03-31,1,0.967868,32
2,1,2018-03-31,2,-0.468328,66
3,1,2018-03-31,3,0.371416,22
4,1,2018-04-30,0,1.504863,122
5,1,2018-04-30,1,-6.803393,3
6,1,2018-04-30,2,1.561338,114
7,1,2018-04-30,3,5.202178,5


In [6]:
# For every month-end and quantile, average the return across all rows
# (i.e. across the Monte Carlo runs) of the long-format frame
returns_by_month_and_quantile = monthly_detail_df.groupby(["monthend_date", "quantile"])["return"]
monthly_df = returns_by_month_and_quantile.mean().reset_index()
monthly_df.head()

Unnamed: 0,monthend_date,quantile,return
0,2018-03-31,0,0.216085
1,2018-03-31,1,0.692525
2,2018-03-31,2,-0.446196
3,2018-03-31,3,0.994569
4,2018-04-30,0,1.733591


In [7]:
# To calculate the average annual return, the monthly returns are linked
# geometrically: convert each return into a (1 + return/100) growth factor,
# multiply the series of factors together, then raise the product to the
# power (12/n), where n is the number of months.

# Working in logs turns the product into a sum: take log(1 + return/100),
# add the logs up, multiply by (12/n), then convert back to "real" numbers
# by raising e to the power of that number.

# Log of each monthly growth factor, stored in a new column called "log_return".
# math.log raises ValueError for a return of -100% or worse.
monthly_df["log_return"] = (1 + monthly_df["return"] / 100).map(math.log)

# Running sum of the log returns within each quantile.
# NOTE(review): this assumes the rows of monthly_df are already in chronological
# order within each quantile - confirm if the upstream cell changes.
cumulative_log_return = monthly_df.groupby("quantile")["log_return"].cumsum()

# Convert the running log sum into a 'wealth_index': e to the power of the sum,
# multiplied by 100
monthly_df["wealth_index"] = cumulative_log_return.map(math.exp) * 100

# Save to a csv file
monthly_df.to_csv("monthly_returns.csv")

In [8]:
# Average annual return for every quantile:
#   1. take the mean monthly log return per quantile,
#   2. multiply by 12 to annualize it,
#   3. raise e to that power, subtract 1 and multiply by 100 to get a percentage.
avg_returns_df = (
    monthly_df.groupby("quantile")["log_return"]
    .mean()
    .mul(12)
    .map(lambda annual_log_return: (math.exp(annual_log_return) - 1) * 100)
    .rename("avg_return")
    .reset_index()
)

# Save to a csv file
avg_returns_df.to_csv("avg_returns.csv")

In [9]:
# By the way, the cumulative return of the S&P 500 from 1/31/18 through 8/31/19 was 6.7%,
# which is roughly 4.2% annualized over those 19 months.
# Using SPY Adjusted Close (so it includes dividends) from Yahoo Finance
# 1/31/18 = 272.84, 8/31/19 = 291.11, percent change is 6.7%
# 
# The S&P 500 is a market-cap weighted index.  This study assumes equal weights for the stocks.
# Thus, there is a "small-cap" bias in the results.  I'm not sure if that helped or hurt the analysis.
# Depends on how the size factor impacted the market over this time period.

In [10]:
# Display the average annual return (in percent) per quantile; quantile 0 is the whole group
avg_returns_df.head(n=5)

Unnamed: 0,quantile,avg_return
0,0,8.94677
1,1,10.147954
2,2,8.643791
3,3,5.407917


In [11]:
# Calculate each group's monthly excess return ("alpha") versus the average of
# all stocks, and see if there is a statistical difference between the groups,
# using the t-test
for group in (1, 2, 3):
    df[f"alpha_{group}"] = df[f"g{group}_return"] - df["avg_return"]

# Create lists from the dataframe columns
alpha_1_list = df["alpha_1"].tolist()
alpha_2_list = df["alpha_2"].tolist()
alpha_3_list = df["alpha_3"].tolist()
alpha_lists = {1: alpha_1_list, 2: alpha_2_list, 3: alpha_3_list}

# Compare every pair of groups. equal_var=False selects Welch's t-test, which
# does not assume the two samples share a variance.
# NOTE(review): ttest_ind treats the two samples as independent, but both
# alphas in a pair come from the same rows of df - consider whether a paired
# test would be more appropriate.
for first_group, second_group in [(1, 2), (1, 3), (2, 3)]:
    (t_stat, p) = ttest_ind(alpha_lists[first_group], alpha_lists[second_group], equal_var=False)
    print(f"T-test for g{first_group} vs g{second_group} is {t_stat} with a p value of {p}")


T-test for g1 vs g2 is 2.006934566706936 with a p value of 0.04501438481478164
T-test for g1 vs g3 is 3.6580419068977603 with a p value of 0.0002665568779642431
T-test for g2 vs g3 is 2.0562061643221563 with a p value of 0.04002148819955466
