In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LinearRegression
from scipy import stats
import econtools.metrics as mt

In [2]:
def t_test(est_beta, hyp_beta, errors, x):
    n = len(x)
    x_mean = np.mean(x)
    se_numerator = np.sum(np.square(errors)) / (n - 2)
    se_denominator = np.sum(np.square(x - x_mean))
    se = np.sqrt(se_numerator / se_denominator)
    t_value = (est_beta - hyp_beta) / se
    df = (2 * n) - 2
    p_value = 1 - stats.t.cdf(np.abs(t_value), df=df)
    return (t_value, p_value, se)

In [3]:
# Reading in main data
data = pd.read_csv('data.csv')
data = data[['TICKER', 'date', 'RET', 'sprtrn']]

# Getting rid of tickers without the full 61 months of data
sizes = data.groupby('TICKER').size()
to_keep = sizes.loc[sizes == 61].index.tolist()
data = data.loc[data['TICKER'].isin(to_keep)]
data = data.loc[(data['TICKER'] != 'GENE') & (data['TICKER'] != 'RDIB') & (data['TICKER'] != 'NWSA')].reset_index(drop=True)

# Formatting my date column for future merge
data['date'] = pd.to_datetime(data['date'])
data['date'] = data['date'].dt.strftime('%b %Y')

# Reading in the risk free rate data
riskfree = pd.read_csv('riskfreerate.csv')

# Formatting my date column for future merge
riskfree['caldt'] = pd.to_datetime(riskfree['caldt'], format='%Y%m%d')
riskfree['caldt'] = riskfree['caldt'].dt.strftime('%b %Y')

# Renaming my columns for to call them easier in code
data.rename(columns={'TICKER': 'ticker', 'date': 'month', 'RET': 'hp_return', 'sprtrn': 'sp_return'}, inplace=True)
riskfree.rename(columns={'caldt': 'month', 't30ret': 'risk_free_rate'}, inplace=True)

# Merging main data and risk free rate
dataset = data.merge(riskfree, how='left', on='month')

# Converting strings to floats
to_float = ['hp_return', 'sp_return', 'risk_free_rate']   
for column in to_float:
    dataset[column] = dataset[column].astype(float)

# Creating variables of interest
dataset['hp_diff'] = dataset['hp_return'] - dataset['risk_free_rate']
dataset['sp_diff'] = dataset['sp_return'] - dataset['risk_free_rate']

# Preview dataframe
dataset.head()

Unnamed: 0,ticker,month,hp_return,sp_return,risk_free_rate,hp_diff,sp_diff
0,PLXS,Jan 2010,0.194171,-0.036974,2.8e-05,0.194143,-0.037002
1,PLXS,Feb 2010,0.014114,0.028514,1e-05,0.014104,0.028504
2,PLXS,Mar 2010,0.044651,0.058796,6.8e-05,0.044583,0.058728
3,PLXS,Apr 2010,0.029142,0.014759,0.000111,0.029031,0.014648
4,PLXS,May 2010,-0.081715,-0.081976,0.000112,-0.081827,-0.082088


In [4]:
# Getting a random 200 tickers
tickers = dataset['ticker'].unique().tolist()
random.seed(a=42)
tickers_to_keep = random.sample(range(0, len(tickers)), k=200)
ticker_list = [tickers[i] for i in tickers_to_keep]
ticker_list

# Creating a dictionary with the ticker name as a key and it's corresponding dataframe as a value
ticker_sets = {}
for ticker in ticker_list:
    ticker_sets[ticker] = dataset.loc[dataset['ticker'] == ticker].reset_index(drop=True)

# Instantiating sklearn's LinearRegression() class and preparing to record mass results
model = LinearRegression()
idx = 0
final_df = pd.DataFrame(columns=['Ticker', 'Alpha', 'Beta', 'Std Error', 'T-Value', 'P-Value', 'Variance of Residuals', 'Average Return'])

# Running a regression for each dataframe in the ticker_sets dictionary and recording the results to final_df
for ticker, df in ticker_sets.items():
    x = df['sp_diff'].values.reshape(-1, 1)
    y = df['hp_diff'].values.reshape(-1, 1)
    model.fit(x, y)
    predicted = model.predict(x)
    errors = predicted - y
    alpha = model.intercept_.item()
    beta = model.coef_.item()
    t_results = t_test(beta, 1, errors, x)
    se = t_results[2]
    t_value = t_results[0]
    p_value = t_results[1]
    var = (np.var(np.abs(errors))).item()
    average_return = df['hp_diff'].mean()
    final_df.loc[idx] = [ticker, alpha, beta, se, t_value, p_value, var, average_return]
    idx += 1

# Preview dataframe
final_df.head()

Unnamed: 0,Ticker,Alpha,Beta,Std Error,T-Value,P-Value,Variance of Residuals,Average Return
0,MGLN,0.000852,0.655676,0.149272,-2.306695,0.011393,0.000678,0.007561
1,KLIC,0.008369,1.5563,0.386957,1.437626,0.076571,0.006022,0.024294
2,PMD,0.01024,0.807335,0.240695,-0.800452,0.212515,0.00237,0.018501
3,GPOR,0.008598,1.949649,0.380151,2.498086,0.00692,0.004402,0.028548
4,FLL,-0.015778,0.688685,0.383035,-0.812759,0.208983,0.005786,-0.008731


In [5]:
# Using econtools to run second-pass regression
results = mt.reg(final_df, 'Average Return', ['Beta', 'Variance of Residuals'], addcons=True)
results.summary

Unnamed: 0,coeff,se,t,p>t,CI_low,CI_high
Beta,0.000514,0.001617,0.317649,0.7510874,-0.002675,0.003703
Variance of Residuals,0.277903,0.091602,3.033818,0.002740293,0.097257,0.458549
_cons,0.011824,0.001956,6.046013,7.320531e-09,0.007967,0.015681


In [6]:
# Performing a specific t-test for the 'Beta' term to see if it's statistically different from the average market return
mean_returns = dataset[['month', 'sp_diff']].drop_duplicates(keep='first').mean().loc['sp_diff']
beta_coeff = results.beta['Beta']
se = results.se['Beta']

t_test = (beta_coeff - mean_returns) / se
p_value = 1 - (stats.t.cdf(np.abs(t_test), (2 * len(final_df)) - 2))

print('Average Market Return: {}'.format(mean_returns))
print('Beta Coefficient: {}'.format(beta_coeff))
print('T-Value: {}'.format(t_test))
print('P-Value: {}\n'.format(p_value))

Average Market Return: 0.010232557377049176
Beta Coefficient: 0.0005136403805517942
T-Value: -6.010444227744648
P-Value: 2.0924750820938698e-09



Beta Result: Statistically different from average return on the market.  
Variance of Residual: Statistically different from zero.  
Intercept: Statistically different from zero.  

# Reject CAP-M Model