# Lab 3: Model Selection and Best Subset Model
1. Download the dataset `factors` and `SP500`
2. Show summary statistics and graphical representation of the raw data.
3. Split data into training and testing samples. Use the first 80% as the training sample and the last 20% as the testing sample.
4. Evaluate (i.e. fit the model using training data, then calculate the MSE using the testing sample) a CAPM model
5. Evaluate a Fama French 3 Factor Model
6. Evaluate a Cahart 4 Factor Model
7. Compare the OOS MSE of those models
8. Do some self study on CAPM, Fama French 3 Factor and Cahart 4 factor models. 

In [2]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [3]:
factors = pd.read_csv('data/factors.csv')
sp500 = pd.read_csv('data/SP500.csv')

factors.head()
sp500.head()

Unnamed: 0,caldt,vwretd
0,20110103,0.011325
1,20110104,-0.001236
2,20110105,0.005164
3,20110106,-0.001715
4,20110107,-0.001755


In [4]:
factors.describe()

Unnamed: 0,date,mktrf,smb,hml,umd
count,2769.0,2769.0,2769.0,2769.0,2769.0
mean,20160700.0,0.000597,-1.7e-05,-0.000115,8.5e-05
std,31636.13,0.011011,0.005928,0.007497,0.009558
min,20110100.0,-0.12,-0.0357,-0.05,-0.1437
25%,20131000.0,-0.0036,-0.0035,-0.0036,-0.004
50%,20160700.0,0.0009,-0.0001,-0.0003,0.0005
75%,20190400.0,0.0057,0.0035,0.0031,0.0048
max,20211230.0,0.0934,0.055,0.0674,0.0593


In [5]:
sp500.describe()

Unnamed: 0,caldt,vwretd
count,2769.0,2769.0
mean,20160700.0,0.000618
std,31636.13,0.010744
min,20110100.0,-0.11897
25%,20131000.0,-0.003295
50%,20160700.0,0.000778
75%,20190400.0,0.005421
max,20211230.0,0.093205


In [6]:
factors.rename(columns={'date': 'caldt'}, inplace=True)
factors['caldt'] = pd.to_datetime(factors['caldt'], format='%Y%m%d', errors='coerce')
sp500['caldt'] = pd.to_datetime(sp500['caldt'], format='%Y%m%d', errors='coerce')

merged_df = pd.merge(sp500, factors, on='caldt', how='inner')

In [7]:
merged_df.describe()

Unnamed: 0,caldt,vwretd,mktrf,smb,hml,umd
count,2769,2769.0,2769.0,2769.0,2769.0,2769.0
mean,2016-07-03 07:18:23.791982592,0.000618,0.000597,-1.7e-05,-0.000115,8.5e-05
min,2011-01-03 00:00:00,-0.11897,-0.12,-0.0357,-0.05,-0.1437
25%,2013-10-03 00:00:00,-0.003295,-0.0036,-0.0035,-0.0036,-0.004
50%,2016-07-05 00:00:00,0.000778,0.0009,-0.0001,-0.0003,0.0005
75%,2019-04-04 00:00:00,0.005421,0.0057,0.0035,0.0031,0.0048
max,2021-12-31 00:00:00,0.093205,0.0934,0.055,0.0674,0.0593
std,,0.010744,0.011011,0.005928,0.007497,0.009558


In [12]:
X_capm = merged_df[['mktrf']]
X_3f = merged_df[['mktrf', 'smb', 'hml']]
X_Carhart4f = merged_df[['mktrf', 'smb', 'hml', 'umd']]

y = merged_df['vwretd']

X_capm_train, X_capm_test, y_train, y_test = train_test_split(X_capm, y, test_size=0.2, shuffle=False)
X_3f_train, X_3f_test, _, _ = train_test_split(X_3f, y, test_size=0.2, shuffle=False)
X_Carhart4f_train, X_Carhart4f_test, _, _ = train_test_split(X_Carhart4f, y, test_size=0.2, shuffle=False)

print(f'Training set size (CAPM): {X_capm_train.shape[0]}')
print(f'Testing set size (CAPM): {X_capm_test.shape[0]}')

Training set size (CAPM): 2215
Testing set size (CAPM): 554


In [13]:
# Add the constant for OLS
X_capm_train = sm.add_constant(X_capm_train)
X_capm_test = sm.add_constant(X_capm_test)

X_3f_train = sm.add_constant(X_3f_train)
X_3f_test = sm.add_constant(X_3f_test)

X_Carhart4f_train = sm.add_constant(X_Carhart4f_train)
X_Carhart4f_test = sm.add_constant(X_Carhart4f_test)

# fit the models
capm = sm.OLS(y_train, X_capm_train).fit()
f3 = sm.OLS(y_train, X_3f_train).fit()
f4 = sm.OLS(y_train, X_Carhart4f_train).fit()

print('CAPM Model Summary:')
print(capm.summary())
print('\nFama-French 3 Factor Model Summary:')
print(f3.summary())
print('\nCarhart 4-Factor Model Summary:')
print(f4.summary())

CAPM Model Summary:
                            OLS Regression Results                            
Dep. Variable:                 vwretd   R-squared:                       0.993
Model:                            OLS   Adj. R-squared:                  0.993
Method:                 Least Squares   F-statistic:                 3.084e+05
Date:                Fri, 07 Mar 2025   Prob (F-statistic):               0.00
Time:                        15:05:05   Log-Likelihood:                 12738.
No. Observations:                2215   AIC:                        -2.547e+04
Df Residuals:                    2213   BIC:                        -2.546e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       4.464e-05   1.64e-05

In [15]:
# Evaluate the models using MSE

capm_pred = capm.predict(X_capm_test)
f3_pred = f3.predict(X_3f_test)
f4_pred = f4.predict(X_Carhart4f_test)

capm_mse = mean_squared_error(y_test, capm_pred)
f3_mse = mean_squared_error(y_test, f3_pred)
f4_mse = mean_squared_error(y_test, f4_pred)

mse_results = pd.DataFrame({
    'Model': ['CAPM', 'Fama-French 3 Factor', 'Carhart 4 Factor'],
    'OOS MSE': [capm_mse, f3_mse, f4_mse]
})

print(mse_results)

                  Model   OOS MSE
0                  CAPM  0.000002
1  Fama-French 3 Factor  0.000001
2      Carhart 4 Factor  0.000001
