Import libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import math
# from sklearn.linear_model import LinearRegression
# from sklearn.pipeline import make_pipeline

Change the number of rows and columns to display

In [2]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.precision', 4)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

Define a path for import and export

In [5]:
path = '/Users/admin/fin427/module4/'

Import data and create a quarter variable for alignment with quarterly financial information

In [6]:
returns1 = pd.read_excel(path + 'Excel03 Data 20230128.xlsx', sheet_name='ret06')

In [8]:
returns1['quarter'] = returns1['month'] + pd.offsets.QuarterEnd()
print(returns1.head())

   PERMNO       DATE     CUSIP                        COMNAM TICKER  PERMCO  \
0   50906 1995-01-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
1   50906 1995-02-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
2   50906 1995-03-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
3   50906 1995-04-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
4   50906 1995-05-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   

   SHRCD    VOL      RET     RETX  ...  d_4802_Finance_NEC  d_4803_Insurance  \
0     11  34941 -0.02000 -0.02000  ...                   0                 0   
1     11  60361  0.13776  0.13776  ...                   0                 0   
2     11  87371  0.05830  0.05830  ...                   0                 0   
3     11  78824  0.11864  0.11864  ...                   0                 0   
4     11  95578 -0.06061 -0.06061  ...                   0                 0   

   d_4885_Real_Estate_Dev  d_4890_REIT d_490

In [9]:
print(returns1.head())
print(returns1.columns)

   PERMNO       DATE     CUSIP                        COMNAM TICKER  PERMCO  \
0   50906 1995-01-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
1   50906 1995-02-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
2   50906 1995-03-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
3   50906 1995-04-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
4   50906 1995-05-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   

   SHRCD    VOL      RET     RETX  ...  d_4802_Finance_NEC  d_4803_Insurance  \
0     11  34941 -0.02000 -0.02000  ...                   0                 0   
1     11  60361  0.13776  0.13776  ...                   0                 0   
2     11  87371  0.05830  0.05830  ...                   0                 0   
3     11  78824  0.11864  0.11864  ...                   0                 0   
4     11  95578 -0.06061 -0.06061  ...                   0                 0   

   d_4885_Real_Estate_Dev  d_4890_REIT d_490

In [10]:
sales = pd.read_excel(path + 'SP400_Sales_20230131.xlsx', sheet_name='Sales')

In [11]:
sales1 = sales
sales1 = sales1.dropna()
sales1 = sales1[sales1['Sales/Turnover (Net)'] > 0]
sales1['sales'] = sales1['Sales/Turnover (Net)']
sales1['quarter'] = sales1['Data Date'] + pd.offsets.QuarterEnd()
sales1['leadquarter'] = sales1['Data Date'] + pd.offsets.QuarterEnd(1)
sales1['lnsales'] = np.log(sales1['sales']*1000)
univ_sales = sales1.sales.describe()
univ_lnsales = sales1.lnsales.describe()
sales1['cusip8'] = sales1['CUSIP'].str[0:8]
print(univ_sales)
print(univ_lnsales)
print(sales1.head())
print(sales1.columns)
print(sales1)

count   108646.00000
mean      1994.09869
std       5774.02505
min          0.00100
25%        265.24175
50%        733.04850
75%       1889.35075
max     263966.00000
Name: sales, dtype: float64
count   108646.00000
mean        13.40562
std          1.59087
min          0.00000
25%         12.48840
50%         13.50497
75%         14.45174
max         19.39133
Name: lnsales, dtype: float64
    Global Company Key  Data Date  Fiscal Year  Fiscal Quarter  \
0                 1013 1994-01-31         1994         1.00000   
4                 1013 1995-01-31         1995         1.00000   
8                 1013 1996-01-31         1996         1.00000   
9                 1013 1996-04-30         1996         2.00000   
10                1013 1996-07-31         1996         3.00000   

   Industry Format Level of Consolidation - Company Interim Descriptor  \
0             INDL                                                  C    
4             INDL                                           

Run an OLS regression of abnormal returns on characteristics in which the variables are transformed to z-scores. Industry indicator variables are not included.

In [12]:
y1 = returns1['abretadj']
x  = returns1[['lag1lnmc']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     890.2
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          2.57e-195
Time:                        14:44:18   Log-Likelihood:             2.3077e+05
No. Observations:              309269   AIC:                        -4.615e+05
Df Residuals:                  309267   BIC:                        -4.615e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0041      0.000     20.097      0.0

Run an LASSO regression of abnormal returns on characteristics in which the variables are transformed to z-scores. Industry indicator variables are not included.

In [13]:
y1 = returns1['abretadj']
x  = returns1[['lag1lnmc']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
lasso = Lasso(alpha = 0.001)
lasso.fit(x1, y1)
lasso_coef = lasso.fit(x1, y1).coef_
lasso_score = lasso.score(x1, y1)
print(f'Lasso score: {lasso_score: .4f}')
print(lasso_coef)
# print(pd.Series(lasso_coef, index = x.columns)) #This only works if the constant is excluded.

Lasso score:  0.0028
[ 0.         -0.00515549]


Run an OLS regression of abnormal returns on characteristics in which the variables are transformed to z-scores. Industry indicator variables are included. There is an intercept and one industry (6000 Miscellaneous) is omitted. The intercept is the baseline monthly return on 6000 Miscellaneous before consideration of relative market capitalisation, and the coeffficients on other industries are incremental returns associated with those industries.

In [28]:
y1 = returns1['abretadj']
x  = returns1[['lag1lnmc', 
    'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables', 
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services', 
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)
# print(pd.Series(ols_coef, index = x.columns))

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     45.21
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          1.84e-213
Time:                        15:27:07   Log-Likelihood:             2.3087e+05
No. Observations:              309269   AIC:                        -4.617e+05
Df Residuals:                  309244   BIC:                        -4.614e+05
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0041      0.000     20.103      0.0

Run a LASSO regression of abnormal returns on characteristics in which the variables are transformed to z-scores. Industry indicator variables are included. There is an intercept and one industry (6000 Miscellaneous) is omitted. The intercept is the baseline monthly return on 6000 Miscellaneous before consideration of relative market capitalisation, and the coeffficients on other industries are incremental returns associated with those industries.

In [29]:
y1 = returns1['abretadj']
x = returns1[['lag1lnmc', 
    'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables', 
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services', 
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
lasso = Lasso(alpha = 0.001)
lasso.fit(x1, y1)
lasso_coef = lasso.fit(x1, y1).coef_
lasso_score = lasso.score(x1, y1)
print(f'Lasso score: {lasso_score: .4f}')
print(lasso_coef)
print(x.columns)
# print(pd.Series(lasso_coef, index = x.columns)) #This only works if the constant is excluded.

Lasso score:  0.0029
[ 0.         -0.00514731 -0.         -0.          0.00014792 -0.
 -0.         -0.          0.00047536 -0.          0.         -0.
 -0.          0.0002078   0.         -0.          0.         -0.
 -0.         -0.          0.         -0.         -0.         -0.
 -0.        ]
Index(['lag1lnmc', 'd_1100_Non_Energy_Minerals',
       'd_1200_Producer_Manufacturing', 'd_1300_Electronic_Technology',
       'd_1400_Consumer_Durables', 'd_2100_Energy_Minerals',
       'd_2200_Process_Industries', 'd_2300_Health_Technology',
       'd_2400_Consumer_Non_Durables', 'd_3100_Industrial_Services',
       'd_3200_Commercial_Services', 'd_3250_Distribution_Services',
       'd_3300_Technology_Services', 'd_3350_Health_Services',
       'd_3400_Consumer_Services', 'd_3500_Retail_Trade',
       'd_4600_Transportation', 'd_4700_Utilities', 'd_4801_Banks',
       'd_4802_Finance_NEC', 'd_4803_Insurance', 'd_4885_Real_Estate_Dev',
       'd_4890_REIT', 'd_4900_Communications'],
      dty

Run an OLS regression of abnormal returns on characteristics in which the variables are transformed to z-scores, and do the same for untransformed variables. Including only variables identified as material by the LASSO regression. Industry indicator variables are not included.

In [16]:
y1 = returns1['abretadj']
x  = returns1[['lag1lnmc']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

x = sm.add_constant(x)
model2 = sm.OLS(y1, x).fit()
print_model2 = model2.summary()
ols_coef2 = model2.params
ols_rsq2  = model2.rsquared
print(print_model2)
print(f'R-squared: {model2.rsquared:.4f}')
print(ols_coef2)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     890.2
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          2.57e-195
Time:                        14:44:34   Log-Likelihood:             2.3077e+05
No. Observations:              309269   AIC:                        -4.615e+05
Df Residuals:                  309267   BIC:                        -4.615e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0041      0.000     20.097      0.0

Run an OLS regression of abnormal returns on characteristics in which the variables are transformed to z-scores, and do the same for untransformed variables. Including only variables identified as material by the LASSO regression. Industry indicator variables are included.

In [17]:
y1 = returns1['abretadj']
x  = returns1[['lag1lnmc', 'd_1300_Electronic_Technology', 'd_2300_Health_Technology', 'd_3300_Technology_Services']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

x = sm.add_constant(x)
model2 = sm.OLS(y1, x).fit()
print_model2 = model2.summary()
ols_coef2 = model2.params
ols_rsq2  = model2.rsquared
print(print_model2)
print(f'R-squared: {model2.rsquared:.4f}')
print(ols_coef2)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     255.2
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          2.70e-219
Time:                        14:44:37   Log-Likelihood:             2.3084e+05
No. Observations:              309269   AIC:                        -4.617e+05
Df Residuals:                  309264   BIC:                        -4.616e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0041      0.000     20.101      0.0

Merge returns1 and sales1 dataframes

In [18]:
combined1 = returns1.merge(sales1, how='inner', left_on=['CUSIP', 'quarter'], right_on=['cusip8', 'leadquarter'])
combined1.rename(columns={'sales': 'lag1sales', 'lnsales': 'lag1lnsales'}, inplace=True)
print(combined1.head())
print(combined1.columns)
print(len(returns1.index))
print(len(sales1.index))
print(len(combined1.index))
print(combined1.lag1sales.describe(percentiles=[0.125, 0.875]))
print(combined1.lag1mc.describe(percentiles=[0.125, 0.875]))
print(combined1.lag1lnsales.describe(percentiles=[0.125, 0.875]))
print(combined1.lag1lnmc.describe(percentiles=[0.125, 0.875]))
print(combined1.retadj.describe(percentiles=[0.125, 0.875]))
print(combined1.bmret.describe(percentiles=[0.125, 0.875]))
print(combined1.abretadj.describe(percentiles=[0.125, 0.875]))

   PERMNO       DATE   CUSIP_x                        COMNAM TICKER  PERMCO  \
0   50906 1995-01-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
1   50906 1995-02-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
2   50906 1995-12-29  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
3   50906 1996-01-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
4   50906 1996-02-29  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   

   SHRCD     VOL      RET     RETX  ...  ISO Currency Code  \
0     11   34941 -0.02000 -0.02000  ...                USD   
1     11   60361  0.13776  0.13776  ...                USD   
2     11  100083 -0.19780 -0.19780  ...                USD   
3     11  196778  0.05822  0.05822  ...                USD   
4     11  144471  0.02913  0.02913  ...                USD   

   Calendar Data Year and Quarter  Fiscal Data Year and Quarter  \
0                          1994Q4                        1995Q1   
1                 

Compute the correlation between lnsales and lag1lnmc

In [19]:
dfcorr = combined1[['lag1lnsales', 'lag1lnmc']].copy()
dfcorr.corr(method='pearson')

Unnamed: 0,lag1lnsales,lag1lnmc
lag1lnsales,1.0,0.6323
lag1lnmc,0.6323,1.0


Run an OLS regression with two size variables, lag1lnmc and lag1lnsales. Run the regression again, with just lag1mc or lag1sales, and compare the results. Show how the positive correlation between market cap and sales leads to one coefficient being positive and another coefficient being negative. Repeat using z-scores.

In [20]:
y1 = combined1['abretadj']
# x  = combined1[['lag1lnmc']]
# x  = combined1[['lag1lnsales']]
x  = combined1[['lag1lnmc', 'lag1lnsales']]
x1 = StandardScaler().fit_transform(x)
x = sm.add_constant(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     446.5
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          2.53e-194
Time:                        14:44:41   Log-Likelihood:             2.2094e+05
No. Observations:              290802   AIC:                        -4.419e+05
Df Residuals:                  290799   BIC:                        -4.418e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const           0.0730      0.002     30.120      

Now incorporate industry indicator variables, and run the regressions again with two size variables.

In [21]:
y1 = combined1['abretadj']
x  = combined1[['lag1lnmc', 'lag1lnsales', 
    'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables', 
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services', 
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
# x  = combined1[['lag1lnmc']]
# x  = combined1[['lag1lnsales']]
x1 = StandardScaler().fit_transform(x)
x = sm.add_constant(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     43.92
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          2.50e-215
Time:                        14:44:43   Log-Likelihood:             2.2104e+05
No. Observations:              290802   AIC:                        -4.420e+05
Df Residuals:                  290776   BIC:                        -4.418e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

On the combined dataset, with two measures of size, run a LASSO regression of abnormal returns on characteristics in which the variables are transformed to z-scores. Industry indicator variables are excluded. There is an intercept and one industry (6000 Miscellaneous) is omitted. The intercept is the baseline monthly return on 6000 Miscellaneous before consideration of relative market capitalisation, and the coeffficients on other industries are incremental returns associated with those industries.

In [22]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc']]
# x = combined1[['lag1lnmc', 'lag1lnsales']]
# x = combined1[['lag1lnmc', 'lag1lnsales']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
lasso = Lasso(alpha = 0.001)
lasso.fit(x1, y1)
lasso_coef = lasso.fit(x1, y1).coef_
lasso_score = lasso.score(x1, y1)
print(f'Lasso score: {lasso_score: .4f}')
print(lasso_coef)
print(x.columns)
# print(pd.Series(lasso_coef, index = x.columns)) #This only works if the constant is excluded.

Lasso score:  0.0030
[ 0.         -0.00523841]
Index(['lag1lnmc'], dtype='object')


In [23]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc', 'lag1lnsales',
    'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables', 
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services', 
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
lasso = Lasso(alpha = 0.0014)
lasso.fit(x1, y1)
lasso_coef = lasso.fit(x1, y1).coef_
lasso_score = lasso.score(x1, y1)
print(f'Lasso score: {lasso_score: .4f}')
print(lasso_coef)
print(x.columns)
# print(pd.Series(lasso_coef, index = x.columns)) #This only works if the constant is excluded.

Lasso score:  0.0029
[ 0.         -0.00483841 -0.         -0.         -0.          0.
 -0.          0.         -0.          0.         -0.          0.
 -0.         -0.          0.          0.         -0.          0.
 -0.         -0.         -0.          0.         -0.         -0.
 -0.         -0.        ]
Index(['lag1lnmc', 'lag1lnsales', 'd_1100_Non_Energy_Minerals',
       'd_1200_Producer_Manufacturing', 'd_1300_Electronic_Technology',
       'd_1400_Consumer_Durables', 'd_2100_Energy_Minerals',
       'd_2200_Process_Industries', 'd_2300_Health_Technology',
       'd_2400_Consumer_Non_Durables', 'd_3100_Industrial_Services',
       'd_3200_Commercial_Services', 'd_3250_Distribution_Services',
       'd_3300_Technology_Services', 'd_3350_Health_Services',
       'd_3400_Consumer_Services', 'd_3500_Retail_Trade',
       'd_4600_Transportation', 'd_4700_Utilities', 'd_4801_Banks',
       'd_4802_Finance_NEC', 'd_4803_Insurance', 'd_4885_Real_Estate_Dev',
       'd_4890_REIT', 'd_4900_

In [24]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc', 'lag1lnsales']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
lasso = Lasso(alpha = 0.001)
lasso.fit(x1, y1)
lasso_coef = lasso.fit(x1, y1).coef_
lasso_score = lasso.score(x1, y1)
print(f'Lasso score: {lasso_score: .4f}')
print(lasso_coef)
print(x.columns)
# print(pd.Series(lasso_coef, index = x.columns)) #This only works if the constant is excluded.

Lasso score:  0.0030
[ 0.         -0.00523841 -0.        ]
Index(['lag1lnmc', 'lag1lnsales'], dtype='object')


OLS regression incorporating only those industries identified by LASSO as being material.

In [25]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc','lag1lnsales',
   'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables', 
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services', 
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
x1 = StandardScaler().fit_transform(x)
x = sm.add_constant(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     43.92
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          2.50e-215
Time:                        14:44:51   Log-Likelihood:             2.2104e+05
No. Observations:              290802   AIC:                        -4.420e+05
Df Residuals:                  290776   BIC:                        -4.418e+05
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0044      0.000     20.898      0.0

In [26]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc','lag1lnsales']]
x1 = StandardScaler().fit_transform(x)
x = sm.add_constant(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     446.5
Date:                Wed, 01 Feb 2023   Prob (F-statistic):          2.53e-194
Time:                        14:44:52   Log-Likelihood:             2.2094e+05
No. Observations:              290802   AIC:                        -4.419e+05
Df Residuals:                  290799   BIC:                        -4.418e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0044      0.000     20.892      0.0