In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
import math

In [2]:
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_colwidth', 50)
pd.set_option('display.precision', 4)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [3]:
path = '/Users/admin/fin427/project1/'

# Import data and create a quarter variable for alignment with quarterly financial information
returns = pd.read_excel(path + 'Excel03 Data 20230128.xlsx', sheet_name='ret06')
returns1 = returns
returns1['quarter'] = returns1['month'] + pd.offsets.QuarterEnd(0)
print(returns1.head(6))
print(returns1.columns)

   PERMNO       DATE     CUSIP                        COMNAM TICKER  PERMCO  \
0   50906 1995-01-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
1   50906 1995-02-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
2   50906 1995-03-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
3   50906 1995-04-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
4   50906 1995-05-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
5   50906 1995-06-30  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   

   SHRCD     VOL      RET     RETX  ...  d_4802_Finance_NEC  d_4803_Insurance  \
0     11   34941 -0.02000 -0.02000  ...                   0                 0   
1     11   60361  0.13776  0.13776  ...                   0                 0   
2     11   87371  0.05830  0.05830  ...                   0                 0   
3     11   78824  0.11864  0.11864  ...                   0                 0   
4     11   95578 -0.06061 -0.06061  ...  

In [4]:
sales = pd.read_excel(path + 'SP400_Sales_20230131.xlsx', sheet_name='Sales')
sales1 = sales
sales1 = sales1.dropna()
sales1 = sales1[sales1['Sales/Turnover (Net)'] > 0]
sales1['sales'] = sales1['Sales/Turnover (Net)']
sales1['quarter'] = sales1['Data Date'] + pd.offsets.QuarterEnd(0)
sales1['leadquarter'] = sales1['quarter'] + pd.offsets.QuarterEnd(1)
sales1['lnsales'] = np.log(sales1['sales']*1000)
univ_sales = sales1.sales.describe()
univ_lnsales = sales1.lnsales.describe()
sales1['cusip8'] = sales1['CUSIP'].str[0:8]
print(univ_sales)
print(univ_lnsales)
print(sales1.head(50))
print(sales1.columns)
print(sales1)

count   108646.00000
mean      1994.09869
std       5774.02505
min          0.00100
25%        265.24175
50%        733.04850
75%       1889.35075
max     263966.00000
Name: sales, dtype: float64
count   108646.00000
mean        13.40562
std          1.59087
min          0.00000
25%         12.48840
50%         13.50497
75%         14.45174
max         19.39133
Name: lnsales, dtype: float64
    Global Company Key  Data Date  Fiscal Year  Fiscal Quarter  \
0                 1013 1994-01-31         1994         1.00000   
4                 1013 1995-01-31         1995         1.00000   
8                 1013 1996-01-31         1996         1.00000   
9                 1013 1996-04-30         1996         2.00000   
10                1013 1996-07-31         1996         3.00000   
11                1013 1996-10-31         1996         4.00000   
12                1013 1997-01-31         1997         1.00000   
13                1013 1997-04-30         1997         2.00000   
14          

In [19]:
#Create RSI Characteristic 
rollingAmt = 14
returns1['Up'] = np.where(returns1['RET'] >= 0, 1, 0)
returns1['Down'] = np.where(returns1['RET'] < 0, 1, 0)
#returns1["Up"] = pd.to_numeric(returns1["Up"])
returns1['UpDownRatio'] = returns1['Up'].rolling(14).mean() / returns1['Down'].rolling(rollingAmt).mean()
returns1['RSI'] = 100 - 100 / ( 1 + returns1['UpDownRatio'] )
returns1["RSI"] = returns1["RSI"].fillna(50)
returns1['RSI'] = returns1['RSI'].shift(-1)
returns1["RSI"] = returns1["RSI"].fillna(50)

In [20]:
combined1 = returns1.merge(sales1, how='inner', left_on=['CUSIP', 'quarter'], right_on=['cusip8', 'leadquarter'])
combined1.rename(columns={'sales': 'lag1sales', 'lnsales': 'lag1lnsales'}, inplace=True)
print(combined1.head())
print(combined1.columns)
print(len(returns1.index))
print(len(sales1.index))
print(len(combined1.index))
print(combined1.lag1sales.describe(percentiles=[0.125, 0.875]))
print(combined1.lag1mc.describe(percentiles=[0.125, 0.875]))
print(combined1.lag1lnsales.describe(percentiles=[0.125, 0.875]))
print(combined1.lag1lnmc.describe(percentiles=[0.125, 0.875]))
print(combined1.retadj.describe(percentiles=[0.125, 0.875]))
print(combined1.bmret.describe(percentiles=[0.125, 0.875]))
print(combined1.abretadj.describe(percentiles=[0.125, 0.875]))

# Compute the correlation between lnsales and lag1lnmc
dfcorr = combined1[['lag1lnsales', 'lag1lnmc']].copy()
dfcorr.corr(method='pearson')

   PERMNO       DATE   CUSIP_x                        COMNAM TICKER  PERMCO  \
0   50906 1995-04-28  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
1   50906 1995-05-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
2   50906 1995-06-30  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
3   50906 1996-04-30  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   
4   50906 1996-05-31  00088630  A D C TELECOMMUNICATIONS INC   ADCT    2902   

   SHRCD     VOL      RET     RETX  ...  ISO Currency Code  \
0     11   78824  0.11864  0.11864  ...                USD   
1     11   95578 -0.06061 -0.06061  ...                USD   
2     11  119561  0.15323  0.15323  ...                USD   
3     11  126525  0.21739  0.21739  ...                USD   
4     11  133393  0.09524  0.09524  ...                USD   

   Calendar Data Year and Quarter  Fiscal Data Year and Quarter  \
0                          1994Q4                        1995Q1   
1                 

Unnamed: 0,lag1lnsales,lag1lnmc
lag1lnsales,1.0,0.62971
lag1lnmc,0.62971,1.0


In [21]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc', 'lag1lnsales', 'RSI',
    'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables',
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services',
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
x1 = StandardScaler().fit_transform(x)
x = sm.add_constant(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x).fit()
# model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

                            OLS Regression Results                            
Dep. Variable:               abretadj   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     335.3
Date:                Mon, 13 Feb 2023   Prob (F-statistic):               0.00
Time:                        15:49:18   Log-Likelihood:             2.2407e+05
No. Observations:              290255   AIC:                        -4.481e+05
Df Residuals:                  290228   BIC:                        -4.478e+05
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                                    coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------------
const         

In [17]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc', 'lag1lnsales', 'RSI',
    'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables',
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services',
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
x1 = StandardScaler().fit_transform(x)
x1 = sm.add_constant(x1)
lasso = Lasso(alpha = 0.001)
lasso.fit(x1, y1)
lasso_coef = lasso.fit(x1, y1).coef_
lasso_score = lasso.score(x1, y1)
print(f'Lasso score: {lasso_score: .4f}')
print(lasso.intercept_)
print(lasso_coef)
print(x.columns)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [18]:
y1 = combined1['abretadj']
x = combined1[['lag1lnmc','lag1lnsales', 'RSI', 
   'd_1100_Non_Energy_Minerals',
    'd_1200_Producer_Manufacturing',
    'd_1300_Electronic_Technology',
    'd_1400_Consumer_Durables',
    'd_2100_Energy_Minerals',
    'd_2200_Process_Industries',
    'd_2300_Health_Technology',
    'd_2400_Consumer_Non_Durables',
    'd_3100_Industrial_Services',
    'd_3200_Commercial_Services',
    'd_3250_Distribution_Services',
    'd_3300_Technology_Services',
    'd_3350_Health_Services',
    'd_3400_Consumer_Services',
    'd_3500_Retail_Trade',
    'd_4600_Transportation',
    'd_4700_Utilities',
    'd_4801_Banks',
    'd_4802_Finance_NEC',
    'd_4803_Insurance',
    'd_4885_Real_Estate_Dev',
    'd_4890_REIT',
    'd_4900_Communications']]
x1 = StandardScaler().fit_transform(x)
x = sm.add_constant(x)
x1 = sm.add_constant(x1)
model = sm.OLS(y1, x1).fit()
# model = sm.OLS(y1, x1).fit()
print_model = model.summary()
ols_coef = model.params
ols_rsq  = model.rsquared
print(print_model)
print(f'R-squared: {model.rsquared:.4f}')
print(ols_coef)

MissingDataError: exog contains inf or nans