In [1]:
#Import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('us2022q2a.csv')
data

Unnamed: 0,firm,q,revenue,cogs,sgae,otheropexp,extraincome,finexp,incometax,totalassets,totalliabilities,shortdebt,longdebt,stockholderequity,adjprice,originalprice,sharesoutstanding,fiscalmonth,year,cto
0,A,2000q1,,,,,,,,,,,,,63.761670,104.0000,452000.000,,2000.0,1.0
1,A,2000q2,2485000.0,1261000.0,1.010000e+06,0.0,42000.000000,0.000,90000.0,7321000.000,2679000.000,512000.000,0.000,4642000.000,45.215607,73.7500,452271.967,6.0,2000.0,2.0
2,A,2000q3,2670000.0,1369000.0,1.091000e+06,0.0,28000.000000,0.000,83000.0,7827000.000,2925000.000,528000.000,0.000,4902000.000,30.003238,48.9375,453014.579,9.0,2000.0,3.0
3,A,2000q4,3372000.0,1732000.0,1.182000e+06,0.0,10000.000000,0.000,163000.0,8425000.000,3160000.000,830000.000,0.000,5265000.000,33.566841,54.7500,456366.381,12.0,2000.0,4.0
4,A,2001q1,2841000.0,1449000.0,1.113000e+06,0.0,-6000.000000,0.000,119000.0,9208000.000,3667000.000,556000.000,0.000,5541000.000,18.840347,30.7300,456769.737,3.0,2001.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324442,ZYNE,2021q3,0.0,0.0,1.021065e+04,0.0,-376.636750,-5.038,0.0,89996.170,11309.171,207.635,411.237,78686.999,4.240000,4.2400,41251.537,9.0,2021.0,3.0
324443,ZYNE,2021q4,0.0,0.0,8.836436e+03,0.0,16.937906,-4.433,0.0,81171.507,10258.173,209.068,353.694,70913.334,2.880000,2.8800,41217.537,12.0,2021.0,4.0
324444,ZYNE,2022q1,0.0,0.0,8.903915e+03,0.0,317.252110,-96.044,0.0,74381.029,9214.059,210.512,295.754,65166.970,2.050000,2.0500,42447.037,3.0,2022.0,1.0
324445,ZYNE,2022q2,0.0,0.0,9.168770e+03,0.0,-775.927860,-91.691,0.0,67006.959,9765.086,211.965,237.414,57241.873,1.140000,1.1400,43595.959,6.0,2022.0,2.0


In [3]:
name_columns = ["firm","Name","N","Class","Country of Origin","Type of Asset","Sector NAICS level 1","Exchange / Src","Sector Economatica","Sector NAICS last available","partind"]
df_firms = pd.read_csv('usfirms2022.csv', names = name_columns)
df_firms = df_firms.drop(["N","Class","Country of Origin","Type of Asset","Exchange / Src","Sector Economatica","Sector NAICS last available","partind"], axis=1)
df_firms

Unnamed: 0,firm,Name,Sector NAICS level 1
0,Ticker,Name,Sector NAICS\nlevel 1
1,FLWS,1 800 Flowers Com Inc,Retail Trade
2,TXG,"10x Genomics, Inc",Manufacturing
3,GOED,1847 Goedeker Inc,Retail Trade
4,ONEM,"1life Healthcare, Inc",Health Care and Social Assistance
...,...,...,...
3604,ZUO,"Zuora, Inc",Information
3605,ZWS,Zurn Elkay Water Solutions Corp,Manufacturing
3606,ZY,Zymergen Inc,"Professional, Scientific, and Technical Services"
3607,ZYNE,"Zynerba Pharmaceuticals, Inc",Manufacturing


In [4]:
#merge both data frames
data = data.merge(df_firms, how='left', on='firm')

In [None]:
#change the format of the date column 
data['q'] = pd.PeriodIndex(data['q'],freq = "q")

In [None]:
#setting the multindex
data.set_index(['firm','q'],inplace=True)

In [5]:
data ["Market Value"] = data["originalprice"]*data["sharesoutstanding"]
data["Book Value"] = data["totalassets"]-data["totalliabilities"]
data["Ebit"] = data["revenue"] - data["cogs"] - data["sgae"]
data["originalprice"] = data["originalprice"].replace(0,np.nan)
data["totalassets"] = data["totalassets"].replace(0,np.nan)
data["OPM_sum"] = data["Ebit"] + data["revenue"]
data["Op_profitmargin"] = data["OPM_sum"]/data["revenue"]  #Porcentaje
data["Op_profitmargin"] = data["Op_profitmargin"].replace([np.inf, -np.inf], np.nan)  #Porcentaje
data["net_income"] = data["Ebit"] - data["otheropexp"] - data["incometax"] - data["finexp"] + data["extraincome"]
#pd.options.display.float_format = "{:,.2f}".format
data["Profit margin"] = (data["net_income"] / data["revenue"])  #Porcentaje
data["Profit margin"] = data["Profit margin"].replace([np.inf, -np.inf], np.nan)


In [6]:
#Get stock returns (annual)
data["r"] = np.log(data["adjprice"]) - np.log(data["adjprice"].shift(4))
data[["adjprice","r"]]

Unnamed: 0,adjprice,r
0,63.761670,
1,45.215607,
2,30.003238,
3,33.566841,
4,18.840347,-1.219152
...,...,...
324442,4.240000,0.247615
324443,2.880000,-0.136132
324444,2.050000,-0.819027
324445,1.140000,-1.534790


In [7]:
data["F1_ret"] = data["r"].shift(-1)
data[["r","F1_ret"]]

Unnamed: 0,r,F1_ret
0,,
1,,
2,,
3,,-1.219152
4,-1.219152,-0.819441
...,...,...
324442,0.247615,-0.136132
324443,-0.136132,-0.819027
324444,-0.819027,-1.534790
324445,-1.534790,


In [8]:
#Earning per share:
data["EPSP"] = (data['net_income'] / data["sharesoutstanding"])/data["originalprice"]

#Book to Market Ratio
data["SAG"] = (data["revenue"] / (data["revenue"].shift(-4)))-1
data["SAG"] = data["SAG"].replace([np.inf, -np.inf], 0)

#Book to market ratio
data["B_MRatio"] = (data["totalassets"] - data["totalliabilities"]) / (data["originalprice"] * data["sharesoutstanding"])

#Short financial leverage
data["SFL"] = data["shortdebt"] / data["totalassets"]

#Long financial leverage
data["LFL"] = data["longdebt"] / data["totalassets"]

#To classify by size
data.sort_values("Market Value").groupby("q")
data["size"] = pd.cut(data["Market Value"], bins=3, labels = ["small","medium","big"])

d = pd.get_dummies(data["size"],drop_first=-1)
data[["Medium","Big"]] = d[["medium","big"]]

data = data.replace(np.nan, 0)
data

Unnamed: 0,firm,q,revenue,cogs,sgae,otheropexp,extraincome,finexp,incometax,totalassets,...,r,F1_ret,EPSP,SAG,B_MRatio,SFL,LFL,size,Medium,Big
0,A,2000q1,0.0,0.0,0.000000e+00,0.0,0.000000,0.000,0.0,0.000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,small,0,0
1,A,2000q2,2485000.0,1261000.0,1.010000e+06,0.0,42000.000000,0.000,90000.0,7321000.000,...,0.000000,0.000000,0.004977,0.043241,0.139169,0.069936,0.000000,small,0,0
2,A,2000q3,2670000.0,1369000.0,1.091000e+06,0.0,28000.000000,0.000,83000.0,7827000.000,...,0.000000,0.000000,0.006992,0.478405,0.221116,0.067459,0.000000,small,0,0
3,A,2000q4,3372000.0,1732000.0,1.182000e+06,0.0,10000.000000,0.000,163000.0,8425000.000,...,0.000000,-1.219152,0.012207,1.031325,0.210718,0.098516,0.000000,small,0,0
4,A,2001q1,2841000.0,1449000.0,1.113000e+06,0.0,-6000.000000,0.000,119000.0,9208000.000,...,-1.219152,-0.819441,0.010971,0.992286,0.394756,0.060382,0.000000,small,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324442,ZYNE,2021q3,0.0,0.0,1.021065e+04,0.0,-376.636750,-5.038,0.0,89996.170,...,0.247615,-0.136132,-0.060502,0.000000,0.449880,0.002307,0.004569,small,0,0
324443,ZYNE,2021q4,0.0,0.0,8.836436e+03,0.0,16.937906,-4.433,0.0,81171.507,...,-0.136132,-0.819027,-0.074259,0.000000,0.597384,0.002576,0.004357,small,0,0
324444,ZYNE,2022q1,0.0,0.0,8.903915e+03,0.0,317.252110,-96.044,0.0,74381.029,...,-0.819027,-1.534790,-0.097575,0.000000,0.748904,0.002830,0.003976,small,0,0
324445,ZYNE,2022q2,0.0,0.0,9.168770e+03,0.0,-775.927860,-91.691,0.0,67006.959,...,-1.534790,0.000000,-0.198252,0.000000,1.151762,0.003163,0.003543,small,0,0


In [None]:
#from scipy.stats.mstats import winsorize

#winsorize(data["SAG"], limits=[0.025, 0.025], inplace=True,nan_policy='propagate')
#winsorize(data["Op_profitmargin"], limits=[0.025, 0.025], inplace=True,nan_policy='propagate')
#winsorize(data["B_MRatio"], limits=[0.025, 0.025], inplace=True,nan_policy='propagate')
#winsorize(data["SFL"], limits=[0.025, 0.025], inplace=True,nan_policy='propagate')
#winsorize(data["LFL"], limits=[0.025, 0.025], inplace=True,nan_policy='propagate')
#winsorize(data["EPSP"], limits=[0.025, 0.025], inplace=True,nan_policy='propagate')
#winsorize(data["F1_ret"], limits=[0.025, 0.025], inplace=True,nan_policy='propagate')

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

x = data[["Op_profitmargin","B_MRatio","SFL","LFL","EPSP","Medium","Big"]]
vif_data = pd.DataFrame()
vif_data["feature"] = x.columns



vif_data["VIF"] = [variance_inflation_factor(x.values,i)for i in range (len(x.columns))]
vif_data

Unnamed: 0,feature,VIF
0,Op_profitmargin,1.0
1,B_MRatio,95.027257
2,SFL,1.001169
3,LFL,1.001172
4,EPSP,95.027254
5,Medium,1.000001
6,Big,1.000001


In [10]:
import statsmodels.formula.api as sm
result = sm.ols(formula="F1_ret ~ B_MRatio + Op_profitmargin + SFL + EPSP + Medium + Big", data=data).fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 F1_ret   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                  0.000
Method:                 Least Squares   F-statistic:                     2.839
Date:                Fri, 14 Oct 2022   Prob (F-statistic):            0.00916
Time:                        13:37:56   Log-Likelihood:            -1.5860e+05
No. Observations:              324447   AIC:                         3.172e+05
Df Residuals:                  324440   BIC:                         3.173e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0114      0.001     

GET THE COVARIANCE MATRIZ

In [None]:
x = data[["B_MRatio","Op_profitmargin","SFL","EPSP","Medium","Big"]]
x

In [None]:
x = x.to_numpy()
print(x.shape)
x

In [None]:
ones = np.ones([x.shape[0],1])
print(ones.shape)
ones

In [None]:
a = np.dot(x.transpose(),ones)
a

In [None]:
b = np.dot(a,a.transpose()) / x.shape[0]

In [None]:
xp_x = np.dot(x.transpose(),x)

In [None]:
c = xp_x - b

In [None]:
d = 1 / (x.shape[0] - 1)

In [None]:
e = d*c
e

In [None]:
x = data[["B_MRatio","Op_profitmargin","SFL","EPSP","Medium","Big"]]
x.cov()

Leverage

In [11]:
#dataI = data.loc[(data["Sector NAICS level 1"] == "Wholesale Trade") | (data["Sector NAICS level 1"] == "Retail Trade")]
#dataI 

Unnamed: 0,firm,q,revenue,cogs,sgae,otheropexp,extraincome,finexp,incometax,totalassets,...,r,F1_ret,EPSP,SAG,B_MRatio,SFL,LFL,size,Medium,Big
630,AAP,2000q1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0,0
631,AAP,2000q2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0,0
632,AAP,2000q3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0,0
633,AAP,2000q4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0,0
634,AAP,2001q1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323901,ZUMZ,2021q2,279069.0,175900.0,68889.0,0.0,254.0,-975.0,9124.0,1017804.0,...,0.581803,0.357106,0.020891,0.264552,0.460630,0.066253,0.234380,small,0,0
323902,ZUMZ,2021q3,268666.0,163701.0,73011.0,0.0,-151.0,-965.0,8770.0,1030742.0,...,0.357106,0.266038,0.023985,0.000000,0.593977,0.063880,0.216391,small,0,0
323903,ZUMZ,2021q4,289455.0,174791.0,74822.0,0.0,468.0,-893.0,10501.0,978189.0,...,0.266038,-0.115775,0.027842,0.000000,0.482215,0.065825,0.225441,small,0,0
323904,ZUMZ,2022q1,346677.0,212744.0,82198.0,0.0,-1462.0,-759.0,12828.0,862012.0,...,-0.115775,-0.633520,0.050187,0.000000,0.615212,0.073754,0.237014,small,0,0


In [19]:
df_mask = data['q'] == '2022q1'
dataSTL = data[df_mask]
#PREGUNTAR ESTOOOOOO:
#dataSTL = dataI.loc[(dataI.shape[0]-2)]
#dataL = dataI.groupby('firm').last() #penultimo
dataSTL

Unnamed: 0,firm,q,revenue,cogs,sgae,otheropexp,extraincome,finexp,incometax,totalassets,...,r,F1_ret,EPSP,SAG,B_MRatio,SFL,LFL,size,Medium,Big
88,A,2022q1,1674000.0,764000.0,5.340000e+05,0.0,-37000.00000,20000.000,36000.0,1.032700e+07,...,0.045405,-0.213296,0.007126,0.000000,0.129778,0.000000,0.264356,small,0,0
178,AA,2022q1,3293000.0,2181000.0,2.130000e+05,125000.0,-70000.00000,25000.000,210000.0,1.598800e+07,...,1.022496,0.217886,0.028098,0.000000,0.374854,0.000063,0.108019,small,0,0
268,AAIC,2022q1,8470.0,4773.0,0.000000e+00,0.0,-4111.00000,0.000,2287.0,9.208830e+05,...,-0.152090,-0.222528,-0.022229,-0.998389,1.794935,0.000000,0.178003,small,0,0
358,AAL,2022q1,8899000.0,0.0,1.062200e+07,0.0,92000.00000,455000.000,-451000.0,6.740100e+07,...,-0.269713,-0.514447,-0.138008,0.000000,-0.754610,0.035341,0.526120,small,0,0
448,AAME,2022q1,51608.0,0.0,4.781200e+04,0.0,0.00000,0.000,954.0,3.750310e+05,...,-0.156671,-0.475675,0.044556,0.000000,1.982019,0.000000,0.089961,small,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324084,ZVIA,2022q1,38034.0,23413.0,2.327500e+04,8901.0,6669.00000,0.000,12.0,1.164800e+05,...,0.000000,0.000000,-0.035760,0.000000,0.307988,0.005220,0.004155,small,0,0
324174,ZVO,2022q1,61633.0,39829.0,2.903600e+04,0.0,-127.00000,0.000,78.0,1.487510e+05,...,-1.599512,-0.982014,-0.266288,0.000000,0.403424,0.000000,0.000000,small,0,0
324264,ZWS,2022q1,239600.0,137700.0,5.690000e+04,1100.0,1100.00000,4800.000,10000.0,1.118600e+06,...,0.416636,0.093173,0.006782,0.000000,0.043502,0.005006,0.483014,small,0,0
324354,ZY,2022q1,4791.0,12455.0,5.608200e+04,-130.0,-532.00000,7994.000,-26.0,6.181890e+05,...,0.000000,-3.482115,-0.241984,0.000000,1.154503,0.093572,0.293062,small,0,0


In [20]:
x = dataSTL[["B_MRatio","Op_profitmargin","SFL","EPSP","Medium","Big"]]
ones = np.ones(x.shape[0])
x.insert(0,'ones',ones, True)
x_t = x.transpose()
print(x.shape)
print(x_t.shape)

(3607, 7)
(7, 3607)


In [21]:
x

Unnamed: 0,ones,B_MRatio,Op_profitmargin,SFL,EPSP,Medium,Big
88,1.0,0.129778,1.224612,0.000000,0.007126,0,0
178,1.0,0.374854,1.273003,0.000063,0.028098,0,0
268,1.0,1.794935,1.436482,0.000000,-0.022229,0,0
358,1.0,-0.754610,0.806383,0.035341,-0.138008,0,0
448,1.0,1.982019,1.073554,0.000000,0.044556,0,0
...,...,...,...,...,...,...,...
324084,1.0,0.307988,0.772467,0.005220,-0.035760,0,0
324174,1.0,0.403424,0.882660,0.000000,-0.266288,0,0
324264,1.0,0.043502,1.187813,0.005006,0.006782,0,0
324354,1.0,1.154503,-12.305364,0.093572,-0.241984,0,0


In [22]:
x_t

Unnamed: 0,88,178,268,358,448,538,628,718,808,898,...,323634,323724,323814,323904,323994,324084,324174,324264,324354,324444
ones,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
B_MRatio,0.129778,0.374854,1.794935,-0.75461,1.982019,2.386108,0.164018,0.0,0.023653,0.525842,...,0.150155,0.015845,0.052332,0.615212,0.088976,0.307988,0.403424,0.043502,1.154503,0.748904
Op_profitmargin,1.224612,1.273003,1.436482,0.806383,1.073554,0.727442,1.125884,0.0,1.308179,1.279314,...,0.0,0.67158,1.397281,1.149231,0.623032,0.772467,0.88266,1.187813,-12.305364,0.0
SFL,0.0,6.3e-05,0.0,0.035341,0.0,0.119301,0.0,0.0,0.047504,0.0,...,0.0,0.008119,0.097403,0.073754,0.029738,0.00522,0.0,0.005006,0.093572,0.00283
EPSP,0.007126,0.028098,-0.022229,-0.138008,0.044556,-0.159972,0.006169,0.0,0.008777,0.004656,...,-0.028102,-0.00295,0.006685,0.050187,-0.018348,-0.03576,-0.266288,0.006782,-0.241984,-0.097575
Medium,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Big,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
w =np.dot(x_t,x)
w

array([[ 3.60700000e+03,  1.89616594e+03, -2.80119242e+04,
         1.09668438e+02, -7.94787650e+01,  3.00000000e+00,
         2.00000000e+00],
       [ 1.89616594e+03,  2.26432646e+03, -7.94883944e+03,
         3.78495629e+01, -5.12307419e+01,  2.50326854e-01,
         9.41410139e-02],
       [-2.80119242e+04, -7.94883944e+03,  7.91988117e+07,
        -5.60219209e+03,  2.40469376e+03,  3.52119781e+00,
         2.72073940e+00],
       [ 1.09668438e+02,  3.78495629e+01, -5.60219209e+03,
         4.51414903e+01, -5.74456099e+00,  2.51218995e-02,
         5.25797812e-02],
       [-7.94787650e+01, -5.12307419e+01,  2.40469376e+03,
        -5.74456099e+00,  5.42586773e+01,  9.60232888e-03,
         1.60141636e-02],
       [ 3.00000000e+00,  2.50326854e-01,  3.52119781e+00,
         2.51218995e-02,  9.60232888e-03,  3.00000000e+00,
         0.00000000e+00],
       [ 2.00000000e+00,  9.41410139e-02,  2.72073940e+00,
         5.25797812e-02,  1.60141636e-02,  0.00000000e+00,
         2.0000000

In [24]:
np.linalg.det(w)

4.642154161407123e+18

In [25]:
a = np.linalg.inv(w)
print(a.shape)

(7, 7)


In [26]:
b = np.dot(x,a)
print(b.shape)

(3607, 7)


In [27]:
hat = np.dot(b,x_t)
hat

array([[ 4.44967268e-04,  3.74948895e-04, -1.06468766e-04, ...,
         4.68522524e-04, -7.60327956e-05,  1.95959656e-04],
       [ 3.74948895e-04,  3.62995800e-04,  1.41714866e-04, ...,
         3.81836335e-04, -4.04403463e-05,  2.03019092e-04],
       [-1.06468766e-04,  1.41714866e-04,  1.55240427e-03, ...,
        -1.93917485e-04,  8.54824662e-04,  4.94618409e-04],
       ...,
       [ 4.68522524e-04,  3.81836335e-04, -1.93917485e-04, ...,
         4.98295187e-04, -1.09615125e-04,  1.79723862e-04],
       [-7.60327956e-05, -4.04403463e-05,  8.54824662e-04, ...,
        -1.09615125e-04,  1.55693484e-03,  6.50401056e-04],
       [ 1.95959656e-04,  2.03019092e-04,  4.94618409e-04, ...,
         1.79723862e-04,  6.50401056e-04,  4.41601683e-04]])

In [28]:
hat.diagonal().sum()

7.0

In [44]:
hat_d = hat.diagonal()
hat_d

array([0.00044497, 0.000363  , 0.0015524 , ..., 0.0004983 , 0.00155693,
       0.0004416 ])

In [37]:
hat_mean = (hat_d.sum())/hat.shape[0]
hat_mean

0.0019406709176601053

In [42]:
mask = hat_d > (3*hat_mean)
high_leverage = hat_d[mask]
high_leverage

array([0.50001108, 0.04629149, 0.01564205, 0.00796085, 0.02016852,
       0.01118322, 0.006675  , 0.01118763, 0.00750478, 0.02207944,
       0.00679319, 0.3333358 , 0.00732216, 0.0077295 , 0.04998853,
       0.01628761, 0.01389353, 0.00696007, 0.00937006, 0.00805004,
       0.21223925, 0.03493159, 0.01295984, 0.0078905 , 0.02217546,
       0.01101327, 0.22059799, 0.00732819, 0.01123522, 0.01036875,
       0.01005652, 0.27075579, 0.00705369, 0.06557237, 0.0076362 ,
       0.00652244, 0.00923787, 0.05234085, 0.04389426, 0.33333766,
       0.00986641, 0.00817756, 0.00923692, 0.00756448, 0.00975682,
       0.59694379, 0.03399465, 0.00703736, 0.01263821, 0.04365969,
       0.0075326 , 0.01158709, 0.50001108, 0.00709522, 0.0059081 ,
       0.01508059, 0.18601671, 0.02720846, 0.01205781, 0.01668877,
       0.00838278, 0.00633862, 0.01191185, 0.02340595, 0.00867054,
       0.03308635, 0.0375898 , 0.00684746, 0.00960248, 0.00626869,
       0.00861815, 0.09058254, 0.00918054, 0.33334166, 0.01304

In [41]:
high_leverage.shape

(84,)

In [32]:
y = dataSTL[["F1_ret"]]
x.insert(7,'F1_ret',y, True)
x

Unnamed: 0,ones,B_MRatio,Op_profitmargin,SFL,EPSP,Medium,Big,F1_ret
88,1.0,0.129778,1.224612,0.000000,0.007126,0,0,-0.213296
178,1.0,0.374854,1.273003,0.000063,0.028098,0,0,0.217886
268,1.0,1.794935,1.436482,0.000000,-0.022229,0,0,-0.222528
358,1.0,-0.754610,0.806383,0.035341,-0.138008,0,0,-0.514447
448,1.0,1.982019,1.073554,0.000000,0.044556,0,0,-0.475675
...,...,...,...,...,...,...,...,...
324084,1.0,0.307988,0.772467,0.005220,-0.035760,0,0,0.000000
324174,1.0,0.403424,0.882660,0.000000,-0.266288,0,0,-0.982014
324264,1.0,0.043502,1.187813,0.005006,0.006782,0,0,0.093173
324354,1.0,1.154503,-12.305364,0.093572,-0.241984,0,0,-3.482115


In [33]:
import statsmodels.formula.api as sm
result = sm.ols(formula="F1_ret ~ B_MRatio + Op_profitmargin + SFL + EPSP + Medium + Big", data=x).fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                 F1_ret   R-squared:                       0.212
Model:                            OLS   Adj. R-squared:                  0.211
Method:                 Least Squares   F-statistic:                     161.4
Date:                Fri, 14 Oct 2022   Prob (F-statistic):          4.75e-182
Time:                        13:50:49   Log-Likelihood:                -3302.1
No. Observations:                3607   AIC:                             6618.
Df Residuals:                    3600   BIC:                             6661.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept          -0.3405      0.014    -

In [35]:
from statsmodels.stats.outliers_influence import OLSInfluence as olsi
leverage_pts = olsi(result).hat_matrix_diag
leverage_pts

array([0.00044497, 0.000363  , 0.0015524 , ..., 0.0004983 , 0.00155693,
       0.0004416 ])

In [45]:
mask = leverage_pts > (3*hat_mean)
high_leverage1 = leverage_pts[mask]
high_leverage1

array([0.50001108, 0.04629149, 0.01564205, 0.00796085, 0.02016852,
       0.01118322, 0.006675  , 0.01118763, 0.00750478, 0.02207944,
       0.00679319, 0.3333358 , 0.00732216, 0.0077295 , 0.04998853,
       0.01628761, 0.01389353, 0.00696007, 0.00937006, 0.00805004,
       0.21223925, 0.03493159, 0.01295984, 0.0078905 , 0.02217546,
       0.01101327, 0.22059799, 0.00732819, 0.01123522, 0.01036875,
       0.01005652, 0.27075579, 0.00705369, 0.06557237, 0.0076362 ,
       0.00652244, 0.00923787, 0.05234085, 0.04389426, 0.33333766,
       0.00986641, 0.00817756, 0.00923692, 0.00756448, 0.00975682,
       0.59694379, 0.03399465, 0.00703736, 0.01263821, 0.04365969,
       0.0075326 , 0.01158709, 0.50001108, 0.00709522, 0.0059081 ,
       0.01508059, 0.18601671, 0.02720846, 0.01205781, 0.01668877,
       0.00838278, 0.00633862, 0.01191185, 0.02340595, 0.00867054,
       0.03308635, 0.0375898 , 0.00684746, 0.00960248, 0.00626869,
       0.00861815, 0.09058254, 0.00918054, 0.33334166, 0.01304

Outliers

In [110]:
y = dataSTL[["F1_ret"]]
y = pd.DataFrame(y)
y['pred'] = np.dot(hat,y['F1_ret'])
y

Unnamed: 0,F1_ret,pred
88,-0.213296,-0.333568
178,0.217886,-0.301955
268,-0.222528,-0.548925
358,-0.514447,-0.632480
448,-0.475675,-0.397707
...,...,...
324084,0.000000,-0.457799
324174,-0.982014,-1.041509
324264,0.093173,-0.328616
324354,-3.482115,-1.075339


In [111]:
mse = (y['F1_ret']-y['pred'])**2
mse = mse.sum()/y.shape[0]
mse


0.36533214767257327

In [112]:
y['ei'] = y['F1_ret'] - y['pred']
y['H_ii'] = hat_d
y

Unnamed: 0,F1_ret,pred,ei,H_ii
88,-0.213296,-0.333568,0.120272,0.000445
178,0.217886,-0.301955,0.519841,0.000363
268,-0.222528,-0.548925,0.326397,0.001552
358,-0.514447,-0.632480,0.118034,0.001890
448,-0.475675,-0.397707,-0.077968,0.002065
...,...,...,...,...
324084,0.000000,-0.457799,0.457799,0.000342
324174,-0.982014,-1.041509,0.059495,0.001495
324264,0.093173,-0.328616,0.421789,0.000498
324354,-3.482115,-1.075339,-2.406776,0.001557


In [118]:
y['ri'] = y['ei'] / np.sqrt(mse*(1-y['H_ii']))
y['ri_abs'] = y['ri'].abs()
y

Unnamed: 0,F1_ret,pred,ei,H_ii,ri,ri_abs
88,-0.213296,-0.333568,0.120272,0.000445,0.199030,0.199030
178,0.217886,-0.301955,0.519841,0.000363,0.860211,0.860211
268,-0.222528,-0.548925,0.326397,0.001552,0.540431,0.540431
358,-0.514447,-0.632480,0.118034,0.001890,0.195467,0.195467
448,-0.475675,-0.397707,-0.077968,0.002065,-0.129128,0.129128
...,...,...,...,...,...,...
324084,0.000000,-0.457799,0.457799,0.000342,0.757539,0.757539
324174,-0.982014,-1.041509,0.059495,0.001495,0.098506,0.098506
324264,0.093173,-0.328616,0.421789,0.000498,0.698006,0.698006
324354,-3.482115,-1.075339,-2.406776,0.001557,-3.985016,3.985016


In [120]:
outliers = y[y['ri_abs'] > 3]
outliers['ri_abs']

6202       3.323755
7282       4.184912
8452       3.811324
14752      3.153724
17812      3.314271
31312      3.472196
33202      4.395365
35182      3.011245
35992      3.218542
44999      3.030216
47789      4.921294
56789     14.212946
64079      3.467789
72661      4.126033
78151      3.482046
81661      3.774126
86611      3.146242
107041     4.066487
111181     3.569021
113521     4.219924
117841     3.004625
118561     3.385322
122431     3.436082
125040     3.265282
125760     3.821375
128100     3.589923
142134     3.847252
142224     4.843529
143844     5.790955
161114     4.806181
207004     3.147497
217443     3.595371
235080     3.351871
235440     3.348519
244980     3.050982
250200     4.498258
254250     5.563454
258660     3.384886
262890     3.610752
269640     3.778725
272790     4.325927
280979     3.332421
286289     3.460106
288084     3.523311
324354     3.985016
Name: ri_abs, dtype: float64

In [121]:
test = result.outlier_test()
test

Unnamed: 0,student_resid,unadj_p,bonf(p)
88,0.198810,0.842422,1.000000
178,0.859345,0.390207,1.000000
268,0.539853,0.589332,1.000000
358,0.195251,0.845208,1.000000
448,-0.128985,0.897377,1.000000
...,...,...,...
324084,0.756758,0.449244,1.000000
324174,0.098397,0.921622,1.000000
324264,0.697279,0.485673,1.000000
324354,-3.989386,0.000068,0.243699
