# Regression Analysis of Temporal Processes (Class 5) - Random Effects, Fixed Effects with interaction 

## 1. Random effects

First we import the modules that we need and set up the data we used in the previous class

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import numpy.linalg as la

In [2]:
# Load the panel data and keep a working copy of the columns used in this section.
pan = pd.read_csv('panel-for-R.csv')

variables = [
    "tax", "idnum", "panelwave", "age", "sex", "realinc",
    "race", "region", "coop", "happy", "wrkstat", "marital",
]

sub = pan.loc[:, variables].copy()

In [3]:
# Build the outcome (`toohigh` = 4 - tax), index the rows by (idnum, panelwave)
# while keeping both as ordinary columns, and add the log of real income.
sub = (
    sub
    .assign(toohigh=lambda frame: 4 - frame['tax'])
    .set_index(['idnum', 'panelwave'], drop=False)
    .assign(logrealinc=lambda frame: np.log(frame['realinc']))
)

In [4]:
# slide 14
from linearmodels.panel import RandomEffects

# Random-effects regression of `toohigh` on log real income with wave dummies.
re_tax_model = RandomEffects.from_formula(
    'toohigh ~ 1 + logrealinc + C(panelwave)',
    data=sub,
)
re_tax = re_tax_model.fit()
re_tax

Inputs contain missing values. Dropping rows with missing observations.


0,1,2,3
Dep. Variable:,toohigh,R-squared:,0.1827
Estimator:,RandomEffects,R-squared (Between):,0.0059
No. Observations:,2856,R-squared (Within):,0.0056
Date:,"Thu, Aug 16 2018",R-squared (Overall):,0.0054
Time:,11:57:50,Log-likelihood,-1346.3
Cov. Estimator:,Unadjusted,,
,,F-statistic:,212.57
Entities:,1272,P-value,0.0000
Avg Obs:,2.2453,Distribution:,"F(3,2852)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,2.3162,0.1010,22.926,0.0000,2.1181,2.5142
C(panelwave)[T.2.0],-0.0162,0.0179,-0.9091,0.3634,-0.0513,0.0188
C(panelwave)[T.3.0],-0.0569,0.0191,-2.9845,0.0029,-0.0943,-0.0195
logrealinc,0.0254,0.0100,2.5495,0.0108,0.0059,0.0450


In [5]:
# Variance decomposition of the random-effects fit: the variance attributed to
# the individual effects, the idiosyncratic (residual) variance, and the share
# of the total that is due to the effects.
re_tax.variance_decomposition

Effects                   0.120515
Residual                  0.150702
Percent due to Effects    0.444349
Name: Variance Decomposition, dtype: float64

In [24]:
# Summary statistics of the fitted model's `theta` values.
# NOTE(review): presumably the per-entity quasi-demeaning weights used by the
# random-effects estimator -- confirm against the linearmodels documentation.
re_tax.theta.describe()

Unnamed: 0,theta
count,1272.0
mean,0.386091
std,0.08498
min,0.254418
25%,0.254418
50%,0.457413
75%,0.457413
max,0.457413


In [66]:
# slide 15

# Report the total and residual sums of squares of the random-effects fit.
for label, value in (
    ('Total sum of squares: ', re_tax.total_ss),
    ('Residual sum of squares: ', re_tax.resid_ss),
):
    print(label, value)

Total sum of squares:  525.255225596566
Residual sum of squares:  429.27118037383093


The R-squared, F-statistic, and p-value are shown in the summary table printed above.

In [9]:
# slide 16

# The variance decomposition is a Series indexed by string labels ('Effects',
# 'Residual', 'Percent due to Effects'). Look the entries up by label: integer
# keys on a label-indexed Series rely on a positional fallback that newer
# pandas versions deprecate.
var_decomp = re_tax.variance_decomposition

# Standard deviations are the square roots of the variance components.
print('sigma_u =', round(np.sqrt(var_decomp['Effects']), 5))
print('sigma_e =', round(np.sqrt(var_decomp['Residual']), 5))
# rho is the share of the total variance due to the individual effects.
print('rho =', round(var_decomp['Percent due to Effects'], 5))

sigma_u = 0.34715
sigma_e = 0.3882
rho = 0.44435


(Slide 19) Since we did not run the fe_tax model in the last class because it was for teaching purposes only, we are skipping the Hausman test as well. 

For reference, this test can be performed using the following function borrowed from https://github.com/sglyon/econtools/blob/master/Python/econtools/metrics.py  
(The function had to be revised slightly because `statsmodels.regression.linear_panel.PanelLMRandomResults` has been deprecated.)

In [27]:
def hausman(fe, re):
    r"""
    Compute the Hausman test for fixed effects versus random effects models.

    b = beta_fe
    B = beta_re

    From theory, b is consistent under both hypotheses, whereas B is
    consistent and efficient under the null but inconsistent under the
    alternative.

    The test statistic is computed as

        z = (b - B)' [V_b - V_B]^{-1} (b - B)

    and is distributed z \sim \chi^2(k), where k is the number of
    coefficients being compared.

    Only coefficients that appear (by name) in BOTH models are compared, so
    a term estimated by just one model -- for example an intercept that is
    present in the random effects fit only -- is ignored, and the two
    coefficient vectors are matched by name rather than by position.

    Parameters
    ==========
    fe : fitted fixed effects results
        Must expose ``params`` (a pandas Series indexed by coefficient
        name) and ``cov`` (a pandas DataFrame indexed by coefficient name
        on both axes).
    re : fitted random effects results
        Same attributes as ``fe``.

    Returns
    =======
    chi2 : float
        The test statistic
    df : int
        The number of degrees of freedom for the distribution of the
        test statistic (the number of coefficients compared)
    pval : float
        The p-value associated with the null hypothesis

    Raises
    ======
    ValueError
        If the two models share no coefficient names.
    numpy.linalg.LinAlgError
        If ``V_b - V_B`` is singular.

    Notes
    =====
    The null hypothesis supports the claim that the random effects
    estimator is "better". If we reject this hypothesis it is the same
    as saying we should be using fixed effects because there are
    systematic differences in the coefficients.
    """

    # Coefficients estimated by both models, in the fixed effects order.
    shared = [name for name in fe.params.index if name in re.params.index]
    if not shared:
        raise ValueError("fe and re have no coefficients in common to compare")

    # Pull the aligned estimates and covariance blocks out as plain arrays.
    b = fe.params.loc[shared].values
    B = re.params.loc[shared].values
    v_b = fe.cov.loc[shared, shared].values
    v_B = re.cov.loc[shared, shared].values

    # One degree of freedom per compared coefficient.
    df = len(shared)

    # compute test statistic and associated p-value; solving the linear
    # system avoids forming the explicit inverse of (V_b - V_B).
    diff = b - B
    chi2 = float(np.dot(diff, la.solve(v_b - v_B, diff)))
    pval = float(stats.chi2.sf(chi2, df))

    return chi2, df, pval

## 2. Fixed effects with interactions

(slide 26)

In [29]:
# Columns needed for the fixed-effects-with-interactions models.
variables = [
    'tax', 'coop', 'realinc', 'educ', 'panelwave',
    'idnum', 'race', 'partyid', 'polviews',
]
fsub_int = pan[variables].copy()

# 0/1 indicators for race codes 2 and 3, left missing wherever race is missing.
race_missing = pd.isnull(fsub_int['race'])
for indicator_name, race_code in (('race2', 2), ('race3', 3)):
    is_code = np.where(fsub_int['race'] == race_code, 1, 0)
    fsub_int[indicator_name] = np.where(race_missing, np.nan, is_code)

fsub_int = fsub_int.rename(columns={'polviews': 'conservative'})

In [30]:
# Panel data analysis needs a MultiIndex dataframe, so index the rows by
# (idnum, panelwave) while keeping both as ordinary columns.
fsub_int = fsub_int.set_index(['idnum', 'panelwave'], drop=False)

# First-difference each race indicator within the groups of index level 0 (idnum).
for source_col, diff_col in (('race2', 'd_race2'), ('race3', 'd_race3')):
    fsub_int[diff_col] = fsub_int[source_col].groupby(level=0).diff()

# Give the race indicators descriptive names.
fsub_int = fsub_int.rename(columns={'race2': 'black', 'race3': 'other'})

### Fixed effects without interactions
(slide 28)

In [68]:
from linearmodels.panel import PanelOLS

# Keep only the rows where both first-differenced race indicators equal zero,
# i.e. people who did not change their race.
race_unchanged = fsub_int['d_race2'].eq(0) & fsub_int['d_race3'].eq(0)
fsub_int2 = fsub_int.loc[race_unchanged].copy()

# `EntityEffects` in the formula requests one-way (entity) fixed effects.
# NOTE: this specification is expected to fail with AbsorbingEffectError
# (see the cell output); the failure is the point of the demonstration.
fe_race_int1_model = PanelOLS.from_formula(
    "conservative ~ black + other + panelwave + EntityEffects",
    data=fsub_int2,
)
fe_race_int1 = fe_race_int1_model.fit()
fe_race_int1

Inputs contain missing values. Dropping rows with missing observations.


AbsorbingEffectError: 
The model cannot be estimated. The included effects have fully absorbed
one or more of the variables. This occurs when one or more of the dependent
variable is perfectly explained using the effects included in the model.


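`linearmodels` cannot fit this model with `black` and `other` as predictors: after subsetting to people who did not change their race, both variables are constant within each person, so the entity fixed effects absorb them completely (they are perfectly collinear with the effects). The `plm` function in R automatically drops these two variables. We can take them out to produce the same results as `plm`: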

In [69]:
# Entity fixed-effects model with only the wave trend as a regressor.
fe_race_int1_model = PanelOLS.from_formula(
    "conservative ~ panelwave + EntityEffects",
    data=fsub_int2,
)
fe_race_int1 = fe_race_int1_model.fit()
fe_race_int1

Inputs contain missing values. Dropping rows with missing observations.


0,1,2,3
Dep. Variable:,conservative,R-squared:,0.0049
Estimator:,PanelOLS,R-squared (Between):,0.0907
No. Observations:,2598,R-squared (Within):,0.0049
Date:,"Mon, Aug 13 2018",R-squared (Overall):,0.0900
Time:,22:52:44,Log-likelihood,-2223.3
Cov. Estimator:,Unadjusted,,
,,F-statistic:,5.5802
Entities:,1472,P-value,0.0183
Avg Obs:,1.7649,Distribution:,"F(1,1125)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
panelwave,0.0861,0.0365,2.3622,0.0183,0.0146,0.1577


### Fixed effects with interactions
(slide 30)

In [75]:
# `a*b` expands to `a + b + a:b`, so the race main effects are in the model.
# NOTE: this specification is expected to fail with AbsorbingEffectError
# (see the cell output).
fe_race_int2_model = PanelOLS.from_formula(
    'conservative ~ black*panelwave + other*panelwave + EntityEffects',
    data=fsub_int2,
)
fe_race_int2 = fe_race_int2_model.fit()
fe_race_int2

Inputs contain missing values. Dropping rows with missing observations.


AbsorbingEffectError: 
The model cannot be estimated. The included effects have fully absorbed
one or more of the variables. This occurs when one or more of the dependent
variable is perfectly explained using the effects included in the model.


Same issue as above. We need to adjust the formula to produce the same results as `plm`.

Note that specifying an interaction term with `*` means that both the interaction term and the individual terms are in the model. For example, with `black*panelwave`, the model will try to fit `black + panelwave + black:panelwave`. As we've seen above, the time-invariant `black` variable is fully absorbed by the entity effects, so `linearmodels` will not fit this model. Instead, we will use `:` to tell `linearmodels` that we just want to fit the interaction term.

In [76]:
# Wave trend plus race-by-wave interactions only (`:`), without the race
# main effects, under entity fixed effects.
fe_race_int2_model = PanelOLS.from_formula(
    'conservative ~ panelwave + black:panelwave + other:panelwave + EntityEffects',
    data=fsub_int2,
)
fe_race_int2 = fe_race_int2_model.fit()
fe_race_int2

Inputs contain missing values. Dropping rows with missing observations.


0,1,2,3
Dep. Variable:,conservative,R-squared:,0.0152
Estimator:,PanelOLS,R-squared (Between):,0.0911
No. Observations:,2598,R-squared (Within):,0.0152
Date:,"Mon, Aug 13 2018",R-squared (Overall):,0.0893
Time:,22:56:54,Log-likelihood,-2209.8
Cov. Estimator:,Unadjusted,,
,,F-statistic:,5.7926
Entities:,1472,P-value,0.0006
Avg Obs:,1.7649,Distribution:,"F(3,1123)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
panelwave,0.1213,0.0403,3.0117,0.0027,0.0423,0.2003
black:panelwave,-0.3291,0.1061,-3.1011,0.0020,-0.5373,-0.1209
other:panelwave,0.1945,0.1663,1.1692,0.2426,-0.1319,0.5208


### Looking at pooled OLS 
(slide 35)

In [78]:
from linearmodels.panel import PooledOLS

# Pooled OLS with an intercept, the race indicators and the wave trend.
pooled_main_model = PooledOLS.from_formula(
    'conservative ~ 1 + black + other + panelwave',
    data=fsub_int2,
)
pooled_main_model.fit()

Inputs contain missing values. Dropping rows with missing observations.


0,1,2,3
Dep. Variable:,conservative,R-squared:,0.0123
Estimator:,PooledOLS,R-squared (Between):,0.0116
No. Observations:,2598,R-squared (Within):,0.0049
Date:,"Mon, Aug 13 2018",R-squared (Overall):,0.0123
Time:,23:02:21,Log-likelihood,-4604.5
Cov. Estimator:,Unadjusted,,
,,F-statistic:,10.729
Entities:,1472,P-value,0.0000
Avg Obs:,1.7649,Distribution:,"F(3,2594)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,3.9393,0.1417,27.794,0.0000,3.6614,4.2172
black,-0.3618,0.0817,-4.4265,0.0000,-0.5221,-0.2015
other,-0.4105,0.1163,-3.5306,0.0004,-0.6385,-0.1825
panelwave,0.0929,0.0561,1.6565,0.0977,-0.0171,0.2030


In [80]:
# Pooled OLS with race main effects and race-by-wave interactions.
pooled_interaction_model = PooledOLS.from_formula(
    'conservative ~ 1 + black*panelwave + other*panelwave',
    data=fsub_int2,
)
pooled_interaction_model.fit()

Inputs contain missing values. Dropping rows with missing observations.


0,1,2,3
Dep. Variable:,conservative,R-squared:,0.0137
Estimator:,PooledOLS,R-squared (Between):,0.0117
No. Observations:,2598,R-squared (Within):,0.0140
Date:,"Mon, Aug 13 2018",R-squared (Overall):,0.0137
Time:,23:03:51,Log-likelihood,-4602.6
Cov. Estimator:,Unadjusted,,
,,F-statistic:,7.2133
Entities:,1472,P-value,0.0000
Avg Obs:,1.7649,Distribution:,"F(5,2592)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Intercept,3.8319,0.1574,24.338,0.0000,3.5232,4.1406
black,0.4253,0.4116,1.0335,0.3015,-0.3817,1.2324
panelwave,0.1365,0.0626,2.1797,0.0294,0.0137,0.2593
black:panelwave,-0.3198,0.1639,-1.9513,0.0511,-0.6412,0.0016
other,-0.4193,0.5815,-0.7211,0.4709,-1.5596,0.7210
other:panelwave,0.0046,0.2357,0.0194,0.9845,-0.4577,0.4668
