In [1]:
import numpy as np
from numpy.linalg import inv
from statsmodels.api import add_constant

In [2]:
XTX = np.array([
    [100, 123,  96, 109],
    [123, 252, 125, 189],
    [ 96, 125, 167, 146],
    [109, 189, 146, 168]
])

XTy = np.array([460, 810, 615, 712]).reshape(-1,1)

yTy = 3924

In [None]:
# 1a
M = np.array([
    [252, 125, 189],
    [125, 167, 146],
    [189, 146, 168]
])

std_devs = np.sqrt(np.diag(M))

correlation_matrix = M / np.outer(std_devs, std_devs)
print(np.round(correlation_matrix, 4))

[[1.     0.6093 0.9186]
 [0.6093 1.     0.8716]
 [0.9186 0.8716 1.    ]]


In [None]:
#1b
print(inv(XTX) @ XTy)

[[-0.40220662]
 [ 6.12337011]
 [ 5.90972919]
 [-7.52557673]]


In [8]:
# 1c
XTX_1 = np.delete(np.delete(XTX, 3, 0), 3, 1)
XTy_1 = np.delete(XTy,3,0)
print(inv(XTX_1) @ XTy_1)

[[-0.22635778]
 [ 2.2800681 ]
 [ 2.10611877]]


In [9]:
XTX_2 = np.delete(np.delete(XTX, 2, 0), 2, 1)
XTy_2 = np.delete(XTy,2,0)
print(inv(XTX_2) @ XTy_2)

[[-0.06961614]
 [ 0.22923444]
 [ 4.02537411]]


In [10]:
XTX_3 = np.delete(np.delete(XTX, 1, 0), 1, 1)
XTy_3 = np.delete(XTy,1,0)
print(inv(XTX_3) @ XTy_3)

[[-0.06268743]
 [-0.09177817]
 [ 4.35852704]]


In [11]:
# 1d
XTX_no_const = np.delete(np.delete(XTX, 0, 0), 0, 1)

stds = np.sqrt(np.diag(XTX_no_const))


R = XTX_no_const / np.outer(stds, stds)

R_inv = np.linalg.inv(R)
VIFs = np.diag(R_inv)

for i, vif in enumerate(VIFs, start=1):
    print(f"VIF for x_{i}: {vif:.2f}")

VIF for x_1: 258.40
VIF for x_2: 168.07
VIF for x_3: 676.27


In [None]:
#2
import numpy as np
from scipy import stats


e_e = 520
n = 29
k = 3 
XTX = np.array([[29, 0, 0],
                [0, 50, 10],
                [0, 10, 80]])
beta_hat = np.array([4, 0.4, 0.9])


sigma_squared = e_e / (n - k)


XTX_inv = np.linalg.inv(XTX)

R = np.array([[0, 1, 1]])
r = 1


Rb_minus_r = R @ beta_hat - r
denominator = R @ XTX_inv @ R.T * sigma_squared
F_stat = (Rb_minus_r ** 2) / denominator


df1 = 1  
df2 = n - k


p_value = 1 - stats.f.cdf(F_stat, df1, df2)

print("F-statistic:", F_stat[0][0])
print("p-value:", p_value)


F-statistic: 0.1595454545454546
p-value: [[0.69283761]]


In [24]:
#2
import polars as pl 
import statsmodels.api as sm

from scipy.stats import chi2


In [14]:

df1 = pl.read_excel("data/timeinvar.xlsx")
df2 = pl.read_excel("data/timevar.xlsx")
df = df1.join(df2, on="id", how="left",validate="m:m")

In [None]:
#1a
X_1 = df[["edu", "exper", "ability"]].to_numpy()
X_1 = sm.add_constant(X_1)
y  = df[["lwage"]].to_numpy()
res1 = sm.OLS(y, X_1).fit()
print(res1.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.173
Model:                            OLS   Adj. R-squared:                  0.173
Method:                 Least Squares   F-statistic:                     1253.
Date:                Mon, 30 Jun 2025   Prob (F-statistic):               0.00
Time:                        19:58:41   Log-Likelihood:                -12283.
No. Observations:               17919   AIC:                         2.457e+04
Df Residuals:                   17915   BIC:                         2.460e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0272      0.030     34.194      0.0

In [17]:
X_2 = df[["meduc", "feduc", "brokenhome", "siblings"]].to_numpy()
X_2 = sm.add_constant(X_2)
y  = df[["lwage"]].to_numpy()
res2 = sm.OLS(y, X_2).fit()
print(res2.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.027
Model:                            OLS   Adj. R-squared:                  0.027
Method:                 Least Squares   F-statistic:                     123.2
Date:                Mon, 30 Jun 2025   Prob (F-statistic):          6.81e-104
Time:                        19:58:41   Log-Likelihood:                -13746.
No. Observations:               17919   AIC:                         2.750e+04
Df Residuals:                   17914   BIC:                         2.754e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0119      0.019    104.391      0.0

In [18]:
#3b
A = np.identity(len(res1.params))
A = A[1:,:]
A

f_test_result = res1.f_test(A)
print(f_test_result)

<F test: F=1252.9441313488094, p=0.0, df_denom=1.79e+04, df_num=3>


In [19]:
#3c
A = np.identity(len(res2.params))
A = A[1:,:]
A

f_test_result = res2.f_test(A)
print(f_test_result)

<F test: F=123.18097459989582, p=6.806967723094877e-104, df_denom=1.79e+04, df_num=4>


In [20]:
# 3d
res2.wald_test(A)



<class 'statsmodels.stats.contrast.ContrastResults'>
<F test: F=array([[123.1809746]]), p=6.806967723094877e-104, df_denom=1.79e+04, df_num=4>

In [21]:
# 4a
df = pl.read_excel("data/Nerlove1963.xlsx").to_pandas()
for col in df.columns:
    df[f"{col}_log"] = np.log(df[col])
X = df[["output_log", "Plabor_log", "Pcapital_log", "Pfuel_log"]]
X = sm.add_constant(X)
y = df["Cost_log"]

model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               Cost_log   R-squared:                       0.926
Model:                            OLS   Adj. R-squared:                  0.924
Method:                 Least Squares   F-statistic:                     437.7
Date:                Mon, 30 Jun 2025   Prob (F-statistic):           4.82e-78
Time:                        19:58:41   Log-Likelihood:                -67.542
No. Observations:                 145   AIC:                             145.1
Df Residuals:                     140   BIC:                             160.0
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -3.5265      1.774     -1.987   

In [None]:
#4c
glm_model = sm.GLM(y, X, family=sm.families.Gaussian())
constraint = 'Plabor_log + Pcapital_log + Pfuel_log = 1'

model_constrained = glm_model.fit_constrained(constraint)
print("\nConstrained GLM (Gaussian) results:")
print(model_constrained.summary())


Constrained GLM (Gaussian) results:
                 Generalized Linear Model Regression Results                  
Dep. Variable:               Cost_log   No. Observations:                  145
Model:                            GLM   Df Residuals:                      141
Model Family:                Gaussian   Df Model:                            3
Link Function:               Identity   Scale:                         0.15348
Method:                          IRLS   Log-Likelihood:                -67.838
Date:                Mon, 30 Jun 2025   Deviance:                       21.640
Time:                        20:00:53   Pearson chi2:                     21.6
No. Iterations:                     1   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const      

In [None]:
# 4d
R = np.array([[0, 0, 1, 1, 1]])
q = np.array([1]) 


beta_hat = model_constrained.params.values
cov_beta = model_constrained.cov_params().values

W = (R @ beta_hat - q) @ np.linalg.inv(R @ cov_beta @ R.T) @ (R @ beta_hat - q)
p_value = 1 - chi2.cdf(W, df=1)

print(f"Wald statistic: {W:.4f}")
print(f"p-value: {p_value:.4f}")

Wald statistic: 0.0000
p-value: 1.0000
