In [284]:
import numpy as np
import pandas as pd
from scipy.stats import bartlett, chi2, f, t

from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
init_notebook_mode(connected=True)

In [244]:
df = pd.read_csv('IB_statistics_data_hw7.txt', sep=' ')
df.head(3)

Unnamed: 0,ex1y,ex3x_1,ex1x_2,ex1x_3,Ex2.y,Ex2.x1,Ex2.x2,Ex4.y,Ex4.x1,Ex4.x2
0,12.180411,2.257498,3.851493,-0.14119,-230.183967,0.196763,33.271065,39.551743,5.137313,1.190741
1,3.066805,0.961697,0.86658,-0.364919,-252.509445,-0.540378,33.964212,31.612666,3.550462,1.151934
2,28.187584,0.526848,12.228611,-1.254557,-190.689938,0.860635,29.805329,35.671452,3.884775,1.235244


### Task 1

**Случай 1.** $cov(\varepsilon) = \sigma^2 \mathbb{1}$

Здесь можем воспользоваться обычными МНК-оценками для $\beta$ и $\sigma^2$

In [238]:
X = df[['ex3x_1', 'ex1x_2', 'ex1x_3']].values
y = df['ex1y'].values
X_ = np.concatenate((np.ones([X.shape[0], 1]), X), axis=1) # shape: [n, m_features+1]
b0, b1, b2, b3 = np.dot(np.dot(np.linalg.inv(np.dot(X_.T, X_)), X_.T), y)
beta = np.array([b0, b1, b2, b3])
y_pred1 = np.dot(X_, beta)
errors = y - y_pred1
sigma_squared = np.dot(errors.T, errors)/(X_.shape[0] - X_.shape[1])
print(f"Estimated beta: {beta}")
print(f"Estimated sigma-squared: {sigma_squared}")

Estimated beta: [ 0.52532792  1.05335225  1.99097775 -2.04502139]
Estimated sigma-squared: 1.0300435906917624


**Случай 2.**
$$ cov(\varepsilon) = \sigma^2
\begin{pmatrix}
    w_1 & \dots & 0 \\
    \vdots & \ddots & \vdots \\
    0 & \dots & w_n
\end{pmatrix}, w_{i} = 1 + \frac{i}{n}$$

Вернемся в условия обычной линейной регресси, разделим каждое уравнение на 1 + $\frac{i}{n}$.

Далее воспользуемся обычными МНК-оценками для $\beta$ и $\sigma^2$

In [240]:
X = df[['ex3x_1', 'ex1x_2', 'ex1x_3']].copy().values
y = df['ex1y'].copy().values
n = X.shape[0]
scaler = 1 + (np.arange(n) + 1)/n # TODO: check this
X[:, 0] = X[:, 0] / scaler
X[:, 1] = X[:, 1] / scaler
X[:, 2] = X[:, 2] / scaler
y = y / scaler
X_ = np.concatenate((np.ones([X.shape[0], 1]), X), axis=1) # shape: [n, m_features+1]
b0, b1, b2, b3 = np.dot(np.dot(np.linalg.inv(np.dot(X_.T, X_)), X_.T), y)
beta = np.array([b0, b1, b2, b3])
y_pred2 = np.dot(X_, beta)
errors = y - y_pred2
sigma_squared = np.dot(errors.T, errors)/(X_.shape[0] - X_.shape[1])
print(f"Estimated beta: {beta}")
print(f"Estimated sigma-squared: {sigma_squared}")

Estimated beta: [ 0.29560178  1.05689783  2.02261105 -2.04489143]
Estimated sigma-squared: 0.48910507383739454


**Случай 3.**
Дисперсии ошибок в первой трети наблюдений равны $w_{1}$, во второй трети - $w_{2}$, остальные - $w_{3}$.

Решение:  проводим обычную процедуру оценивания вектора ошибок, разбиваем его на три подвектора, строим оценки $w_{i}$, нормируем на них переменные и возвращаемся в условия обычной линейной регрессии.

Далее используем обычные МНК-оценками для $\beta$ и $\sigma^2$

In [241]:
# Оцениваем w_i
X = df[['ex3x_1', 'ex1x_2', 'ex1x_3']].copy().values
y = df['ex1y'].copy().values
lr = LinearRegression().fit(X, y)
errors = y - lr.predict(X)
n = X.shape[0]
slide = int(n / 3)
errors1 = errors[0:slide]
errors2 = errors[slide:slide*2]
errors3 = errors[slide*2:]
w1 = 1/slide * np.dot(errors1.T, errors1)
w2 = 1/slide * np.dot(errors2.T, errors2)
w3 = 1/slide * np.dot(errors3.T, errors3)
print(f"Estimated W: {[w1, w2, w3]}")

X[0:slide, :] = X[0:slide, :] / w1
X[slide:slide*2, :] = X[slide:slide*2, :] / w2
X[slide*2, :] = X[slide*2, :] / w3

y[0:slide] = y[0:slide] / w1
y[slide:slide*2] = y[slide:slide*2] / w2
y[slide*2] = y[slide*2] / w3

# # Оцениваем beta
X_ = np.concatenate((np.ones([X.shape[0], 1]), X), axis=1) # shape: [n, m_features+1]
b0, b1, b2, b3 = np.dot(np.dot(np.linalg.inv(np.dot(X_.T, X_)), X_.T), y)
beta = np.array([b0, b1, b2, b3])
y_pred3 = np.dot(X_, beta)
errors = y - y_pred3
sigma_squared = np.dot(errors.T, errors)/(X_.shape[0] - X_.shape[1])
print(f"Estimated beta: {beta}")
print(f"Estimated sigma-squared: {sigma_squared}")

Estimated W: [0.8200154087008855, 1.1811576244285236, 1.0048725478689997]
Estimated beta: [ 0.56781695  1.0513797   2.00580407 -2.05095773]
Estimated sigma-squared: 1.0590226585019369


**Случай 4.** Дисперсия ошибки пропорциональна третьему признаку.

Здесь просто нормируем уравнения на третий признак и переходим к обычной линейной регрессии.

In [255]:
X = df[['ex3x_1', 'ex1x_2', 'ex1x_3']].copy().values
y = df['ex1y'].copy().values

X_ = np.concatenate((np.ones([X.shape[0], 1]), X), axis=1) # shape: [n, m_features+1]
y = y / X_[:, 3]
X_[:, 0] = X_[:, 0] / X_[:, 3]
X_[:, 1] = X_[:, 1] / X_[:, 3]
X_[:, 2] = X_[:, 2] / X_[:, 3]
X_[:, 3] = X_[:, 3] / X_[:, 3]

b0, b1, b2, b3 = np.dot(np.dot(np.linalg.inv(np.dot(X_.T, X_)), X_.T), y)
beta = np.array([b0, b1, b2, b3])
y_pred4 = np.dot(X_, beta)
errors = y - y_pred4
sigma_squared = np.dot(errors.T, errors)/(X_.shape[0] - X_.shape[1])
print(f"Estimated beta: {beta}")
print(f"Estimated sigma-squared: {sigma_squared}")

Estimated beta: [-2.08725743  1.87248833  2.22553275 -2.13753848]
Estimated sigma-squared: 1.0526638268446493


Проверим библиотечную модель линейной регрессии

In [258]:
X = df[['ex3x_1', 'ex1x_2', 'ex1x_3']].copy().values
y = df['ex1y'].copy().values

lr = LinearRegression().fit(X, y)
y_pred0 = lr.predict(X)

Выберем, какая модель лучше по MSE

In [259]:
y = df['ex1y'].copy().values

mse0 = np.sum((y - y_pred0)**2) # library model
mse1 = np.sum((y - y_pred1)**2)
mse2 = np.sum((y - y_pred2)**2)
mse3 = np.sum((y - y_pred3)**2)
mse4 = np.sum((y - y_pred4)**2)

print(f"Model 0 (library), mse error: {mse0}")
print(f"Model 1, mse error: {mse1}")
print(f"Model 2, mse error: {mse2}")
print(f"Model 3, mse error: {mse3}")
print(f"Model 4, mse error: {mse4}")

Model 0 (library), mse error: 147.29623346892205
Model 1, mse error: 147.29623346892203
Model 2, mse error: 9295.500643296129
Model 3, mse error: 1376.4676001213586
Model 4, mse error: 438772.447737172


 Получается, что модель 1 лучше всех.

### Task 2

Визуализируем распределение остатков

In [10]:
X = df[['Ex2.x1', 'Ex2.x2']].values
y = df['Ex2.y'].values
lr = LinearRegression().fit(X, y)
predictions = lr.predict(X)
errors = y - predictions

In [122]:
y2pred = sorted(list(zip(y, errors)))
traces = []
traces.append(go.Scatter(x=[t[0] for t in y2pred], y=[t[1] for t in y2pred], mode='markers'))
fig = go.Figure(data=traces, layout=go.Layout(title=f'Errors'))
fig.update_xaxes(title='y')
fig.update_yaxes(title="errors")
iplot(fig)

#### a) Критерий Бартлетта

**Ответ:** H0 отклоняется

In [264]:
def my_bartlett(df, n=21, column='Ex2.x1'):
    tmp_df = df.copy().sort_values(column)
    X = tmp_df[['Ex2.x1', 'Ex2.x2']].values
    y = tmp_df['Ex2.y'].values
    X_v = []
    y_v = []
    n_v = []
    for i in range(0, tmp_df.shape[0], n):
        X_v.append(X[i:i+n, :])
        y_v.append(y[i:i+n])
        n_v.append(y_v[-1].shape[0])
    X_v = np.array(X_v)
    y_v = np.array(y_v)
    n_v = np.array(n_v)

    n = df.shape[0]
    n_samples = X_v.shape[0]

    s_v = []
    errors_v = []
    for i in np.arange(len(X_v)):
        lr = LinearRegression().fit(X_v[i], y_v[i])
        y_pred = lr.predict(X_v[i])
        errors = y_v[i] - y_pred
        errors_v.append(errors)
        s_v.append(np.dot(errors.T, errors)/(X_v[i].shape[0] - X_v[i].shape[1] - 1))

    s_v = np.array(s_v)
    Q = n * np.log(np.sum(s_v**2 * n_v/n)) - np.sum(n_v*np.log(s_v**2))
    I = 1 + 1/(3 * (n_samples - 1)) * (np.sum(1/n_v) - 1/n)

    t = Q/I
    chi2_critical_value = chi2.ppf(0.95, df=n_samples-1)

    print(f"Statistics: {t}, chi2 critical value: {chi2_critical_value}")
    if t > chi2_critical_value:
        print("H0 is rejected!")
    else:
        print("H0 can't be rejected")
    return errors_v

Проверим гипотезу при упорядочивании по разным переменным + сравним с библиотечной функцией

In [265]:
errors_v = my_bartlett(df, column='Ex2.x1')
bartlett(*np.array(errors_v))

Statistics: 295.5314575599455, chi2 critical value: 12.591587243743977
H0 is rejected!


BartlettResult(statistic=108.04443147487015, pvalue=5.2316585193163286e-21)

In [266]:
errors_v = my_bartlett(df, column='Ex2.x2')
bartlett(*np.array(errors_v))

Statistics: 54.15997055079484, chi2 critical value: 12.591587243743977
H0 is rejected!


BartlettResult(statistic=12.583588726439611, pvalue=0.05014635739246138)

#### б) Критерий Голдфеда–Куандта

**Ответ:** H0 не отклоняется при разбиении по любой из переменных

In [267]:
def goldfeld_quandt(df, column='Ex2.x1', exclude=0.25):
    tmp_df = df.copy().sort_values(column)
    n = tmp_df.shape[0]
    d = int(0.25 * n)
    m = int((n - d)/2)
    X = tmp_df[['Ex2.x1', 'Ex2.x2']].values
    y = tmp_df['Ex2.y'].values
    X1 = X[0:m, :]
    y1 = y[0:m]
    X2 = X[-m:, :]
    y2 = y[-m:]
    
    lr1 = LinearRegression().fit(X1, y1)
    errors1 = y1 - lr1.predict(X1)
    lr2 = LinearRegression().fit(X2, y2)
    errors2 = y2 - lr2.predict(X2)
    
    t = np.dot(errors1.T, errors1)/np.dot(errors2.T, errors2)
    q = m - 3
    f_critical_value = f.ppf(0.95, q, q)
    print(f"Statistics: {t}, Fisher critical value: {f_critical_value}")
    if t > f_critical_value:
        print("H0 is rejected!")
    else:
        print("H0 can't be rejected")

In [268]:
goldfeld_quandt(df, column='Ex2.x1')

Statistics: 1.0574462862846776, Fisher critical value: 1.5847048974501694
H0 can't be rejected


In [269]:
goldfeld_quandt(df, column='Ex2.x2')

Statistics: 1.2741158483402348, Fisher critical value: 1.5847048974501694
H0 can't be rejected


### Task 3

Проверим гипотезу β = 0 путем построения доверительной области.

**Ответ:** отклонили H0.

In [272]:
df_5 = pd.read_csv('../hw5/IB_statistics_data_hw5.txt', sep=' ')
X = df_5[['ex3x_1', 'ex3x_2', 'ex3x_3']].copy().values
y = df_5['ex3y'].copy().values

In [280]:
X_ = np.concatenate((np.ones([X.shape[0], 1]), X), axis=1) # shape: [n, m_features+1]
b0, b1, b2, b3 = np.dot(np.dot(np.linalg.inv(np.dot(X_.T, X_)), X_.T), y)
beta = np.array([b0, b1, b2, b3])
errors = y - np.dot(X_, beta)
s_squared = np.dot(errors.T, errors)/(X_.shape[0] - X_.shape[1])
n = X_.shape[0]
m = X_.shape[1]
left_part = np.dot(beta.T, np.dot(np.dot(X_.T, X_), beta))
right_part = m/(n-m) * s_squared * f.ppf(0.95, m, n-m)
if left_part <= right_part:
    print(f"{left_part} < {right_part}")
    print("Point [0,0,0,0] in the confidence area. H0 can't be rejected")
else:
    print(f"{left_part} >= {right_part}")
    print("Point [0,0,0,0] not in the confidence area. H0 is rejected")

20625.872138801577 >= 0.1416396188059245
Point [0,0,0,0] not in the confidence area. H0 is rejected


### Task 4

4c) Доверительный параллелепипед и доверительная область

In [283]:
X = df[['Ex4.x1', 'Ex4.x2']].copy().values
y = df['Ex4.y'].copy().values

In [299]:
W = np.linalg.inv(np.dot(X.T, X))
b1, b2 = np.dot(np.dot(W, X.T), y)
beta = np.array([b1, b2])
errors = y - np.dot(X, beta)
n = X.shape[0]
m = X.shape[1]
s_squared = np.dot(errors.T, errors)/(n - m)

# Confidence rectangle
t_val = t.interval(0.95, n-m)[1]
b1_low = b1 - t_val*np.sqrt(W[0, 0]/(n-m)*s_squared)
b1_upp = b1 + t_val*np.sqrt(W[0, 0]/(n-m)*s_squared)
b2_low = b2 - t_val*np.sqrt(W[1, 1]/(n-m)*s_squared)
b2_upp = b2 + t_val*np.sqrt(W[1, 1]/(n-m)*s_squared)

print("---Confidence rectangle---")
print(f"b1 estimation: {b1}, confidence interval:({b1_low}, {b1_upp})")
print(f"b2 estimation: {b2}, confidence interval:({b2_low}, {b2_upp})")

print("---Confidence area---")
right_part = m/(n-m) * s_squared * f.ppf(0.95, m, n-m)
print(f"Confidence area border value: {right_part}")

---Confidence rectangle---
b1 estimation: 13.805946008678244, confidence interval:(13.75999373215832, 13.85189828519817)
b2 estimation: -14.66115030728701, confidence interval:(-14.764362074971816, -14.557938539602203)
---Confidence area---
Confidence area border value: 4.868463346683403


In [332]:
traces = []
traces.append(go.Scatter(x=[b1_low, b1_upp], y=[b2_low, b2_low], line={'color':'black'}))
traces.append(go.Scatter(x=[b1_low, b1_upp], y=[b2_upp, b2_upp], line={'color':'black'}))
traces.append(go.Scatter(x=[b1_low, b1_low], y=[b2_low, b2_upp], line={'color':'black'}))
traces.append(go.Scatter(x=[b1_upp, b1_upp], y=[b2_low, b2_upp], line={'color':'black'}))

x_b1 = np.linspace(13.5, 14, 1000)
y_b2 = np.linspace(-15, -14, 1000)

conf_area_points_x = []
conf_area_points_y = []
for x in x_b1:
    for y in y_b2:
        v = np.array([x, y])
        if np.dot((beta - v).T, np.dot(np.dot(X.T, X), (beta - v))) < right_part:
            conf_area_points_x.append(x)
            conf_area_points_y.append(y)
            
traces.append(go.Scatter(x=conf_area_points_x, y=conf_area_points_y, mode='markers',
                         opacity=0.5, line={'color':'green'}))


fig = go.Figure(data=traces, layout=go.Layout(title='Confidence intervals vs Confidence area',
                                              showlegend=False))
fig.update_xaxes(title="b1")
fig.update_yaxes(title="b2")
iplot(fig)