# Сравнение моделей

In [None]:
import numpy as np
import scipy.stats as st
import matplotlib.pylab as plt
import pandas as pd

In [None]:
# Synthetic regression data: two standard-normal regressors with unit weights,
# a response with small Gaussian noise, and a separate noise vector that is
# drawn independently and never enters y.
X = np.random.randn(1000, 2)
w = np.ones(2)
y = X.dot(w) + np.random.randn(len(X)) * 0.1
noise = np.random.randn(len(y))

In [None]:
# Scatter of the response against the separately drawn noise vector.
plt.scatter(noise, y)

In [None]:
# Scatter of the response against the first column of X.
plt.scatter(X[:, 0], y)

In [None]:
# Scatter of the response against the second column of X.
plt.scatter(X[:, 1], y)

In [None]:
# Scatter of the response against the row-wise sum of X's columns.
plt.scatter(X.sum(axis=1), y)

In [None]:
from statsmodels.regression.linear_model import OLS

# Fit y on the first column of X only and display the fit summary.
simple_model = OLS(endog=y, exog=X[:, 0]).fit()
simple_model.summary()

In [None]:
from statsmodels.regression.linear_model import OLS

# The one-regressor fit expressed as a formula over a DataFrame;
# the trailing "-1" in the formula removes the intercept term.
dataframe = pd.DataFrame(dict(y=y, x1=X[:, 0]))
simple_model = OLS.from_formula('y~x1 -1', dataframe).fit()
simple_model.summary()

In [None]:
from statsmodels.regression.linear_model import OLS

# Fit y on the noise vector alone and display the fit summary.
noise_model = OLS(endog=y, exog=noise).fit()
noise_model.summary()

In [None]:
from statsmodels.regression.linear_model import OLS

# Two-regressor model (both columns of X), again without an intercept ("-1").
dataframe = pd.DataFrame(dict(y=y, x1=X[:, 0], x2=X[:, 1]))
complex_model = OLS.from_formula('y~x1 + x2 -1', dataframe).fit()
complex_model.summary()

In [None]:
# F-test comparing complex_model (x1 + x2) against simple_model (x1 only).
complex_model.compare_f_test(simple_model)

In [None]:
# Likelihood-ratio test of complex_model (x1 + x2) against simple_model (x1 only).
complex_model.compare_lr_test(simple_model)

In [None]:
# Lagrange-multiplier test of complex_model (x1 + x2) against simple_model (x1 only).
complex_model.compare_lm_test(simple_model)

In [None]:
from statsmodels.regression.linear_model import OLS

# Three-regressor model: both columns of X plus the separately drawn noise
# vector, without an intercept ("-1").
dataframe = pd.DataFrame(dict(y=y, noise=noise, x1=X[:, 0], x2=X[:, 1]))
complex_model2 = OLS.from_formula('y~x1 + x2 + noise -1', dataframe).fit()
complex_model2.summary()

In [None]:
# LM test of complex_model2 (x1 + x2 + noise) against complex_model (x1 + x2).
complex_model2.compare_lm_test(complex_model)

In [None]:
# LM test of complex_model2 (x1 + x2 + noise) against noise_model (noise only).
complex_model2.compare_lm_test(noise_model)

In [None]:
# Display the fit summary of simple_model.
simple_model.summary()

In [None]:
# Display the fit summary of complex_model.
complex_model.summary()

# Выбор моделей: случай невложенных моделей

In [None]:
# New data set: a single regressor with a purely quadratic response
# plus small Gaussian noise.
X = np.random.randn(100)
y = np.square(X) + np.random.randn(len(X)) * 0.1
plt.scatter(X, y)

In [None]:
# Scatter of the response against the squared regressor.
plt.scatter(X**2, y)

In [None]:
# Model 1: y regressed on X itself.
model1 = OLS(endog=y, exog=X).fit()
model1.summary()


In [None]:
# Model 2: y regressed on the square of X.
model2 = OLS(endog=y, exog=X**2).fit()
model2.summary()


In [None]:
# In-sample predictions: each model predicts from the regressor it was fitted on.
y_predicted1 = model1.predict(X)
y_predicted2 = model2.predict(X**2)

In [None]:
# Predictions of model1 (horizontal axis) against the observed response.
plt.scatter(y_predicted1, y)

In [None]:
# Predictions of model2 (horizontal axis) against the observed response.
plt.scatter(y_predicted2, y)

In [None]:
# Comparing non-nested models: augment each model with the other model's
# predictions as an extra regressor. If the added predictions are significant,
# the rival model explains something the augmented model misses.
data = pd.DataFrame({'y':y, 'y1':y_predicted1, 'y2': y_predicted2, 'X':X, 'X2':X**2})
# Model 1 (regressor X) plus the predictions of model 2; "-1" keeps the fit
# intercept-free, matching how model1 and model2 were built.
model1_with_y_2 = OLS.from_formula('y ~ X + y2 - 1', data).fit()
# Model 2 (regressor X**2) plus the predictions of model 1.
model2_with_y_1 = OLS.from_formula('y ~ X2 + y1 - 1', data).fit()


In [None]:
# Display the fit summary of model1_with_y_2.
model1_with_y_2.summary()

In [None]:
# Display the fit summary of model2_with_y_1.
model2_with_y_1.summary()

# Кодирование категориальных переменных

In [None]:
# Categorical sample with three levels: ten 1s, seven 2s and five 3s,
# shuffled in place and then converted to an array.
data = [level for level, count in ((1, 10), (2, 7), (3, 5)) for _ in range(count)]
np.random.shuffle(data)
data = np.asarray(data)
data

In [None]:
from patsy.contrasts import Treatment
# Build the Treatment contrast (no-intercept variant) for the three levels
# and print its coding matrix.
levels = [1,2,3]
contrast = Treatment().code_without_intercept(levels)
print(contrast.matrix)

In [None]:
# Encode every observation: shift the value by one to get the row index
# into the contrast matrix and pick that row.
contrast.matrix[data-1]

In [None]:
from patsy.contrasts import Sum
# Build the Sum contrast (no-intercept variant) for the levels list
# and print its coding matrix.
contrast = Sum().code_without_intercept(levels)
print(contrast.matrix)

In [None]:
# Encode every observation: shift the value by one to get the row index
# into the current contrast matrix and pick that row.
contrast.matrix[data-1]

# Метод Бокса-Кокса

In [None]:
# Seeded normal sample whose upper tail (values above 1) is stretched by 1.45,
# then shifted so that the minimum equals 1 (strictly positive data).
rs = np.random.RandomState(42)
y = rs.randn(100)
tails = y > 1
print(tails.sum())
y[tails] *= 1.45

y = y - y.min() + 1
plt.hist(y)
print(y.min())
# Shapiro-Wilk normality test on the shifted sample.
st.shapiro(y)

In [None]:
# Histogram of the Box-Cox transformed sample for a range of fixed lambdas,
# one figure per lambda.
for l in [-5, -2, -1, -0.5, 0, 0.5, 1, 2, 5]:
    plt.hist(st.boxcox(y, l))   
    plt.title('lambda = '+str(l))
    plt.show()

In [None]:
# Scan lambda from -2 (inclusive) to 2 (exclusive) in steps of 0.05 and print
# the Shapiro-Wilk result for each Box-Cox transformed sample.
for l in np.arange(-2.0,  2.0, 0.05):
    
    print (l, st.shapiro(st.boxcox(y,l)))
    

In [None]:
# Box-Cox with no lambda supplied: scipy picks lambda itself and returns it
# together with the transformed data.
st.boxcox(y)

In [None]:
# Heteroscedastic data: sorted regressor, with the noise scale growing
# linearly with the observation index.
rs = np.random.RandomState(42)
X = np.sort(rs.randn(100))
error = rs.randn(100) * 0.1 * np.arange(100)
y = X + error
# Shift the response so that its minimum equals 1 (strictly positive).
y = y - y.min() + 1
plt.scatter(X, y)
print(y.min())

In [None]:
# Fit y on X and plot the residuals (observed minus predicted) against the
# predicted values.
model = OLS(y, X).fit()
predicted = model.predict(X)
plt.scatter(predicted, y-predicted )


In [None]:
def v(lam, values=None):
    """Return the Box-Cox power transform for the parameter ``lam``.

    The transform is ``(x**lam - 1) / lam`` for ``lam != 0`` and ``log(x)``
    for ``lam == 0`` (the limit of the first expression as ``lam -> 0``).

    Parameters
    ----------
    lam : float
        Box-Cox power parameter.
    values : array-like, optional
        Strictly positive data to transform. When omitted, the notebook-level
        response ``y`` is transformed, so ``v(lam)`` can be compared directly
        with ``st.boxcox(y, lam)``.

    Returns
    -------
    numpy.ndarray
        The transformed values, same shape as the input.
    """
    source = y if values is None else values
    source = np.asarray(source, dtype=float)
    if lam == 0:
        # The power formula divides by zero here; use its limit instead.
        return np.log(source)
    return (np.power(source, lam) - 1.0) / lam

In [None]:
# Compare v with scipy's boxcox: print the summed element-wise difference
# for several lambdas (values near zero mean the two agree).
for l in [-2, -1, -0.5, 0, 0.5, 1, 2]:
    print (np.sum(v(l) - st.boxcox(y,l)))

In [None]:
# Residuals-vs-predicted plot for the untransformed response.
model = OLS(y, X).fit()
predicted = model.predict(X)
plt.scatter(predicted, y-predicted )
plt.title('original')
plt.ylim((-2, 50))
plt.show()
    
# The same plot after transforming the response with v for several lambdas;
# each lambda gets its own refit and its own figure.
for l in [-10, -5, -1, -0.5, 0, 0.5, 1]:
    model = OLS(v(l), X).fit()
    predicted = model.predict(X)
    plt.scatter(predicted, v(l)-predicted )
    plt.title('lambda = '+str(l))
    plt.ylim((-5, 50))
    plt.show()
    

In [None]:
# Scan lambda over [-5, 5) in steps of 0.1: refit on the transformed response,
# record the log of the residual mean squared error, plot the curve and show
# the lambda with the smallest value.
# NOTE(review): mse_resid is measured on the transformed response, whose scale
# changes with lam, so the values are not directly comparable across lambdas;
# the usual Box-Cox profile likelihood adds a Jacobian term. Confirm whether
# the omission is intentional for this demonstration.
l = np.arange(-5, 5, 0.1)
r = []
for lam in l:   
    model = OLS(v(lam), X).fit()
    r.append(np.log(model.mse_resid))
plt.plot(l, r)
l[np.argmin(r)]

In [None]:
# Baseline: fit on the untransformed response and print its residual MSE.
model = OLS(y, X).fit()
print (model.mse_resid)

In [None]:
# Display the fit summary of the current `model`.
model.summary()

In [None]:
# Refit on the response transformed by scipy's boxcox with lambda chosen by
# scipy ([0] takes the transformed data from the returned pair), then print
# the residual MSE.
model = OLS(st.boxcox(y)[0], X).fit()
print (model.mse_resid)

In [None]:
# Display the fit summary of the current `model`.
model.summary()

In [None]:
# Refit on the response transformed by v with lambda = -5 and print the
# residual MSE.
model = OLS(v(-5), X).fit()
print (model.mse_resid)

In [None]:
# Display the fit summary of the current `model`.
model.summary()