<a href="https://colab.research.google.com/github/nedokormysh/OpenEdu_HSE_INTRML/blob/week4/week_4_experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.base import BaseEstimator


# Seed NumPy's global RNG so that random draws are reproducible.
np.random.seed(0)

# NOTE(review): this silences *every* warning, including NumPy overflow /
# invalid-value RuntimeWarnings that would reveal a diverging gradient
# descent — consider narrowing the filter to the warnings actually expected.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
class LinearRegressionVectorized(BaseEstimator):
    """
    Linear regression without an intercept, trained by full-batch gradient
    descent on the mean squared error, with all computations vectorized in NumPy.

    After fit():
        w: np.array (d,) - learned weights
        w_history: list of np.array (d,) - weights held at the start of each step
    """

    def __init__(self, epsilon=1e-4, max_steps=1000, w0=None, alpha=1e-2):
        """
        epsilon: stop once the L2 norm of a weight update drops below this value
        max_steps: maximum number of gradient descent steps
        w0: np.array (d,) - initial weights; a zero vector is used when None
        alpha: learning rate (step size)
        """
        self.epsilon = epsilon
        self.max_steps = max_steps
        self.w0 = w0
        self.alpha = alpha
        self.w = None
        self.w_history = []

    def fit(self, X, y):
        """
        Run gradient descent from w0 until the update norm falls below
        epsilon or max_steps steps have been made.

        X: np.array (l, d)
        y: np.array (l)
        ---
        output: self

        Raises ValueError if w0 is given but its shape is not (d,).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        _, d = X.shape

        # The constructor parameter w0 is deliberately left untouched: writing
        # the zero vector back into it would leak the feature count of this
        # dataset into later fit() calls and into get_params()/clone().
        if self.w0 is None:
            start = np.zeros(d)
        else:
            # Copy so the estimator never shares memory with the caller's array.
            start = np.array(self.w0, dtype=float)
            if start.shape != (d,):
                raise ValueError(
                    'w0 has shape {}, expected ({},)'.format(start.shape, d)
                )

        self.w = start
        # Start a fresh trajectory so refitting does not append to an old one.
        self.w_history = []

        for _ in range(self.max_steps):
            self.w_history.append(self.w)

            w_new = self.w - self.alpha * self.calc_gradient(X, y)

            # On convergence the candidate step is discarded: self.w keeps the
            # last accepted weights, which differ from w_new by less than epsilon.
            if np.linalg.norm(w_new - self.w) < self.epsilon:
                break

            self.w = w_new

        return self

    # Vectorization only touches predict and calc_gradient; the descent loop in
    # fit just calls calc_gradient and does not depend on how it is computed.

    def predict(self, X):
        """
        X: np.array (l, d)
        ---
        output: np.array (l)

        Raises Exception if the model has not been fitted yet.
        """
        if self.w is None:
            raise Exception('Not trained yet')
        # Matrix product of the object-feature matrix X with the weight vector w.
        return np.dot(X, self.w)

    def calc_gradient(self, X, y):
        """
        Gradient of the mean squared error at the current weights:
        (2 / l) * X^T (X w - y).

        X: np.array (l, d)
        y: np.array (l)
        ---
        output: np.array (d)
        """
        l = X.shape[0]
        residuals = np.dot(X, self.w) - y

        return (2 / l) * np.dot(X.T, residuals)

In [None]:
# Synthetic regression problem. 100 features and 10,000 objects make the
# comparison more telling; the step count is set to two so that the example
# runs quickly.
n_features, n_objects = 100, 10000
num_steps = 2

# The draws below share one seeded global stream, so their order is fixed:
# true weights, initial weights, feature matrix, then the target noise.
np.random.seed(1)
w_true = np.random.normal(loc=0, scale=0.1, size=(n_features,))
w_0 = np.random.uniform(low=-2, high=2, size=n_features)

X = np.random.uniform(low=-5, high=5, size=(n_objects, n_features))
# Targets are a linear function of the features plus standard normal noise.
y = X.dot(w_true) + np.random.normal(loc=0, scale=1, size=n_objects)

In [None]:
# Feature matrix: one row per object, one column per feature.
X.shape

(10000, 100)

In [None]:
# Initial weights: one entry per feature.
w_0.shape

(100,)

In [None]:
# Targets: one value per object.
y.shape

(10000,)

In [None]:
# Residuals of the untrained model: predictions made with the initial weights
# w_0 minus the targets.
np.dot(X, w_0)-y

array([  3.52952381, -29.87227682, -88.91821667, ..., -38.84485533,
        -1.83118518,   1.60728795])

In [None]:
def prep(X, y, seed=1, low=-2, high=2):
    """
    Print the dataset dimensions and draw random initial weights.

    X: np.array (l, d)
    y: np.array (l) - not used; kept so that existing calls keep working
    seed: value passed to np.random.seed before drawing; note that this
          reseeds NumPy's *global* generator as a side effect
    low, high: bounds of the uniform distribution the weights are drawn from
    ---
    output: np.array (d) - initial weights, one per feature
    """
    n_features = X.shape[1]
    print(n_features)
    n_objects = X.shape[0]
    print(n_objects)

    # Reseeding makes the returned weights depend only on (seed, low, high, d).
    np.random.seed(seed)
    return np.random.uniform(low, high, n_features)

In [None]:
# Fit on the synthetic data, starting from the random initial weights w_0.
# NOTE(review): num_steps is defined in an earlier cell but is not passed
# here, so the estimator's default max_steps applies — confirm whether
# max_steps=num_steps was intended.
lr_vectorized = LinearRegressionVectorized(w0=w_0, alpha=0.01)
lr_vectorized.fit(X, y)

LinearRegressionVectorized(w0=array([ 1.22842078, -0.44855742,  1.45416742,  0.98848657,  0.22496094,
       -1.4541791 , -1.76032924, -1.51462618, -1.82179249, -1.57002348,
       -1.09716265,  0.85195592,  0.23886793, -1.94977608, -1.71210288,
        1.86910532,  0.27240185, -1.18682706, -0.99069702,  0.97530342,
       -1.21828208,  0.32543571,  1.88007996,  1.38731521, -1.04060896,
       -0.02492114,  0.47982287,  1.3...
        0.22661275,  1.6624254 ,  0.56626484, -0.43996914, -0.05603733,
        0.41724193,  0.19819169,  1.70472571,  1.67493374, -0.42049755,
        1.85305011, -1.30417733, -1.49468192, -1.45968337,  0.02264866,
       -1.91390078,  1.79188084,  1.30846188, -1.93992408, -1.29521498,
       -0.6717457 , -1.47601262,  1.23796277, -0.62105339,  1.76042993,
        0.32805672,  1.51532794,  1.37893778,  1.62156927, -0.16047894,
        0.18538726,  1.19441436, -0.85712459, -0.03898591,  0.39644123]))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.utils import Bunch

# `load_boston` was removed in scikit-learn 1.2, where importing it raises
# ImportError. On such versions, rebuild the same arrays from the original
# source, following the replacement recipe given in scikit-learn's removal
# notice. Older versions keep using the bundled loader.
try:
    from sklearn.datasets import load_boston
except ImportError:
    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    # Each record spans two physical lines in the raw file; the first 22 lines
    # are a textual header.
    raw_df = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    data = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
    )
else:
    data = load_boston()

X_boston = pd.DataFrame(data.data, columns=data.feature_names)
y_boston = data.target

# 70/30 split with a fixed random_state so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(X_boston), y_boston, test_size=0.3, random_state=10
)

In [None]:
# Print the training-set dimensions (features, then objects) and draw the
# initial weights for the Boston experiment.
w_prep = prep(X_train, y_train)

13
354


In [None]:
# Training feature matrix: rows are objects, columns are features.
X_train.shape

(354, 13)

In [None]:
# Initial weights: one entry per training feature.
w_prep.shape

(13,)

In [None]:
# np.dot(X_train, w_prep)-y_train

In [None]:
# Fit on the Boston training split, starting from the weights drawn by prep().
# NOTE(review): the features are passed unscaled and alpha=0.01 is reused from
# the synthetic experiment; if some columns have large magnitudes, gradient
# descent may diverge, and warnings are silenced at the top of the notebook —
# check lr_vectorized.w for inf/nan and consider standardizing the features.
lr_vectorized = LinearRegressionVectorized(w0=w_prep, alpha=0.01)
lr_vectorized.fit(X_train, y_train)

LinearRegressionVectorized(w0=array([-0.33191198,  0.88129797, -1.9995425 , -0.79066971, -1.41297644,
       -1.63064562, -1.25495915, -0.61775709, -0.4129301 ,  0.15526694,
       -0.32322194,  0.740878  , -1.182191  ]))