## Home assignment 05: Bagging and OOB score

Please fill in the missing lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please note that the `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide the [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as a model class, with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [2]:
import numpy as np

In [18]:
class SimplifiedBaggingRegressor:
    '''
    Simplified bagging ensemble for regression with an out-of-bag (OOB) error estimate.

    `num_bags` independent instances of one model class are trained, each on
    its own bootstrap sample (drawn with replacement, same size as the
    training set). Predictions are the plain average over all fitted models.
    When `oob` is truthy the training data is retained so that `OOB_score`
    can evaluate every training object using only the models that never saw it.
    '''

    def __init__(self, num_bags, oob=False):
        '''
        num_bags: number of bootstrap samples (and therefore fitted models).
        oob: if truthy, `fit` keeps the training data so `OOB_score` can be computed.
        '''
        self.num_bags = num_bags
        self.oob = oob
        # Populated by `fit` only when `oob` is truthy; defined here so that
        # calling `OOB_score` too early fails with a clear message.
        self.data = None
        self.target = None

    def _generate_splits(self, data: np.ndarray):
        '''
        Generate indices for every bag and store them in the self.indices_list list.

        Each bag holds `len(data)` indices sampled uniformly with replacement
        from `range(len(data))` using numpy's global random state.
        '''
        self.indices_list = []
        data_length = len(data)
        for _ in range(self.num_bags):
            # One bootstrap sample per bag.
            indices = np.random.choice(data_length, size=data_length, replace=True)
            self.indices_list.append(indices)

    def fit(self, model_constructor, data, target):
        '''
        Fit a fresh model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.

        example:

        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)

        `data` and `target` are indexed with integer index arrays, so they
        must support numpy-style fancy indexing.
        '''
        self.data = None
        self.target = None
        self._generate_splits(data)
        assert len(set(list(map(len, self.indices_list)))) == 1, 'All bags should be of the same length!'
        assert list(map(len, self.indices_list))[0] == len(data), 'All bags should contain `len(data)` number of elements!'
        self.models_list = []
        for indices in self.indices_list:
            model = model_constructor()
            # Store the model object itself rather than the return value of
            # `fit`: not every estimator returns `self` from `fit`.
            model.fit(data[indices], target[indices])
            self.models_list.append(model)
        if self.oob:
            self.data = data
            self.target = target

    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset
        '''
        preds = np.zeros(len(data))  # accumulator for the per-model predictions
        for model in self.models_list:
            preds += model.predict(data)
        # Average over the fitted models (i.e. over the bootstrap samples).
        return preds / len(self.models_list)

    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase

        The result is stored in self.list_of_predictions_lists as a 1-D
        object array of Python lists (ordered by bag number).

        Raises RuntimeError if no training data was retained (estimator not
        fitted yet, or created with a falsy `oob`).
        '''
        if self.data is None:
            raise RuntimeError(
                'OOB predictions are unavailable: construct the regressor with '
                'oob=True and call fit() first.'
            )

        num_objects = len(self.data)
        per_object_predictions = [[] for _ in range(num_objects)]

        for model, indices in zip(self.models_list, self.indices_list):
            # Objects whose index never appears in this bag are out-of-bag for it.
            oob_mask = np.ones(num_objects, dtype=bool)
            oob_mask[indices] = False
            oob_indices = np.flatnonzero(oob_mask)
            if oob_indices.size == 0:
                # Every object landed in this bag; nothing to predict.
                continue

            # One batched call per model instead of one call per (object, model)
            # pair; each object is flattened to a single feature row.
            oob_samples = self.data[oob_indices].reshape(len(oob_indices), -1)
            oob_values = np.asarray(model.predict(oob_samples)).ravel()
            for idx, value in zip(oob_indices, oob_values):
                per_object_predictions[idx].append(float(value))

        # Fill element by element so the container stays a 1-D array of lists
        # even if all lists happen to have the same length.
        predictions_array = np.empty(num_objects, dtype=object)
        for idx, predictions in enumerate(per_object_predictions):
            predictions_array[idx] = predictions
        self.list_of_predictions_lists = predictions_array

    def _get_averaged_oob_predictions(self):
        '''
        Compute average prediction for every object from training set.
        If object has been used in all bags on training phase, its entry is
        NaN (the float-array stand-in for None) instead of a prediction.

        The result is stored in self.oob_predictions.
        '''
        self._get_oob_predictions_from_every_model()

        self.oob_predictions = np.zeros(len(self.data))
        for idx, models_predictions in enumerate(self.list_of_predictions_lists):
            if len(models_predictions) == 0:
                # No model is out-of-bag for this object.
                self.oob_predictions[idx] = np.nan
            else:
                self.oob_predictions[idx] = sum(models_predictions) / len(models_predictions)

    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one prediction
        '''
        self._get_averaged_oob_predictions()

        # nanmean skips the objects that have no out-of-bag prediction.
        return np.nanmean((self.target - self.oob_predictions) ** 2)

### Local tests:

In [16]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

#### Simple tests:

In [19]:
# Simple sanity check: the target is an exact linear function of the features,
# so both the training error and the OOB error must be numerically zero.
for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = np.mean(X, axis=1)

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    train_mse = np.mean((predictions - y)**2)
    assert train_mse < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    print(oob_score)
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'

    # Average share of bags that missed an object; should be about 1/e.
    oob_counts = [len(votes) for votes in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Simple tests done!')

  0%|          | 0/100 [00:00<?, ?it/s]

[-0.08911306 -0.11891895  0.39539637 ... -0.20856778  0.07337657
  0.03792662]
[-0.08911306 -0.11891895  0.39539637 ... -0.20856778  0.07337657
  0.03792662]
8.634468917175324e-32
[ 0.31596502  0.2882396  -0.06551073 ...  0.19701752  0.10376751
 -0.0601928 ]
[ 0.31596502  0.2882396  -0.06551073 ...  0.19701752  0.10376751
 -0.0601928 ]
1.3365533893464382e-31
[ 0.08980588  0.2380314   0.27636937 ... -0.53497091 -0.45994243
 -0.12128394]
[ 0.08980588  0.2380314   0.27636937 ... -0.53497091 -0.45994243
 -0.12128394]
6.334156895753573e-32
[ 0.28223103  0.08092177  0.02576543 ... -0.511407   -0.57170735
  0.08975159]
[ 0.28223103  0.08092177  0.02576543 ... -0.511407   -0.57170735
  0.08975159]
5.845995300568335e-32
[ 0.27048174  0.17643236  0.05880171 ...  0.72350236  0.13683571
 -0.52591282]
[ 0.27048174  0.17643236  0.05880171 ...  0.72350236  0.13683571
 -0.52591282]
8.551767489543528e-32
[ 0.34490328 -0.08543439  0.05178707 ...  0.27622187 -0.5648081
  0.42440605]
[ 0.34490328 -0.08543

#### Medium tests

In [20]:
# Medium check: pure-noise targets with many features, so each model overfits
# and the OOB error has to exceed the error measured on the training set.
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    average_train_error = np.mean((predictions - y)**2)
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'

    # Average share of bags that missed an object; should be about 1/e.
    oob_counts = [len(votes) for votes in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Medium tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

[ 0.06558898 -0.76941301  1.43438251  0.78181388 -0.0495858   1.0402175
 -0.34643944 -0.67033139  0.07005061 -1.48399938 -1.04001294 -0.40106818
 -0.4707801   0.53780815 -1.79323871  0.90972056 -0.56086639  1.26958811
  0.59613544  0.59785134  0.28599739  0.90948384 -2.37168832 -0.62952073
 -1.26056573  2.09042655 -1.14555282 -1.12472153  1.78760378  0.99306183
 -0.86862215 -0.19487057  0.29136143 -0.26079452 -1.93588625 -0.95056303
 -0.9809444  -0.04761481 -1.20905893  0.77540912  1.76701676  0.55383746
 -0.61552318  1.85262815 -2.37313598 -0.99930047  0.25494966  0.06159887
 -0.83061605  1.97911964  0.50369651  0.45710646  0.2980309   1.67689841
 -1.08206596  2.13852457  0.17548176 -1.39886058 -1.12368602 -0.52202333
 -0.67106676  0.55805801 -0.27554179  0.89410207  0.46324794  0.97305788
  1.1347435   0.48025055  0.36223962  1.98858895 -0.05319998  0.19581086
  0.19068866 -0.41417185  0.27926823  1.20719467  0.64581055  0.76444191
 -0.74631504  0.43841215  0.56411437 -1.36390517 -0.

#### Complex tests:

In [21]:
# Complex check: with many bags the observed out-of-bag fraction must match
# the theoretical 1/e much more tightly.
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()

    # Average share of bags that missed an object; should be about 1/e.
    oob_counts = [len(votes) for votes in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'

print('Complex tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

[-0.27640188  0.06140603 -0.75743442 ... -0.61319058  0.64023338
  0.12245124]
[-0.03557136  0.04024068  0.02111612 ... -0.00862527 -0.0292756
 -0.0247829 ]
[-1.42772655  0.39108672 -0.58141939 ... -0.54437566  0.96530442
  0.07452574]
[-0.05076444  0.06948592  0.0980697  ...  0.07881982 -0.11255861
 -0.02193477]
[ 1.02670133  1.85599331  0.09653158 ...  0.37055269 -0.68102293
  0.53036518]
[ 0.01639049 -0.0416654  -0.00144903 ... -0.09637682 -0.041881
  0.00623993]
[ 3.64134342 -0.69426788  0.09315068 ... -0.64109287 -1.35438467
  2.02936657]
[-0.03166886 -0.06344685  0.07625383 ...  0.0118322   0.02914413
 -0.18875871]
[-0.33510198  0.6788898  -0.13203918 ...  1.28238201  0.16377981
  0.55357894]
[ 0.10043498  0.03950353 -0.06157718 ...  0.04114635  0.01302697
  0.01461598]
[-1.18247502  0.35105598 -0.4556901  ...  1.65514464  0.18815579
  1.87671782]
[-0.00025835  0.18138135 -0.05195223 ... -0.05680844  0.1778251
 -0.20590762]
[-0.80419543 -0.7306573  -2.07724169 ...  0.15155642  0.

In [None]:
# Deviation of the observed mean out-of-bag fraction from the theoretical 1/e
# for the most recently fitted regressor.
np.mean(
    [len(votes) for votes in bagging_regressor.list_of_predictions_lists]
) / bagging_regressor.num_bags - 1/np.exp(1)

Great job! Please save `SimplifiedBaggingRegressor` to `bagging.py` and submit your solution to the grading system!