In [1]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from scipy.optimize import minimize_scalar


class RandomForestMSE:
    def __init__(self, n_estimators, max_depth=None, feature_subsample_size=None,
                 **trees_parameters):
        """
        n_estimators : int
            The number of trees in the forest.

        max_depth : int
            The maximum depth of the tree. If None then there is no limits.

        feature_subsample_size : float
            The size of feature set for each tree. If None then use recommendations.
        """

        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.feature_subsample_size = feature_subsample_size
        self.trees_parameters = trees_parameters

    def fit(self, X, y):
        """
        X : numpy ndarray
            Array of size n_objects, n_features

        y : numpy ndarray
            Array of size n_objects
        """

        alpha = 1

        indexes_obj_all = np.arange(X.shape[0])
        self.tree_models = []
        for i in range(self.n_estimators):
            indexes_obj_subset = np.random.choice(indexes_obj_all,
                                                  size=int(alpha * X.shape[0]),
                                                  replace=False)
            dec_tree = None
            dec_tree = DecisionTreeRegressor(*self.trees_parameters,
                                             max_depth=self.max_depth,
                                             max_features=self.feature_subsample_size
                                             )
            dec_tree.fit(X[indexes_obj_subset],
                         y[indexes_obj_subset])
            self.tree_models.append(dec_tree)
            del dec_tree

    def predict(self, X):
        """
        X : numpy ndarray
            Array of size n_objects, n_features

        Returns
        -------
        y : numpy ndarray
            Array of size n_objects
        """

        preds = np.zeros(X.shape[0])
        for i in range(self.n_estimators):
            preds += self.tree_models[i].predict(X) / self.n_estimators
        return preds


class GradientBoostingMSE:
    def __init__(self, n_estimators, learning_rate=0.1, max_depth=5, feature_subsample_size=None,
                 **trees_parameters):
        """
        n_estimators : int
            The number of trees in the forest.

        learning_rate : float
            Use learning_rate * gamma instead of gamma

        max_depth : int
            The maximum depth of the tree. If None then there is no limits.

        feature_subsample_size : float
            The size of feature set for each tree. If None then use recommendations.
        """

        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.feature_subsample_size = feature_subsample_size
        self.trees_parameters = trees_parameters


    def fit(self, X, y):
        """
        X : numpy ndarray
            Array of size n_objects, n_features

        y : numpy ndarray
            Array of size n_objects
        """

        indexes_obj_all = np.arange(X.shape[0])
        alpha = 1
        
        
        self.models_arr = []
        self.alpha_arr = []
        
        predict_sum = np.zeros(X.shape[0])
        

        for i in range(self.n_estimators):
            dec_tree = DecisionTreeRegressor(**self.trees_parameters, max_depth=self.max_depth,
                                             max_features=self.feature_subsample_size)
            indexes_obj_subset = np.random.choice(indexes_obj_all,
                                                  size=int(alpha * X.shape[0]),
                                                  replace=False)
#             self.obj_indexes.append(indexes_obj_subset)
            #optimize model
            
            s_i = 2 * (y - predict_sum)
                
            dec_tree.fit(X, s_i)
            pred_opt = dec_tree.predict(X)
            #optimize model coef
            
            alpha_i = minimize_scalar(lambda alpha_opt: np.mean((- y + predict_sum + alpha_opt * pred_opt) ** 2), 
                                      bounds=(0, 1000),
                                      method='Bounded').x
            
            self.alpha_arr.append(alpha_i * self.learning_rate)
            self.models_arr.append(dec_tree)
            
            predict_sum += self.models_arr[-1].predict(X) * self.alpha_arr[-1]
            


    def predict(self, X):
        """
        X : numpy ndarray
            Array of size n_objects, n_features

        Returns
        -------
        y : numpy ndarray
            Array of size n_objects
        """
        
        pred = 0
        preds_all = []
        for i in range(len(self.models_arr)):
            preds_all.append(self.models_arr[i].predict(X))
            
#             pred += np.sum(np.array(self.alpha_arr)[:i+1, None] * np.array(preds_all), axis=0)
        return np.sum(np.array(self.alpha_arr)[:, None] * np.array(preds_all), axis=0)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
import pandas as pd

train_data = pd.read_csv('./data/data.csv', index_col=0)
target = pd.read_csv('./data/target.csv', index_col=0)

Удалим id, так как он не несет никакую информацию для модели

In [4]:
train_data.drop(columns='id', inplace=True)
train_data.drop(columns='date', inplace=True)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(train_data, target, train_size=0.8,
                                                    random_state=13)

In [6]:
rand_forest_my = RandomForestMSE(n_estimators=100, max_depth=10, feature_subsample_size=0.9)
rand_forest_sklearn = RandomForestRegressor(n_estimators=100, max_depth=10)

In [7]:
from sklearn.metrics import mean_squared_error

In [8]:
%%time
rand_forest_my.fit(X_train.values, y_train.values)

CPU times: user 6.76 s, sys: 66 µs, total: 6.76 s
Wall time: 6.76 s


In [9]:
preds_my = rand_forest_my.predict(X_test.values)

In [10]:
np.sqrt(mean_squared_error(y_test, preds_my))

355158.0190491212

In [11]:
%%time
rand_forest_sklearn.fit(X_train.values, y_train.values)

  """Entry point for launching an IPython kernel.


CPU times: user 4.8 s, sys: 0 ns, total: 4.8 s
Wall time: 4.8 s


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [12]:
preds_sklearn = rand_forest_sklearn.predict(X_test.values)

In [13]:
np.sqrt(mean_squared_error(y_test, preds_sklearn))

348035.33687800204

In [25]:
gb_my = GradientBoostingMSE(n_estimators=100, max_depth=3, feature_subsample_size=None, learning_rate=0.1)
# rand_forest_sklearn = RandomForestRegressor(n_estimators=100, max_depth=10)

In [26]:
%%time

gb_my.fit(X_train.values, y_train.values.reshape(-1))

CPU times: user 28.8 s, sys: 647 ms, total: 29.4 s
Wall time: 2.73 s


In [27]:
gb_my_preds = gb_my.predict(X_test.values)

In [28]:
np.sqrt(mean_squared_error(y_test, gb_my_preds))

348810.2251005695

In [29]:
from sklearn.ensemble import GradientBoostingRegressor

gb_sklearn = GradientBoostingRegressor(criterion='mse',
                                       n_estimators=100, max_depth=3, max_features=None, learning_rate=0.1)

In [30]:
%%time

gb_sklearn.fit(X_train.values, y_train.values.reshape(-1))

CPU times: user 7.6 s, sys: 172 ms, total: 7.77 s
Wall time: 710 ms


GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [31]:
gb_sklearn_preds = gb_sklearn.predict(X_test.values)

In [32]:
np.sqrt(mean_squared_error(y_test, gb_sklearn_preds))

348781.2020636657