# Лабораторная работа №4

## Выполнил Попов Матвей, М8О-408Б-20


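Будем решать задачу **регрессии**: реализуем дерево решений, случайный лес и градиентный бустинг, а затем сравним их качество по метрике MSE.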

### Дерево решений

In [25]:
import numpy as np

class DecisionTreeRegressor:
    """Regression tree grown by greedy, variance-minimising binary splits.

    A fitted tree is stored in ``self.tree`` as nested tuples
    ``(feature_index, threshold, left_subtree, right_subtree)``.  A leaf is a
    plain scalar: the mean target of the training samples that reached it.

    Parameters
    ----------
    max_depth : int or None
        Maximum number of splits on any root-to-leaf path.  ``None`` lets the
        tree grow until every leaf is pure or cannot be split further.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Grow the tree on ``X`` (2-D, samples x features) and targets ``y``.

        Inputs may be NumPy arrays or nested sequences.  Targets are converted
        to float64 so that leaf values have one predictable type.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=np.float64)
        self.tree = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``(X, y)``.

        Returns a scalar leaf when the depth limit is reached, when all
        targets are equal, or when no valid split exists; otherwise returns a
        ``(feature_index, threshold, left, right)`` tuple.
        """
        if depth == self.max_depth or len(np.unique(y)) == 1:
            return np.mean(y)

        feature_index, threshold = self._find_best_split(X, y)
        if feature_index is None:
            return np.mean(y)

        left_mask = X[:, feature_index] <= threshold
        right_mask = ~left_mask

        left_subtree = self._build_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._build_tree(X[right_mask], y[right_mask], depth + 1)

        return (feature_index, threshold, left_subtree, right_subtree)

    def _find_best_split(self, X, y):
        """Return ``(feature_index, threshold)`` of the best split, or ``(None, None)``.

        Every distinct value of every feature is tried as a threshold
        (samples with ``value <= threshold`` go left).  The score of a split
        is the size-weighted mean of the target variances of the two sides;
        the first split with the strictly lowest score wins.
        """
        m, n = X.shape
        if m <= 1:
            return None, None

        best_feature_index, best_threshold = None, None
        best_score = float('inf')

        for feature_index in range(n):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                n_left = np.count_nonzero(left_mask)
                n_right = m - n_left

                # A split that leaves one side empty separates nothing.
                if n_left == 0 or n_right == 0:
                    continue

                left_score = np.var(y[left_mask])
                right_score = np.var(y[~left_mask])
                weighted_score = (n_left * left_score + n_right * right_score) / m

                if weighted_score < best_score:
                    best_score = weighted_score
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def predict_instance(self, x, tree):
        """Route a single sample ``x`` through ``tree`` and return the leaf value."""
        # Internal nodes are always tuples; anything else is a leaf value.
        # Testing for the node type (rather than for specific float types)
        # keeps prediction working whatever scalar type the leaf holds.
        if not isinstance(tree, tuple):
            return tree
        feature_index, threshold, left_subtree, right_subtree = tree
        if x[feature_index] <= threshold:
            return self.predict_instance(x, left_subtree)
        return self.predict_instance(x, right_subtree)

    def predict(self, X):
        """Return a 1-D array with one prediction per row of ``X``."""
        X = np.asarray(X)
        return np.array([self.predict_instance(x, self.tree) for x in X])


### Случайный лес

In [31]:
class RandomForestRegressor:
    """Bagging ensemble of ``DecisionTreeRegressor`` trees.

    Each tree is trained on a bootstrap sample (drawn with replacement, same
    size as the training set); predictions are the mean over all trees.

    Parameters
    ----------
    n_estimators : int
        Number of trees to train.
    max_depth : int or None
        Depth limit passed to every tree.
    max_features : any
        Stored for interface compatibility only; ``fit`` does not use it, so
        every tree considers all features at every split.
    random_state : int or None
        Seed for the bootstrap sampling.
    """

    def __init__(self, n_estimators=100, max_depth=None, max_features='auto', random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.max_features = max_features
        self.random_state = random_state
        self.estimators = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Calling ``fit`` again replaces the previously trained trees instead
        of appending to them.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # A private legacy generator yields the same bootstrap indices as
        # seeding the global one, without clobbering process-wide RNG state.
        rng = np.random.RandomState(self.random_state)
        n_samples = len(X)

        estimators = []
        for _ in range(self.n_estimators):
            indices = rng.choice(n_samples, size=n_samples, replace=True)
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X[indices], y[indices])
            estimators.append(tree)

        # Assign only once training succeeded, so a refit never mixes old and new trees.
        self.estimators = estimators

    def predict(self, X):
        """Return the per-sample mean of the individual tree predictions."""
        predictions = np.array([tree.predict(X) for tree in self.estimators])
        return np.mean(predictions, axis=0)
    

### Градиентный бустинг

In [32]:
class GradientBoostingRegressor:
    """Gradient boosting for squared error built from ``DecisionTreeRegressor`` trees.

    The model starts from a zero prediction.  Each stage fits a tree to the
    current residuals ``y - prediction`` and adds its output scaled by the
    learning rate.

    Parameters
    ----------
    n_estimators : int
        Number of boosting stages.
    learning_rate : float
        Shrinkage applied to every tree's contribution.
    max_depth : int or None
        Depth limit passed to every tree.
    random_state : int or None
        Stored for interface compatibility; training involves no randomness.
    """

    def __init__(self, n_estimators=100, learning_rate=0.1, max_depth=3, random_state=None):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.random_state = random_state
        self.estimators = []
        self.weights = []

    def fit(self, X, y):
        """Fit the boosting stages sequentially on the residuals.

        Calling ``fit`` again replaces the previously trained stages instead
        of appending to them.  The global NumPy random state is not touched.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=np.float64)
        predictions = np.zeros_like(y, dtype=np.float64)

        estimators = []
        weights = []
        for _ in range(self.n_estimators):
            residuals = y - predictions
            tree = DecisionTreeRegressor(max_depth=self.max_depth)
            tree.fit(X, residuals)
            predictions += self.learning_rate * tree.predict(X)
            estimators.append(tree)
            weights.append(self.learning_rate)

        self.estimators = estimators
        self.weights = weights

    def predict(self, X):
        """Return the weighted sum of all stage predictions for each row of ``X``."""
        X = np.asarray(X)
        if not self.estimators:
            # No stages: the model is still the initial all-zero prediction.
            return np.zeros(len(X), dtype=np.float64)
        # Use the weight recorded when each tree was trained, so changing
        # ``learning_rate`` after fitting cannot rescale an existing model.
        staged = [weight * tree.predict(X) for weight, tree in zip(self.weights, self.estimators)]
        return np.sum(staged, axis=0)


### Результаты

Сгенерируем синтетический датасет (линейная зависимость $y = 2x + 1$ с гауссовским шумом) и разделим его на обучающую и тестовую выборки в соотношении 80/20.

In [33]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Fix the global NumPy seed so the synthetic dataset is reproducible.
np.random.seed(42)
# 100 samples with a single feature, shape (100, 1).
X = np.random.rand(100, 1)
# Linear target y = 2x + 1 plus noise: standard-normal draws scaled by 0.1.
y = 2 * X.squeeze() + 1 + 0.1 * np.random.randn(100)

# Hold out 20% of the samples for testing; random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2904)


#### Дерево решений

In [34]:
# Train a single regression tree (depth limit 5) and measure its test-set MSE.
tree_regressor = DecisionTreeRegressor(max_depth=5)
tree_regressor.fit(X_train, y_train)
y_pred = tree_regressor.predict(X_test)
decision_tree_mse = mean_squared_error(y_test, y_pred)


#### Случайный лес

In [36]:
# Train a forest of 100 depth-5 trees with a fixed seed and measure its test-set MSE.
random_forest_regressor = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
random_forest_regressor.fit(X_train, y_train)
y_pred = random_forest_regressor.predict(X_test)
random_forest_mse = mean_squared_error(y_test, y_pred)


#### Градиентный бустинг

In [37]:
# Train 100 boosting stages of depth-3 trees (learning rate 0.1) and measure test-set MSE.
gradient_boosting_regressor = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gradient_boosting_regressor.fit(X_train, y_train)
y_pred = gradient_boosting_regressor.predict(X_test)
gradient_boosting_mse = mean_squared_error(y_test, y_pred)


Сравним результаты

In [38]:
# Report the test-set mean squared error of each model side by side.
print('Decision Tree MSE:', decision_tree_mse)
print('Random Forest MSE:', random_forest_mse)
print('Gradient Boosting MSE:', gradient_boosting_mse)


Decision Tree MSE: 0.012674190592588166
Random Forest MSE: 0.010240056232649402
Gradient Boosting MSE: 0.011360456410326491


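Как видим, все три алгоритма примерно одинаково справляются с задачей регрессии: MSE на тестовой выборке лежит в пределах 0.010–0.013, а наименьшая ошибка получена у случайного леса.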