In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_absolute_error, mean_squared_error, r2_score, recall_score

Загрузка датасета для классификации

In [2]:
df_heart = pd.read_csv('heart.csv')
X_cls = df_heart.drop('target', axis=1)
y_cls = df_heart['target']

Разделение на тестовые и тренировочные

In [3]:
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42, stratify=y_cls
)

Загрузка датасета для регрессии

In [4]:
df_energy = pd.read_csv('energy_efficiency_data.csv')
X_reg = df_energy.drop(['Heating_Load', 'Cooling_Load'], axis=1)
y_reg = df_energy['Heating_Load']

Определение признаков

In [5]:
num_features = ['Relative_Compactness', 'Surface_Area', 'Wall_Area', 'Roof_Area', 'Overall_Height']
cat_features = ['Orientation', 'Glazing_Area', 'Glazing_Area_Distribution']

Разделение на тестовые и тренировочные

In [6]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)

Дерево классификатор

In [7]:
class MyDecisionTreeClassifier:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree_ = None

    def _gini(self, y):
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1.0 - np.sum(probs ** 2)

    def _best_split(self, X, y):
        n_samples, n_features = X.shape
        best_gain = -1
        best_feature, best_threshold = None, None
        parent_gini = self._gini(y)

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for t in thresholds:
                left_mask = X[:, feature] <= t
                right_mask = ~left_mask

                if np.sum(left_mask) < self.min_samples_split or np.sum(right_mask) < self.min_samples_split:
                    continue

                gini_left = self._gini(y[left_mask])
                gini_right = self._gini(y[right_mask])
                weighted_gini = (np.sum(left_mask) * gini_left + np.sum(right_mask) * gini_right) / n_samples
                gain = parent_gini - weighted_gini

                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = t

        return best_feature, best_threshold

    def _grow_tree(self, X, y, depth):
        if len(np.unique(y)) == 1 or (self.max_depth is not None and depth >= self.max_depth):
            return np.bincount(y).argmax()

        feature, threshold = self._best_split(X, y)
        if feature is None:
            return np.bincount(y).argmax()

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left_subtree = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'feature': feature,
            'threshold': threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def fit(self, X, y):
        X, y = np.array(X), np.array(y)
        self.tree_ = self._grow_tree(X, y, depth=0)
        return self

    def _predict_single(self, x, node):
        if not isinstance(node, dict):
            return node
        if x[node['feature']] <= node['threshold']:
            return self._predict_single(x, node['left'])
        else:
            return self._predict_single(x, node['right'])

    def predict(self, X):
        return np.array([self._predict_single(x, self.tree_) for x in np.array(X)])

Дерево регрессор

In [8]:
class MyDecisionTreeRegressor:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree_ = None

    def _mse(self, y):
        return np.mean((y - np.mean(y)) ** 2)

    def _best_split(self, X, y):
        n_samples, n_features = X.shape
        best_mse = np.inf
        best_feature, best_threshold = None, None

        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for t in thresholds:
                left_mask = X[:, feature] <= t
                right_mask = ~left_mask

                if np.sum(left_mask) < self.min_samples_split or np.sum(right_mask) < self.min_samples_split:
                    continue

                mse_left = self._mse(y[left_mask])
                mse_right = self._mse(y[right_mask])
                weighted_mse = (np.sum(left_mask) * mse_left + np.sum(right_mask) * mse_right) / n_samples

                if weighted_mse < best_mse:
                    best_mse = weighted_mse
                    best_feature = feature
                    best_threshold = t

        return best_feature, best_threshold

    def _grow_tree(self, X, y, depth):
        if (self.max_depth is not None and depth >= self.max_depth) or len(y) < self.min_samples_split:
            return np.mean(y)

        feature, threshold = self._best_split(X, y)
        if feature is None:
            return np.mean(y)

        left_mask = X[:, feature] <= threshold
        right_mask = ~left_mask

        left_subtree = self._grow_tree(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self._grow_tree(X[right_mask], y[right_mask], depth + 1)

        return {
            'feature': feature,
            'threshold': threshold,
            'left': left_subtree,
            'right': right_subtree
        }

    def fit(self, X, y):
        X, y = np.array(X), np.array(y)
        self.tree_ = self._grow_tree(X, y, depth=0)
        return self

    def _predict_single(self, x, node):
        if not isinstance(node, dict):
            return node
        if x[node['feature']] <= node['threshold']:
            return self._predict_single(x, node['left'])
        else:
            return self._predict_single(x, node['right'])

    def predict(self, X):
        return np.array([self._predict_single(x, self.tree_) for x in np.array(X)])

Классификатор случайный лес

In [9]:
class MyRandomForestClassifier:
    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def fit(self, X, y):
        self.trees = []
        X, y = np.array(X), np.array(y)
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree = MyDecisionTreeClassifier(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
        return self

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        y_pred = []
        for i in range(X.shape[0]):
            counts = Counter(tree_preds[:, i])
            y_pred.append(counts.most_common(1)[0][0])
        return np.array(y_pred)

Регрессор случайный лес

In [10]:
class MyRandomForestRegressor:
    def __init__(self, n_estimators=10, max_depth=None, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def _bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, n_samples, replace=True)
        return X[idxs], y[idxs]

    def fit(self, X, y):
        self.trees = []
        X, y = np.array(X), np.array(y)
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._bootstrap_sample(X, y)
            tree = MyDecisionTreeRegressor(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)
        return self

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        return np.mean(tree_preds, axis=0)

Метрики для классификации

In [11]:
def evaluate_classification(y_true, y_pred):
    return {
        'Accuracy': accuracy_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
        'ROC-AUC': roc_auc_score(y_true, y_pred),
    }

Метрики для регрессии

In [13]:
def evaluate_regression(y_true, y_pred):
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score(y_true, y_pred)
    }

Запуск базовой модели skrean (классификация)

In [14]:
sk_rf_cls_base = RandomForestClassifier(n_estimators=10, max_depth=3, random_state=42)
sk_rf_cls_base.fit(X_train_cls, y_train_cls)
y_pred_sk_cls = sk_rf_cls_base.predict(X_test_cls)

Запуск кастомной базовой модели (классификация)

In [15]:
my_rf_cls_base = MyRandomForestClassifier(n_estimators=10, max_depth=3)
my_rf_cls_base.fit(X_train_cls.values, y_train_cls.values)
y_pred_my_cls = my_rf_cls_base.predict(X_test_cls.values)

Вывод метрик классификации

In [16]:
print("Sklearn:", evaluate_classification(y_test_cls, y_pred_sk_cls))
print("Custom :", evaluate_classification(y_test_cls, y_pred_my_cls))

Sklearn: {'Accuracy': 0.8585365853658536, 'F1': 0.8699551569506726, 'ROC-AUC': 0.856904761904762}
Custom : {'Accuracy': 0.8341463414634146, 'F1': 0.8454545454545455, 'ROC-AUC': 0.8328571428571429}


Запуск базовой модели sklearn (регрессия)

In [17]:
sk_rf_reg_base = RandomForestRegressor(n_estimators=10, max_depth=4, random_state=42)
sk_rf_reg_base.fit(X_train_reg, y_train_reg)
y_pred_sk_reg = sk_rf_reg_base.predict(X_test_reg)

Запуск кастомной базовой модели (регрессия)

In [18]:
my_rf_reg_base = MyRandomForestRegressor(n_estimators=10, max_depth=4)
my_rf_reg_base.fit(X_train_reg, y_train_reg)
y_pred_my_reg = my_rf_reg_base.predict(X_test_reg)

Вывод метрик регрессии

In [19]:
print("Sklearn:", evaluate_regression(y_test_reg, y_pred_sk_reg))
print("Custom :", evaluate_regression(y_test_reg, y_pred_my_reg))

Sklearn: {'MAE': 1.2419500138898751, 'RMSE': np.float64(1.6775895132761822), 'R2': 0.9729994749031605}
Custom : {'MAE': 1.253704843957523, 'RMSE': np.float64(1.7184773063268086), 'R2': 0.9716672710010614}


Применим улучшения

Подбор гиперпараметров

In [21]:
param_grid_cls = {
    'n_estimators': [10, 20],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5]
}

grid_cls = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_cls,
    cv=5,
    scoring='f1'
)
grid_cls.fit(X_train_cls, y_train_cls)

best_params = grid_cls.best_params_

Запуск улучшенной модели sklearn (классификация)

In [22]:
best_sk_cls = grid_cls.best_estimator_
y_pred_sk_cls_opt = best_sk_cls.predict(X_test_cls)

Запуск кастомной улучшенной модели (классификация)

In [23]:
my_rf_cls_opt = MyRandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split']
)
my_rf_cls_opt.fit(X_train_cls.values, y_train_cls.values)
y_pred_my_cls_opt = my_rf_cls_opt.predict(X_test_cls.values)

Вывод метрик классификации

In [24]:
print("Sklearn:", evaluate_classification(y_test_cls, y_pred_sk_cls_opt))
print("Custom :", evaluate_classification(y_test_cls, y_pred_my_cls_opt))

Sklearn: {'Accuracy': 0.9902439024390244, 'F1': 0.9905660377358491, 'ROC-AUC': 0.99}
Custom : {'Accuracy': 0.975609756097561, 'F1': 0.9765258215962441, 'ROC-AUC': 0.9752380952380952}


Подбор гиперпараметров

In [27]:
param_grid_reg = {
    'n_estimators': [10, 20],
    'max_depth': [5, 8, None],
    'min_samples_split': [2, 5],
    'max_features': ['sqrt', 'log2']
}

grid_reg = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid_reg,
    cv=3,
    scoring='r2'
)
grid_reg.fit(X_train_reg, y_train_reg)

best_params_r = grid_cls.best_params_

Запуск улучшенной модели sklearn (регрессия)

In [26]:
best_sk_reg = grid_reg.best_estimator_
y_pred_sk_reg_opt = best_sk_reg.predict(X_test_reg)

Запуск кастомной улучшенной модели (регрессия)

In [28]:
my_rf_reg_opt = MyRandomForestRegressor(
    n_estimators=best_params_r['n_estimators'],
    max_depth=best_params_r['max_depth'] if best_params_r['max_depth'] else 10,
    min_samples_split=best_params_r['min_samples_split']
)
my_rf_reg_opt.fit(X_train_reg, y_train_reg)
y_pred_my_reg_opt = my_rf_reg_opt.predict(X_test_reg)

Вывод метрик регрессии

In [29]:
print("Sklearn:", evaluate_regression(y_test_reg, y_pred_sk_reg_opt))
print("Custom :", evaluate_regression(y_test_reg, y_pred_my_reg_opt))

Sklearn: {'MAE': 0.4634192359116387, 'RMSE': np.float64(0.6070182848014892), 'R2': 0.9964648784466134}
Custom : {'MAE': 0.3604511313299315, 'RMSE': np.float64(0.49057626182523106), 'R2': 0.9976910535319575}
