### Решение задачи регрессии, где в качестве целевой функции выступает графа "доход" (revenue)
Обученная модель должена определить, какой доход (revenue) можно ждать от определенного пользователя.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import mean_squared_error


In [5]:
from category_encoders.binary import BinaryEncoder

In [6]:
# Загрузка урезанной версии выборки
dataset = pd.read_csv('dataset_cut.csv')

In [7]:
# Выбор нужных колонок для массива признаков (features)
# и их кодирование при помощи BinaryEncoder
X = dataset.drop(['revenue'], axis=1)
ce_bin = BinaryEncoder(cols = X.columns)
X = ce_bin.fit_transform(X)

  elif pd.api.types.is_categorical(cols):


In [8]:
# Переменные отклика (target)
y = dataset.revenue

In [9]:
# Разделение данных на обучающие и тестовые
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

#### Random forest

In [134]:
# Определение модели и гиперпараметров 
rf_reg = RandomForestRegressor()
parameters = {'n_estimators': range(20, 80, 10), 
             'max_depth': range(6,13, 2),
            'min_samples_leaf': range(1,7), 
             'min_samples_split': range(2,10, 2)}

In [135]:
# Создание экземпляра RandomizedSearch (/GridSearch)
# search = RandomizedSearchCV(rf_reg, parametrs, n_iter=50, 
#                             cv=3, n_jobs=-1)
search = GridSearchCV(rf_reg, parameters, cv = 3, n_jobs=-1)

In [136]:
# Обучение
search.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'max_depth': range(6, 13, 2),
                         'min_samples_leaf': range(1, 7),
                         'min_samples_split': range(2, 10, 2),
                         'n_estimators': range(20, 80, 10)})

In [137]:
best_tree_rf = search.best_estimator_
best_tree_rf

RandomForestRegressor(max_depth=10, n_estimators=50)

In [138]:
# Предсказанные значения для тестовой выборки
predictions = best_tree_rf.predict(X_test)

In [139]:
# Коэффициент детерминация для обучающей выборки
best_tree_rf.score(X_train, y_train, sample_weight=None)

0.6754214582476459

In [140]:
# Коэффициент детерминация для тестовой выборки
best_tree_rf.score(X_test, y_test, sample_weight=None)

# Результаты обучения можно назвать неудовлетворительными

0.4196878861380692

#### XGBoost

In [131]:
xg_reg = xgb.XGBRegressor()
parameters = {'objective':['reg:squarederror'],
              'learning_rate': [0.03, 0.07, 0.1],
              'reg_alpha': [5, 10],
              'max_depth': [6, 8, 10],
              'subsample': [0.7, 0.8],
              'colsample_bytree': [0.3, 0.5, 0.7],
              'n_estimators': [200, 300]}

In [132]:
search = RandomizedSearchCV(xg_reg, parameters, n_iter=50, cv = 3, n_jobs = -1)

In [124]:
search.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=...
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parame

In [125]:
search.best_score_

0.4258997580725071

In [126]:
best_tree_xgb = search.best_estimator_
best_tree_xgb

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.5, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=300, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=10, reg_lambda=1, scale_pos_weight=1, subsample=0.8,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [127]:
best_tree_xgb.score(X_train, y_train)

0.6432268649828481

In [128]:
best_tree_xgb.score(X_test, y_test)

0.44952332826973207