In [1]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn import preprocessing
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error, f1_score

In [2]:
# Генерируем уникальный seed
my_code = "Bevz"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [3]:
AD = pd.read_csv("../datasets/Fish.csv")

In [4]:
AD

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [5]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(AD))
print(val_test_size)

32


In [6]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(AD, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

95 32 32


In [7]:
train

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
130,Pike,300.0,32.7,35.0,38.8,5.9364,4.3844
112,Perch,685.0,34.0,36.5,39.0,10.8810,6.8640
117,Perch,650.0,36.5,39.0,41.4,11.1366,6.0030
127,Perch,1000.0,41.1,44.0,46.6,12.4888,7.5958
141,Pike,1250.0,52.0,56.0,59.7,10.6863,6.9849
...,...,...,...,...,...,...,...
9,Bream,500.0,28.5,30.7,36.2,14.2266,4.9594
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
6,Bream,500.0,26.8,29.7,34.5,14.1795,5.2785
21,Bream,685.0,31.4,34.0,39.2,15.9936,5.3704


In [8]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
num_columns = ['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), num_columns)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['Weight', 'Length1', 'Length2', 'Length3',
                                  'Height', 'Width'])])

In [9]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [10]:
sc_train 

Unnamed: 0,0,1,2,3,4,5,6
0,0.178082,0.463265,0.463138,0.48227,0.233406,0.461816,Pike
1,0.412481,0.489796,0.491493,0.485816,0.524522,0.816958,Perch
2,0.391172,0.540816,0.538752,0.528369,0.53957,0.693641,Perch
3,0.604262,0.634694,0.63327,0.620567,0.619182,0.92177,Perch
4,0.756469,0.857143,0.860113,0.852837,0.513059,0.834274,Pike
...,...,...,...,...,...,...,...
90,0.299848,0.377551,0.381853,0.43617,0.721495,0.544171,Bream
91,0.202435,0.283673,0.302457,0.345745,0.612646,0.506459,Bream
92,0.299848,0.342857,0.362949,0.406028,0.718722,0.589874,Bream
93,0.412481,0.436735,0.444234,0.489362,0.825528,0.603036,Bream


In [11]:
# Задание №1 - анализ различных типов ансамблей решений в задаче регрессии
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [12]:
# Устанавливаем названия столбцов
column_names = num_columns + ['Species']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [13]:
# Явно укажем типы данных, это важно для xgboost
types = {
    'Weight' : 'float64',
    'Length1' : 'float64',
    'Length2' : 'float64',
    'Length3' : 'float64',
    'Height' : 'float64',
    'Width' : 'float64',
    'Species' : 'category'
}
sc_train = sc_train.astype(types)
sc_test = sc_test.astype(types)
sc_val = sc_val.astype(types)

In [14]:
# Выбираем 4 числовых переменных, три из них будут предикторами, одна - зависимой переменной
n = 4
labels = random.sample(num_columns, n)

y_labels = labels[0]
x_labels = labels[1:]

print(x_labels)
print(y_labels)

['Length2', 'Length3', 'Weight']
Length1


In [15]:
# Отбор необходимых параметров
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_labels]
y_test = sc_test[y_labels]
y_val = sc_val[y_labels]

In [16]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, XGBRegressor.
# Решите получившуюся задачу регрессии с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# mean_squared_error -> min 

In [17]:
# Общий список моделей
r_models=[BaggingRegressor(), RandomForestRegressor(),GradientBoostingRegressor(), XGBRegressor()]

In [18]:
r_models=[]

# BaggingRegressor
r_models.append(BaggingRegressor())
# RandomForestRegressor
r_models.append(RandomForestRegressor())
# GradientBoostingRegressor
r_models.append(GradientBoostingRegressor())
# XGBRegressor
r_models.append(XGBRegressor())

In [19]:
r_models

[BaggingRegressor(),
 RandomForestRegressor(),
 GradientBoostingRegressor(),
 XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)]

In [20]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [21]:
y_train

array([0.46326531, 0.48979592, 0.54081633, 0.63469388, 0.85714286,
       0.39795918, 0.03061224, 0.27755102, 0.55306122, 0.37755102,
       0.2755102 , 0.13265306, 0.07755102, 0.51020408, 1.        ,
       0.31836735, 0.21428571, 0.44489796, 0.41632653, 0.06530612,
       0.15918367, 0.15306122, 0.28571429, 0.21428571, 0.46326531,
       0.35918367, 0.5       , 0.44693878, 0.39591837, 0.18979592,
       0.42653061, 0.28571429, 0.60816327, 0.67755102, 0.1755102 ,
       0.2244898 , 0.04285714, 0.24489796, 0.15306122, 0.22857143,
       0.55102041, 0.18367347, 0.55102041, 0.18367347, 0.05918367,
       0.3244898 , 0.12653061, 0.30612245, 0.53469388, 0.21836735,
       0.93877551, 0.26938776, 0.16734694, 0.36326531, 0.35918367,
       0.6122449 , 0.47959184, 0.43877551, 0.24489796, 0.01428571,
       0.39591837, 0.21428571, 0.50204082, 0.2       , 0.54897959,
       0.24489796, 0.17142857, 0.19183673, 0.2877551 , 0.28571429,
       0.24489796, 0.50612245, 0.2244898 , 0.71020408, 0.00816

In [22]:
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

BaggingRegressor()
RandomForestRegressor()
GradientBoostingRegressor()
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)


In [23]:
# Оценииваем качество работы моделей на валидационной выборке.
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

BaggingRegressor() 	 0.00015013926488962974
RandomForestRegressor() 	 0.00011169119637651195
GradientBoostingRegressor() 	 0.00010225075484903416
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 	 0.00011617226624558561


In [24]:
# Выбираем лучшую модель
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

GradientBoostingRegressor()

In [25]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

0.0002966800806848336


In [26]:
# Задание №2 - анализ различных типов ансамблей в задаче классификации
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [27]:
n = 2
x_labels = random.sample(num_columns, n)
y_label = 'Species'

print(x_labels)
print(y_label)

['Weight', 'Length3']
Species


In [28]:
# Отберем необходимые параметры
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [29]:
y_train

0      Pike
1     Perch
2     Perch
3     Perch
4      Pike
      ...  
90    Bream
91    Bream
92    Bream
93    Bream
94    Perch
Name: Species, Length: 95, dtype: category
Categories (7, object): ['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish']

In [30]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier
# Решите получившуюся задачу классификации с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# f1_score -> max

In [31]:
# Преобразовываем метки классов в целые числа
le = preprocessing.LabelEncoder()
y_test


0     Whitefish
1         Smelt
2        Parkki
3         Perch
4         Perch
5         Bream
6         Perch
7         Perch
8         Perch
9          Pike
10        Perch
11        Bream
12        Bream
13        Perch
14        Bream
15        Smelt
16        Bream
17        Perch
18        Roach
19        Roach
20        Perch
21        Smelt
22         Pike
23    Whitefish
24        Perch
25        Bream
26        Perch
27        Roach
28       Parkki
29        Smelt
30        Smelt
31         Pike
Name: Species, dtype: category
Categories (7, object): ['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish']

In [32]:
le.fit(y_train)
le.fit(y_val)
le.fit(y_test)

LabelEncoder()

In [33]:
y_train=le.transform(y_train)
y_val=le.transform(y_val)
y_test=le.transform(y_test)

In [34]:
y_train

array([3, 2, 2, 2, 3, 4, 5, 4, 2, 6, 2, 4, 5, 0, 3, 6, 2, 0, 0, 5, 2, 1,
       0, 2, 0, 0, 2, 0, 0, 2, 0, 4, 2, 3, 4, 2, 5, 2, 4, 1, 2, 2, 2, 2,
       4, 2, 2, 4, 0, 2, 3, 0, 4, 2, 0, 3, 0, 0, 4, 5, 0, 4, 2, 1, 2, 2,
       1, 4, 6, 1, 4, 3, 4, 3, 5, 2, 6, 2, 2, 0, 3, 2, 0, 2, 1, 5, 4, 2,
       0, 0, 0, 0, 0, 0, 2])

In [35]:
a_models=[BaggingClassifier(),     
          RandomForestClassifier(),    
          GradientBoostingClassifier(),    
          XGBClassifier()]

In [36]:
a_models=[]

# Bagging
a_models.append(BaggingClassifier())

# RandomForest
a_models.append(RandomForestClassifier())

# GradientBoosting
a_models.append(GradientBoostingClassifier())

# XGB
a_models.append(XGBClassifier(use_label_encoder=False,
                              eval_metric='mlogloss'))


In [37]:
# Обучение моделей
for model in a_models:
    print(model)
    model.fit(x_train, y_train)

BaggingClassifier()
RandomForestClassifier()
GradientBoostingClassifier()
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='mlogloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None)


In [38]:
# Оцениваем качество работы моделей на валидационной выборке
f1s = []
for model in a_models:
    val_pred = model.predict(x_val)
    f1 = f1_score(y_val, val_pred, average='weighted')
    f1s.append(f1)
    print(model, '\t', f1)

BaggingClassifier() 	 0.459258207070707
RandomForestClassifier() 	 0.514371565934066
GradientBoostingClassifier() 	 0.5756623931623932
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...) 	 0.4822443181818182


In [39]:
# Выбираем лучшую модель
i_max = f1s.index(max(f1s))
best_a_model = a_models[i_max]
best_a_model

GradientBoostingClassifier()

In [40]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_a_model.predict(x_test)
f1 = f1_score(y_test, test_pred, average='weighted')
print(f1)

0.5095561594202899


In [41]:
# Выполнено по анологии, как выше