In [1]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, f1_score

In [2]:
# Генерируем уникальный seed
my_code = "Михаил Якимов"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [3]:
data1 = pd.read_csv("datasets/Fish.csv")

In [4]:
data1

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [5]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(data1))
print(val_test_size)

32


In [6]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(data1, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

95 32 32


In [7]:
train

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
63,Parkki,90.0,16.3,17.7,19.8,7.4052,2.6730
14,Bream,600.0,29.4,32.0,37.2,14.9544,5.1708
85,Perch,130.0,19.3,21.3,22.8,6.3840,3.5340
5,Bream,450.0,26.8,29.7,34.7,13.6024,4.9274
88,Perch,130.0,20.0,22.0,23.5,6.1100,3.5250
...,...,...,...,...,...,...,...
97,Perch,145.0,22.0,24.0,25.5,6.3750,3.8250
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
124,Perch,1000.0,39.8,43.0,45.2,11.9328,7.2772


In [8]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
num_columns = ['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), num_columns)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['Weight', 'Length1', 'Length2', 'Length3',
                                  'Height', 'Width'])])

In [9]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [10]:
sc_train 

Unnamed: 0,0,1,2,3,4,5,6
0,0.052281,0.149893,0.157371,0.169173,0.335766,0.22911,Parkki
1,0.372372,0.430407,0.442231,0.496241,0.78228,0.581191,Bream
2,0.077387,0.214133,0.229084,0.225564,0.275365,0.350474,Perch
3,0.278228,0.374732,0.396414,0.449248,0.702313,0.546882,Bream
4,0.077387,0.229122,0.243028,0.238722,0.259159,0.349205,Perch
...,...,...,...,...,...,...,...
90,0.086801,0.271949,0.282869,0.276316,0.274833,0.391492,Perch
91,0.003452,0.059957,0.063745,0.056391,0.032448,0.029347,Smelt
92,0.223624,0.364026,0.38247,0.426692,0.650713,0.480365,Bream
93,0.623423,0.653105,0.661355,0.646617,0.603561,0.878101,Perch


In [11]:
# Задание №1 - анализ различных типов ансамблей решений в задаче регрессии
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [12]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [13]:
# Устанавливаем названия столбцов
column_names = num_columns + ['Species']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [14]:
# Явно укажем типы данных, это важно для xgboost
types = {
    'Weight' : 'float64',
    'Length1' : 'float64',
    'Length2' : 'float64',
    'Length3' : 'float64',
    'Height' : 'float64',
    'Width' : 'float64'
}
sc_train = sc_train.astype(types)
sc_test = sc_test.astype(types)
sc_val = sc_val.astype(types)

In [15]:
# Выбираем 4 числовых переменных, три их них будут предикторами, одна - зависимой переменной
n = 4
labels = random.sample(num_columns, n)
y_labels = labels[0]
x_labels = labels[1:]
print(x_labels)
print(y_labels)

['Length3', 'Height', 'Weight']
Length1


In [16]:
# Отберем необходимые параметры
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_labels]
y_test = sc_test[y_labels]
y_val = sc_val[y_labels]

In [17]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, XGBRegressor.
# Решите получившуюся задачу регрессии с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# mean_squared_error -> min 

In [18]:
# Общий список моделей
r_models=[
BaggingRegressor(), 
    RandomForestRegressor(),
    GradientBoostingRegressor(), 
    XGBRegressor()]

In [19]:
r_models=[]

# Регрессор Бэгинга
r_models.append(BaggingRegressor())

# случайный лес
r_models.append(RandomForestRegressor())

# градиентный бустинг
r_models.append(GradientBoostingRegressor())

# eXtrem Gradient Boosting
r_models.append(XGBRegressor())

In [20]:
r_models

[BaggingRegressor(),
 RandomForestRegressor(),
 GradientBoostingRegressor(),
 XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)]

In [21]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [22]:
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

BaggingRegressor()
RandomForestRegressor()
GradientBoostingRegressor()
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)


In [23]:
# Оценииваем качество работы моделей на валидационной выборке.
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

BaggingRegressor() 	 0.0012459477552742228
RandomForestRegressor() 	 0.0012179831828978073
GradientBoostingRegressor() 	 0.0005231486182890757
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 	 0.0005323823896400085


In [24]:
# Выбираем лучшую модель
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

GradientBoostingRegressor()

In [25]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

0.000189456724810135


In [26]:
# Задание №2 - анализ различных типов ансамблей в задаче классификации
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [27]:
n = 2
x_labels = random.sample(num_columns, n)
y_label = ['Species']

print(x_labels)
print(y_label)

['Width', 'Length3']
['Species']


In [28]:
# Отберем необходимые параметры
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_label])
y_test = np.ravel(sc_test[y_label])
y_val = np.ravel(sc_val[y_label])

In [29]:
y_train

array(['Parkki', 'Bream', 'Perch', 'Bream', 'Perch', 'Pike', 'Roach',
       'Perch', 'Perch', 'Bream', 'Perch', 'Perch', 'Parkki', 'Whitefish',
       'Whitefish', 'Perch', 'Bream', 'Smelt', 'Perch', 'Pike', 'Smelt',
       'Parkki', 'Perch', 'Bream', 'Perch', 'Perch', 'Parkki', 'Smelt',
       'Bream', 'Bream', 'Perch', 'Smelt', 'Bream', 'Perch', 'Bream',
       'Pike', 'Perch', 'Pike', 'Roach', 'Bream', 'Roach', 'Whitefish',
       'Bream', 'Roach', 'Parkki', 'Smelt', 'Parkki', 'Bream', 'Parkki',
       'Roach', 'Roach', 'Perch', 'Bream', 'Roach', 'Pike', 'Smelt',
       'Pike', 'Bream', 'Perch', 'Perch', 'Perch', 'Perch', 'Roach',
       'Bream', 'Bream', 'Perch', 'Smelt', 'Bream', 'Bream', 'Bream',
       'Perch', 'Perch', 'Perch', 'Perch', 'Whitefish', 'Perch', 'Smelt',
       'Roach', 'Perch', 'Perch', 'Perch', 'Perch', 'Bream', 'Perch',
       'Bream', 'Pike', 'Bream', 'Perch', 'Smelt', 'Pike', 'Perch',
       'Smelt', 'Bream', 'Perch', 'Roach'], dtype=object)

In [30]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier
# Решите получившуюся задачу классификации с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# f1_score -> max

In [31]:
#Преобразовываем метки классов в целые числа
#https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
le = preprocessing.LabelEncoder()
y_test

array(['Bream', 'Parkki', 'Bream', 'Pike', 'Whitefish', 'Parkki', 'Roach',
       'Perch', 'Bream', 'Roach', 'Bream', 'Roach', 'Perch', 'Perch',
       'Perch', 'Perch', 'Perch', 'Perch', 'Pike', 'Roach', 'Pike',
       'Parkki', 'Perch', 'Roach', 'Perch', 'Pike', 'Smelt', 'Perch',
       'Bream', 'Perch', 'Perch', 'Roach'], dtype=object)

In [32]:
le.fit(y_train)
le.fit(y_val)
le.fit(y_test)

LabelEncoder()

In [33]:
y_train=le.transform(y_train)
y_val=le.transform(y_val)
y_test=le.transform(y_test)

In [34]:
y_train

array([1, 0, 2, 0, 2, 3, 4, 2, 2, 0, 2, 2, 1, 6, 6, 2, 0, 5, 2, 3, 5, 1,
       2, 0, 2, 2, 1, 5, 0, 0, 2, 5, 0, 2, 0, 3, 2, 3, 4, 0, 4, 6, 0, 4,
       1, 5, 1, 0, 1, 4, 4, 2, 0, 4, 3, 5, 3, 0, 2, 2, 2, 2, 4, 0, 0, 2,
       5, 0, 0, 0, 2, 2, 2, 2, 6, 2, 5, 4, 2, 2, 2, 2, 0, 2, 0, 3, 0, 2,
       5, 3, 2, 5, 0, 2, 4])

In [35]:
a_models=[
BaggingClassifier(), 
    RandomForestClassifier(),
    GradientBoostingClassifier(), 
    XGBClassifier()]

In [36]:
a_models=[]

#Регрессор Бэгинга
a_models.append(BaggingClassifier())

#случайный лес
a_models.append(RandomForestClassifier())

#градиентный бустинг
a_models.append(GradientBoostingClassifier())

#eXtrem Gradient Boosting
a_models.append(XGBClassifier(use_label_encoder=False,
                              eval_metric='mlogloss'))


In [37]:
# Обучаем модели
for model in a_models:
    print(model)
    model.fit(x_train, y_train)

BaggingClassifier()
RandomForestClassifier()
GradientBoostingClassifier()
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='mlogloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None)


In [38]:
# Оценииваем качество работы моделей на валидационной выборке.
f1s = []
for model in a_models:
    val_pred = model.predict(x_val)
    f1 = f1_score(y_val, val_pred, average='weighted')
    f1s.append(f1)
    print(model, '\t', f1)

BaggingClassifier() 	 0.6193204365079366
RandomForestClassifier() 	 0.5598095669427191
GradientBoostingClassifier() 	 0.6714409722222223
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...) 	 0.5897452731092437


In [39]:
# Выбираем лучшую модель
i_max = f1s.index(max(f1s))
best_a_model = a_models[i_max]
best_a_model

GradientBoostingClassifier()

In [40]:
y_test

array([0, 1, 0, 3, 6, 1, 4, 2, 0, 4, 0, 4, 2, 2, 2, 2, 2, 2, 3, 4, 3, 1,
       2, 4, 2, 3, 5, 2, 0, 2, 2, 4])

In [41]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_a_model.predict(x_test)
f1 = f1_score(y_test, test_pred, average='weighted')
print(f1)

0.45982142857142855
