In [1]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn import preprocessing
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error, f1_score

In [2]:
# Генерируем уникальный seed
my_code = "Saskovets"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [3]:
# Читаем данные из файла
data = pd.read_csv("datasets/Fish.csv")

In [4]:
data

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


In [5]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(data))
print(val_test_size)

32


In [6]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

95 32 32


In [7]:
train

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
42,Roach,120.0,19.4,21.0,23.7,6.1146,3.2943
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
139,Pike,770.0,44.8,48.0,51.2,7.6800,5.3760
90,Perch,110.0,20.0,22.0,23.5,5.5225,3.9950
48,Roach,169.0,22.0,24.0,27.2,7.5344,3.8352
...,...,...,...,...,...,...,...
141,Pike,1250.0,52.0,56.0,59.7,10.6863,6.9849
72,Perch,5.9,7.5,8.4,8.8,2.1120,1.4080
59,Whitefish,800.0,33.7,36.4,39.6,11.7612,6.5736
31,Bream,955.0,35.0,38.5,44.0,18.0840,6.2920


In [8]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
num_columns = ['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), num_columns)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['Weight', 'Length1', 'Length2', 'Length3',
                                  'Height', 'Width'])])

In [9]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [10]:
sc_train

Unnamed: 0,0,1,2,3,4,5,6
0,0.075,0.245361,0.244186,0.269928,0.254588,0.332938,Roach
1,0.226875,0.387629,0.399225,0.447464,0.638566,0.505016,Bream
2,0.48125,0.769072,0.767442,0.768116,0.345449,0.641425,Pike
3,0.06875,0.257732,0.263566,0.266304,0.220221,0.436775,Perch
4,0.105625,0.298969,0.302326,0.333333,0.336998,0.413094,Roach
...,...,...,...,...,...,...,...
90,0.78125,0.917526,0.922481,0.922101,0.519944,0.879848,Pike
91,0.003688,0.0,0.0,0.0,0.022265,0.053408,Perch
92,0.5,0.540206,0.542636,0.557971,0.582334,0.818897,Whitefish
93,0.596875,0.56701,0.583333,0.637681,0.949328,0.777167,Bream


In [12]:
# Устанавливаем названия столбцов
column_names = num_columns + ['Species']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [13]:
# Явно укажем типы данных, это важно для xgboost
types = {
    'Weight' : 'float64',
    'Length1' : 'float64',
    'Length2' : 'float64',
    'Length3' : 'float64',
    'Height' : 'float64',
    'Width' : 'float64',
    'Species' : 'category'
}
sc_train = sc_train.astype(types)
sc_test = sc_test.astype(types)
sc_val = sc_val.astype(types)

In [11]:
# Задание №1 - анализ различных типов ансамблей решений в задаче регрессии
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [14]:
# Выбираем 4 числовых переменных, три их них будут предикторами, одна - зависимой переменной
n = 4
labels = random.sample(num_columns, n)

y_label = labels[0]
x_labels = labels[1:]

print(x_labels)
print(y_label)

['Length1', 'Length2', 'Width']
Length3


In [15]:
# Отберем необходимые параметры
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [16]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, XGBRegressor.
# Решите получившуюся задачу регрессии с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# mean_squared_error -> min 

In [17]:
# Общий список моделей
r_models=[BaggingRegressor(), RandomForestRegressor(),GradientBoostingRegressor(), XGBRegressor()]

In [18]:
r_models=[]

#Регрессор Бэгинга
r_models.append(BaggingRegressor())

#случайный лес
r_models.append(RandomForestRegressor())

#градиентный бустинг
r_models.append(GradientBoostingRegressor())

#XGBregressor
r_models.append(XGBRegressor())

In [19]:
r_models

[BaggingRegressor(),
 RandomForestRegressor(),
 GradientBoostingRegressor(),
 XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)]

In [20]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_label])
y_test = np.ravel(sc_test[y_label])
y_val = np.ravel(sc_val[y_label])

In [21]:
y_train

array([0.26992754, 0.44746377, 0.76811594, 0.26630435, 0.33333333,
       0.17572464, 0.13405797, 0.62862319, 0.71014493, 0.65942029,
       0.30253623, 0.45652174, 0.34782609, 0.55253623, 0.22644928,
       0.57608696, 0.47101449, 0.66485507, 0.5       , 0.58152174,
       0.03623188, 0.29891304, 0.54347826, 0.49637681, 0.41304348,
       0.19927536, 0.29347826, 0.72282609, 0.27717391, 0.5942029 ,
       0.2173913 , 0.68478261, 0.25362319, 0.07789855, 0.24818841,
       0.67210145, 0.52536232, 0.51630435, 0.22101449, 0.83876812,
       0.28442029, 0.4057971 , 0.30797101, 0.07971014, 0.66485507,
       0.5923913 , 0.13043478, 0.58333333, 0.61050725, 0.2807971 ,
       0.38586957, 0.4692029 , 0.26086957, 0.05072464, 0.61050725,
       0.05072464, 0.25362319, 0.32065217, 1.        , 0.57427536,
       0.24818841, 0.39855072, 0.20652174, 0.60688406, 0.1884058 ,
       0.13949275, 0.68297101, 0.57608696, 0.54166667, 0.45652174,
       0.66485507, 0.27898551, 0.61231884, 0.15217391, 0.37137

In [22]:
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

BaggingRegressor()
RandomForestRegressor()
GradientBoostingRegressor()
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)


In [23]:
# Оцениваем качество работы моделей на валидационной выборке.
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

BaggingRegressor() 	 0.0008733650116178313
RandomForestRegressor() 	 0.0011085279586910612
GradientBoostingRegressor() 	 0.0006314707753260445
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 	 0.0009809911498245643


In [24]:
# Выбираем лучшую модель
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

GradientBoostingRegressor()

In [25]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

0.00040286237843360165


In [26]:
# Задание №2 - анализ различных типов ансамблей в задаче классификации
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [27]:
# Выбираем 2 числовых переменных, которые будут параметрами элементов набора данных
# Метка класса всегда 'Species'
n = 2
x_labels = random.sample(num_columns, n)
y_label = 'Species'

print(x_labels)
print(y_label)

['Width', 'Length3']
Species


In [28]:
# Отберем необходимые параметры
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [29]:
y_train

0         Roach
1         Bream
2          Pike
3         Perch
4         Roach
        ...    
90         Pike
91        Perch
92    Whitefish
93        Bream
94        Perch
Name: Species, Length: 95, dtype: category
Categories (7, object): ['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish']

In [30]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier
# Решите получившуюся задачу классификации с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# f1_score -> max

In [31]:
le = preprocessing.LabelEncoder()
y_test

0         Roach
1         Perch
2         Perch
3         Perch
4         Bream
5         Roach
6         Bream
7         Smelt
8         Bream
9         Bream
10        Roach
11         Pike
12        Roach
13        Perch
14        Bream
15        Bream
16        Perch
17        Bream
18        Roach
19        Perch
20        Bream
21        Perch
22        Roach
23        Smelt
24        Roach
25        Perch
26        Perch
27        Bream
28    Whitefish
29        Perch
30        Smelt
31         Pike
Name: Species, dtype: category
Categories (6, object): ['Bream', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish']

In [32]:
le.fit(y_train)
le.fit(y_val)
le.fit(y_test)

LabelEncoder()

In [33]:
y_train=le.transform(y_train)
y_val=le.transform(y_val)
y_test=le.transform(y_test)

ValueError: y contains previously unseen labels: 'Parkki'

In [None]:
#Паркки не существует, он не может тебе навредить. 
#Паркки: OHOHO... YES.