In [1]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingRegressor, BaggingClassifier, RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn import preprocessing
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error, f1_score

In [2]:
# Генерируем уникальный seed
my_code = "Maslixin"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [3]:
# Читаем данные из файла
example_data = pd.read_csv("datasets/Fish.csv")

In [4]:
example_data.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [5]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(example_data))
print(val_test_size)

32


In [6]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(example_data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

95 32 32


In [7]:
train

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
54,Roach,390.0,29.5,31.7,35.0,9.4850,5.3550
115,Perch,690.0,34.6,37.0,39.3,10.5717,6.3666
48,Roach,169.0,22.0,24.0,27.2,7.5344,3.8352
82,Perch,110.0,19.0,21.0,22.5,5.6925,3.5550
40,Roach,0.0,19.0,20.5,22.8,6.4752,3.3516
...,...,...,...,...,...,...,...
73,Perch,32.0,12.5,13.7,14.7,3.5280,1.9992
133,Pike,345.0,36.0,38.5,41.0,6.3960,3.9770
10,Bream,475.0,28.4,31.0,36.2,14.2628,5.1042
70,Parkki,273.0,23.0,25.0,28.0,11.0880,4.1440


In [8]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
num_columns = ['Weight', 'Length1', 'Length2', 'Length3', 'Height', 'Width']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), num_columns)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['Weight', 'Length1', 'Length2', 'Length3',
                                  'Height', 'Width'])])

In [9]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [10]:
sc_train

Unnamed: 0,0,1,2,3,4,5,6
0,0.24375,0.432548,0.436255,0.454887,0.455579,0.607155,Roach
1,0.43125,0.541756,0.541833,0.535714,0.519406,0.749746,Perch
2,0.105625,0.271949,0.282869,0.308271,0.341012,0.39293,Roach
3,0.06875,0.207709,0.223108,0.219925,0.232829,0.353434,Perch
4,0.0,0.207709,0.213147,0.225564,0.2788,0.324763,Roach
...,...,...,...,...,...,...,...
90,0.02,0.068522,0.077689,0.073308,0.105698,0.134134,Perch
91,0.215625,0.571734,0.571713,0.567669,0.274149,0.412917,Pike
92,0.296875,0.408994,0.422311,0.477444,0.7362,0.571803,Bream
93,0.170625,0.293362,0.302789,0.323308,0.54973,0.436457,Parkki


In [11]:
# Устанавливаем названия столбцов
column_names = num_columns + ['Species']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [12]:
# Явно укажем типы данных, это важно для xgboost
types = {
    'Weight' : 'float64',
    'Length1' : 'float64',
    'Length2' : 'float64',
    'Length3' : 'float64',
    'Height' : 'float64',
    'Width' : 'float64',
    'Species' : 'category'
}
sc_train = sc_train.astype(types)
sc_test = sc_test.astype(types)
sc_val = sc_val.astype(types)

In [13]:
# Задание №1 - анализ различных типов ансамблей решений в задаче регрессии
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [14]:
# Выбираем 4 числовых переменных, три их них будут предикторами, одна - зависимой переменной
n = 4
labels = random.sample(num_columns, n)

y_label = labels[0]
x_labels = labels[1:]

print(x_labels)
print(y_label)

['Length1', 'Weight', 'Height']
Length2


In [15]:
# Отберем необходимые параметры
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [16]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, XGBRegressor.
# Решите получившуюся задачу регрессии с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# mean_squared_error -> min 

In [17]:
r_models=[BaggingRegressor(), RandomForestRegressor(),GradientBoostingRegressor(), XGBRegressor()]


In [18]:
r_models=[]

r_models.append(BaggingRegressor())

r_models.append(RandomForestRegressor())

r_models.append(GradientBoostingRegressor())

r_models.append(XGBRegressor())


In [19]:
r_models

[BaggingRegressor(),
 RandomForestRegressor(),
 GradientBoostingRegressor(),
 XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              validate_parameters=None, verbosity=None)]

In [20]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_label])
y_test = np.ravel(sc_test[y_label])
y_val = np.ravel(sc_val[y_label])


In [21]:
y_train

array([0.43625498, 0.54183267, 0.28286853, 0.22310757, 0.21314741,
       1.        , 0.30278884, 0.01593625, 0.64143426, 0.33266932,
       0.20318725, 0.67131474, 0.02390438, 0.35258964, 0.21713147,
       1.        , 0.26294821, 0.24302789, 0.41633466, 0.27290837,
       0.48207171, 0.50199203, 0.59163347, 0.32270916, 0.08565737,
       0.70119522, 0.        , 0.15737052, 0.62151394, 0.33266932,
       0.16733068, 0.60159363, 0.28286853, 0.20318725, 0.28286853,
       0.15139442, 0.66135458, 0.6812749 , 0.31075697, 0.76095618,
       0.32270916, 0.25697211, 0.09760956, 0.0438247 , 0.27091633,
       0.60159363, 0.44820717, 0.16334661, 0.02788845, 0.17729084,
       0.32868526, 0.40239044, 0.03984064, 0.25298805, 0.83466135,
       0.60159363, 0.50199203, 0.18326693, 0.58167331, 0.27290837,
       0.05179283, 0.40239044, 0.65139442, 0.54183267, 0.52191235,
       0.42231076, 0.29482072, 0.47211155, 0.58167331, 0.31474104,
       0.18326693, 0.06374502, 0.12749004, 0.3625498 , 0.24302

In [22]:
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

BaggingRegressor()
RandomForestRegressor()
GradientBoostingRegressor()
XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=None, num_parallel_tree=None,
             predictor=None, random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)


In [23]:
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

BaggingRegressor() 	 0.00022532876462278342
RandomForestRegressor() 	 0.00016400889221996414
GradientBoostingRegressor() 	 0.0002099304248316422
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None) 	 0.0003203511809131044


In [24]:
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

RandomForestRegressor()

In [25]:
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

0.0004495564430524924


In [26]:
# Задание №2 - анализ различных типов ансамблей в задаче классификации
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
# https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [27]:
# Выбираем 2 числовых переменных, которые будут параметрами элементов набора данных
# Метка класса всегда 'Species'
n = 2
x_labels = random.sample(num_columns, n)
y_label = 'Species'

print(x_labels)
print(y_label)

['Width', 'Weight']
Species


In [28]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [29]:
x_train

Unnamed: 0,Width,Weight
0,0.607155,0.243750
1,0.749746,0.431250
2,0.392930,0.105625
3,0.353434,0.068750
4,0.324763,0.000000
...,...,...
90,0.134134,0.020000
91,0.412917,0.215625
92,0.571803,0.296875
93,0.436457,0.170625


In [30]:
# Создайте 4 различных модели с использованием следующих классов:
# BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, XGBClassifier
# Решите получившуюся задачу классификации с помощью созданных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

# f1_score -> max

In [31]:
le = preprocessing.LabelEncoder()
y_test

0         Perch
1         Roach
2         Roach
3         Roach
4         Perch
5        Parkki
6         Roach
7         Perch
8         Perch
9          Pike
10    Whitefish
11        Bream
12        Perch
13        Bream
14         Pike
15        Bream
16        Bream
17         Pike
18        Perch
19         Pike
20        Perch
21    Whitefish
22        Bream
23        Perch
24        Perch
25        Perch
26        Bream
27        Perch
28        Bream
29        Bream
30        Smelt
31        Perch
Name: Species, dtype: category
Categories (7, object): ['Bream', 'Parkki', 'Perch', 'Pike', 'Roach', 'Smelt', 'Whitefish']

In [32]:
le.fit(y_train)
le.fit(y_val)
le.fit(y_test)

LabelEncoder()

In [33]:
y_train=le.transform(y_train)
y_val=le.transform(y_val)
y_test=le.transform(y_test)

In [34]:
y_train

array([4, 2, 4, 2, 4, 3, 2, 5, 2, 6, 4, 2, 5, 2, 1, 3, 1, 2, 0, 4, 3, 0,
       0, 1, 4, 3, 5, 1, 0, 2, 4, 2, 2, 1, 2, 2, 3, 2, 0, 3, 6, 2, 1, 5,
       4, 2, 3, 2, 5, 2, 0, 0, 5, 4, 3, 2, 0, 2, 2, 2, 5, 2, 3, 2, 0, 6,
       2, 0, 2, 2, 1, 5, 2, 2, 4, 4, 2, 0, 0, 4, 2, 1, 0, 5, 2, 2, 2, 2,
       0, 3, 2, 3, 0, 1, 4])

In [35]:
a_models=[BaggingClassifier(),     
          RandomForestClassifier(),    
          GradientBoostingClassifier(),    
          XGBClassifier()]


In [36]:
a_models=[]

a_models.append(BaggingClassifier())

a_models.append(RandomForestClassifier())

a_models.append(GradientBoostingClassifier())

a_models.append(XGBClassifier(use_label_encoder=False,
                              eval_metric='mlogloss'))

In [37]:
for model in a_models:
    print(model)
    model.fit(x_train, y_train)

BaggingClassifier()
RandomForestClassifier()
GradientBoostingClassifier()
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, eval_metric='mlogloss', gamma=None,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=None,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None)


In [38]:
f1s = []
for model in a_models:
    val_pred = model.predict(x_val)
    f1 = f1_score(y_val, val_pred, average='weighted')
    f1s.append(f1)
    print(model, '\t', f1)

BaggingClassifier() 	 0.6622282608695653
RandomForestClassifier() 	 0.5372176044330776
GradientBoostingClassifier() 	 0.5494791666666666
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, ...) 	 0.5452380952380952


In [39]:
i_max = f1s.index(max(f1s))
best_a_model = a_models[i_max]
best_a_model

BaggingClassifier()

In [40]:
test_pred = best_a_model.predict(x_test)
f1 = f1_score(y_test, test_pred, average='weighted')
print(f1)

0.4946633825944171
