In [1]:
# Задание на повторение материала предыдущего семестра

In [2]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error, f1_score, silhouette_score

In [3]:
# Генерируем уникальный seed
my_code = "Petrov"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [4]:
# Данные загружены отсюда: https://www.kaggle.com/dwdkills/russian-demography
# Читаем данные из файла
example_data = pd.read_csv("datasets/russian_demography.csv")

In [5]:
# "year" - год (1990-2017)
# "region" - название региона
# "npg" - естественный прирост населения на 1000 человек
# "birth_rate" - количество рождений на 1000 человек
# "death_rate" - количество смертей на 1000 человек
# "gdw" - коэффициент демографической нагрузки на 100 человек (Отношение числа нетрудоспособных к числу трудоспособных).
# "urbanization" - процент городского населения

example_data.head()

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,Republic of Adygea,1.9,14.2,12.3,84.66,52.42
1,1990,Altai Krai,1.8,12.9,11.1,80.24,58.07
2,1990,Amur Oblast,7.6,16.2,8.6,69.55,68.37
3,1990,Arkhangelsk Oblast,3.7,13.5,9.8,73.26,73.63
4,1990,Astrakhan Oblast,4.7,15.1,10.4,77.05,68.01


In [6]:
# Так как список регионов меняется от года к году, в данных есть строки без значений. Удалим их
example_data.dropna(inplace=True)

In [7]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(example_data))
print(val_test_size)

463


In [8]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(example_data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

1389 463 463


In [9]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale = ['year', 'npg', 'birth_rate', 'death_rate', 'gdw', 'urbanization']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['year', 'npg', 'birth_rate', 'death_rate',
                                  'gdw', 'urbanization'])])

In [10]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [11]:
# Устанавливаем названия столбцов
column_names = columns_to_scale + ['region']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [12]:
sc_train

Unnamed: 0,year,npg,birth_rate,death_rate,gdw,urbanization,region
0,0.62963,0.4914,0.419913,0.373272,0.056796,0.539534,Chukotka Autonomous Okrug
1,0.481481,0.068796,0.077922,0.820276,0.490768,0.558348,Leningrad Oblast
2,0.814815,0.324324,0.229437,0.506912,0.481976,0.655309,Ulyanovsk Oblast
3,0.518519,0.309582,0.329004,0.640553,0.373483,0.526378,Zabaykalsky Krai
4,0.259259,0.452088,0.264069,0.304147,0.723228,0.204841,Republic of Kalmykia
...,...,...,...,...,...,...,...
1384,0.740741,0.565111,0.415584,0.267281,0.339019,0.472438,Tyumen Oblast
1385,0.222222,0.149877,0.047619,0.640553,0.70793,0.743718,Yaroslavl Oblast
1386,0.814815,0.321867,0.277056,0.562212,0.506594,0.715827,Republic of Karelia
1387,0.111111,0.14742,0.04329,0.640553,0.730614,0.752269,Yaroslavl Oblast


In [13]:
# Вспоминаем алгоритмы решения задачи регрессии: линейную регрессию и метод k ближайших соседей
r_models = []

# Линейная регрессия
# Для использования регуляризации, вместо LinearRegression используем Lasso, Ridge или ElasticNet
# Параметр alpha - коэффициент регуляризации для Lasso и Ridge, по умолчанию равен 1
# Для ElasticNet, если регуляризация иммет вид a*L1+b*L2, то
# параметр alpha = a + b, по умолчанию равен 1
# параметр l1_ratio = a / (a + b), по умолчанию равен 0.5
r_models.append(LinearRegression())
r_models.append(Lasso(alpha=1.0))
r_models.append(Ridge(alpha=1.0))
r_models.append(ElasticNet(alpha=1.0, l1_ratio=0.5))

# K ближайших соседей
# Параметр n_neighbors - количество соседей, по умолчания равен 5
r_models.append(KNeighborsRegressor(n_neighbors=5))
r_models.append(KNeighborsRegressor(n_neighbors=10))
r_models.append(KNeighborsRegressor(n_neighbors=15))

In [14]:
# Выделим предикторы и зависимую переменную
x_labels = column_names[0:-2]
y_labels = ['urbanization']

x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_labels]
y_test = sc_test[y_labels]
y_val = sc_val[y_labels]

In [15]:
# Обучаем модели
for model in r_models:
    model.fit(x_train, y_train)

In [16]:
# Оценииваем качество работы моделей на валидационной выборке.
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(mse)

0.019198901844257647
0.028227256064330873
0.01912859829628885
0.028227256064330873
0.01019631994975206
0.011237396728918717
0.012169188952861959


In [17]:
# Выбираем лучшую модель
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

KNeighborsRegressor()

In [18]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

0.011241728626299753


In [19]:
# Вспоминаем алгоритмы решения задачи классификации:
# логистическую регрессию, наивный байесовский классификатор и (снова) метод k ближайших соседей
c_models = []

# Логистическая регрессия
# Параметр penalty - тип регуляризации: 'l1', 'l2', 'elasticnet', 'none'}, по умолчанию 'l2'
# Для некоторых типов регуляризации доступны не все алгоритмы (параметр solver)
# Для elasticnet регуляризации необходимо уазывать параметр l1_ratio (0 - l2, 1 - l1)
c_models.append(LogisticRegression(penalty='none', solver='saga'))
c_models.append(LogisticRegression(penalty='l1', solver='saga'))
c_models.append(LogisticRegression(penalty='l2', solver='saga'))
c_models.append(LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga'))
c_models.append(LogisticRegression())

# Наивный байесовский классификатор
# Параметр alpha - параметр сглаживания, по умолчанию равен 1 (сглаживание Лапласа)
c_models.append(MultinomialNB(alpha=0.0))
c_models.append(MultinomialNB(alpha=0.5))
c_models.append(MultinomialNB(alpha=1.0))

# K ближайших соседей
# Параметр n_neighbors - количество соседей, по умолчания равен 5
c_models.append(KNeighborsClassifier(n_neighbors=5))
c_models.append(KNeighborsClassifier(n_neighbors=10))
c_models.append(KNeighborsClassifier(n_neighbors=15))

In [20]:
# Выделим предикторы и метки классов
x_labels = column_names[0:-1]
y_labels = ['region']

x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [21]:
# Обучаем модели
for model in c_models:
    model.fit(x_train, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)


In [22]:
# Оценииваем качество работы моделей на валидационной выборке.
f1s = []
for model in c_models:
    val_pred = model.predict(x_val)
    f1 = f1_score(y_val, val_pred, average='weighted')
    f1s.append(f1)
    print(f1)

0.27784232533060677
0.12062183210661664
0.04938597503347674
0.07145963299917804
0.04938597503347674
0.024540757807712314
0.025216509292861844
0.025209532162023117
0.34240836948133413
0.2549576730626085
0.22471333270080235


In [23]:
# Выбираем лучшую модель
i_min = f1s.index(min(f1s))
best_c_model = c_models[i_min]
best_c_model

MultinomialNB(alpha=0.0)

In [24]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_c_model.predict(x_test)
f1 = f1_score(y_test, test_pred, average='weighted')
print(f1)

0.024939096761857658


In [25]:
# Вспоминаем алгоритм решения задачи кластеризации - метод k-средних
# Параметр n_clusters - количество кластеров, по умолчанию равен 8
k_models = []
k_models.append(KMeans(n_clusters=5))
k_models.append(KMeans(n_clusters=8))
k_models.append(KMeans(n_clusters=20))
k_models.append(KMeans(n_clusters=50))

In [26]:
# Выделим используемые параметры
x_labels = column_names[0:-1]
x = pd.concat([sc_train[x_labels], sc_val[x_labels], sc_test[x_labels]])
x

Unnamed: 0,year,npg,birth_rate,death_rate,gdw,urbanization
0,0.62963,0.4914,0.419913,0.373272,0.056796,0.539534
1,0.481481,0.068796,0.077922,0.820276,0.490768,0.558348
2,0.814815,0.324324,0.229437,0.506912,0.481976,0.655309
3,0.518519,0.309582,0.329004,0.640553,0.373483,0.526378
4,0.259259,0.452088,0.264069,0.304147,0.723228,0.204841
...,...,...,...,...,...,...
458,0.740741,0.643735,0.415584,0.105991,0.069808,0.798711
459,0.185185,0.324324,0.112554,0.382488,0.193599,0.810946
460,0.925926,0.262899,0.21645,0.608295,0.688236,0.556637
461,0.074074,0.250614,0.121212,0.529954,0.788113,0.554138


In [27]:
# Произведем кластеризацию
for model in k_models:
    model.fit(x)

In [28]:
# Оценим качество результата
sils = []
for model in k_models:
    cluster_labels = model.predict(x)
    s = silhouette_score(x, cluster_labels)
    sils.append(s)
    print(s)

0.29486318470150064
0.24697863002765352
0.23951622997150043
0.23159851012816435


In [29]:
# Выбираем лучшую модель
i_min = sils.index(min(sils))
best_k_model = k_models[i_min]
print(best_k_model)
print(sils[i_min])



KMeans(n_clusters=50)
0.23159851012816435


In [30]:
# Задание №1 - анализ моделей для задачи регрессии
# Общий список моделей
r_models = [
    LinearRegression(),
    Lasso(alpha=1.0),
    Lasso(alpha=0.5),
    Ridge(alpha=1.0),
    Ridge(alpha=0.5),
    ElasticNet(alpha=1.0, l1_ratio=0.5),
    ElasticNet(alpha=1.0, l1_ratio=0.25),
    ElasticNet(alpha=1.0, l1_ratio=0.75),
    ElasticNet(alpha=0.5, l1_ratio=0.5),
    ElasticNet(alpha=0.5, l1_ratio=0.25),
    ElasticNet(alpha=0.5, l1_ratio=0.75),
    KNeighborsRegressor(n_neighbors=5),
    KNeighborsRegressor(n_neighbors=10),
    KNeighborsRegressor(n_neighbors=15),
    KNeighborsRegressor(n_neighbors=20),
    KNeighborsRegressor(n_neighbors=25)
]

In [31]:
# Выбор моделей для задания
n = 4
random.seed(my_seed)
my_models1 = random.sample(r_models, n)
print(my_models1)

[Lasso(alpha=0.5), KNeighborsRegressor(n_neighbors=25), KNeighborsRegressor(n_neighbors=20), ElasticNet()]


In [32]:
# Загрузим данные для задачи регрессии
data = pd.read_csv("datasets/weather.csv")
data

Unnamed: 0,water_level,precipitation,temperature,humidity,visibility,wind,weather,pressure,fire,wl_change,temp_change,pressure_change
0,4680,00,249,310,1000,40,00,9938,40,-30,-03,-04
1,1400,00,279,220,200,10,00,10041,40,-20,06,-10
2,4640,00,256,280,200,20,10,10012,40,-70,-18,-03
3,4670,00,267,260,1000,30,00,9923,40,-10,12,-13
4,1380,00,291,220,200,10,00,10013,40,-20,25,-13
...,...,...,...,...,...,...,...,...,...,...,...,...
1434,4740,01,64,950,40,10,30,9999,40,-10,59,15
1435,650,00,64,930,100,40,30,9839,40,-10,52,31
1436,1590,01,85,830,100,20,10,9697,40,10,14,19
1437,990,01,75,910,100,30,30,9954,40,-40,27,17


In [33]:
data['water_level'] = data['water_level'].str.replace(',','.')
data['precipitation'] = data['precipitation'].str.replace(',','.')
data['temperature'] = data['temperature'].str.replace(',','.')
data['humidity'] = data['humidity'].str.replace(',','.')
data['visibility'] = data['visibility'].str.replace(',','.')
data['wind'] = data['wind'].str.replace(',','.')
data['weather'] = data['weather'].str.replace(',','.')
data['pressure'] = data['pressure'].str.replace(',','.')
data['fire'] = data['fire'].str.replace(',','.')                       
data['wl_changer'] = data['wl_change'].str.replace(',','.')                       
data['temp_change'] = data['temp_change'].str.replace(',','.')                      
data['pressure_change'] = data['pressure_change'].str.replace(',','.')                         

In [34]:
data

Unnamed: 0,water_level,precipitation,temperature,humidity,visibility,wind,weather,pressure,fire,wl_change,temp_change,pressure_change,wl_changer
0,468.0,0.0,24.9,31.0,100.0,4.0,0.0,993.8,4.0,-30,-0.3,-0.4,-3.0
1,140.0,0.0,27.9,22.0,20.0,1.0,0.0,1004.1,4.0,-20,0.6,-1.0,-2.0
2,464.0,0.0,25.6,28.0,20.0,2.0,1.0,1001.2,4.0,-70,-1.8,-0.3,-7.0
3,467.0,0.0,26.7,26.0,100.0,3.0,0.0,992.3,4.0,-10,1.2,-1.3,-1.0
4,138.0,0.0,29.1,22.0,20.0,1.0,0.0,1001.3,4.0,-20,2.5,-1.3,-2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1434,474.0,0.1,6.4,95.0,4.0,1.0,3.0,999.9,4.0,-10,5.9,1.5,-1.0
1435,65.0,0.0,6.4,93.0,10.0,4.0,3.0,983.9,4.0,-10,5.2,3.1,-1.0
1436,159.0,0.1,8.5,83.0,10.0,2.0,1.0,969.7,4.0,10,1.4,1.9,1.0
1437,99.0,0.1,7.5,91.0,10.0,3.0,3.0,995.4,4.0,-40,2.7,1.7,-4.0


In [35]:
# Зависимая переменная для всех одна и та же. Предикторы выбираем случайнм образом.
columns = list(data.columns)
n_x = 5

y_label = ['water_level']
x_labels = random.sample(columns[1:], n_x)

print(x_labels)

['visibility', 'pressure', 'pressure_change', 'wind', 'humidity']


In [36]:
# Преобразуйте значения всех необходимых параметров к отрезку [0,1].
# Решите получившуюся задачу регрессии с помощью выбранных моделей и сравните их эффективность.


In [37]:
all_lb =  x_labels + y_label

In [38]:

data=data[list(all_lb)]
data

Unnamed: 0,visibility,pressure,pressure_change,wind,humidity,water_level
0,100.0,993.8,-0.4,4.0,31.0,468.0
1,20.0,1004.1,-1.0,1.0,22.0,140.0
2,20.0,1001.2,-0.3,2.0,28.0,464.0
3,100.0,992.3,-1.3,3.0,26.0,467.0
4,20.0,1001.3,-1.3,1.0,22.0,138.0
...,...,...,...,...,...,...
1434,4.0,999.9,1.5,1.0,95.0,474.0
1435,10.0,983.9,3.1,4.0,93.0,65.0
1436,10.0,969.7,1.9,2.0,83.0,159.0
1437,10.0,995.4,1.7,3.0,91.0,99.0


In [39]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(data))
print(val_test_size)

288


In [40]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

863 288 288


In [41]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale = all_lb

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['visibility', 'pressure', 'pressure_change',
                                  'wind', 'humidity', 'water_level'])])

In [42]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [43]:
# Устанавливаем названия столбцов
column_names = all_lb
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [44]:
sc_train

Unnamed: 0,visibility,pressure,pressure_change,wind,humidity,water_level
0,0.2,0.743764,0.329787,0.444444,0.461538,0.546837
1,0.2,0.767952,0.372340,0.000000,0.934066,0.183347
2,0.2,0.907785,0.255319,0.111111,0.329670,0.260208
3,1.0,0.105820,0.436170,0.000000,0.791209,0.279424
4,0.1,0.763416,0.074468,0.222222,0.120879,0.487590
...,...,...,...,...,...,...
858,0.2,0.749811,0.468085,0.111111,0.912088,0.301041
859,0.1,0.701436,0.351064,0.444444,0.714286,0.500400
860,0.1,0.699169,0.148936,0.111111,0.417582,0.534828
861,0.2,0.689342,0.638298,0.444444,0.758242,0.327462


In [45]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = sc_train[y_label]
y_test = sc_test[y_label]
y_val = sc_val[y_label]

In [46]:
# Обучаем модели
for model in my_models1:
    model.fit(x_train, y_train)

In [47]:
# Оценииваем качество работы моделей на валидационной выборке.
mses = []
for model in my_models1:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(mse)

0.03199040259824459
0.028970391372256234
0.02927605431133448
0.03199040259824459


In [48]:
# Выбираем лучшую модель # Укажите, какая модель решает задачу лучше других.
i_min = mses.index(min(mses))
best_my_model = my_models1[i_min]
print('Модель ', best_my_model, 'решает задачу лучше других.') 

Модель  KNeighborsRegressor(n_neighbors=25) решает задачу лучше других.


In [49]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

0.07644367127861225


In [50]:
# Задание №2 - анализ моделей для задачи классификации
# Общий список моделей
c_models = [
    LogisticRegression(penalty='none', solver='saga'),
    LogisticRegression(penalty='l1', solver='saga'),
    LogisticRegression(penalty='l2', solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.25, solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.75, solver='saga'),
    LogisticRegression(),
    MultinomialNB(alpha=0.0),
    MultinomialNB(alpha=0.25),
    MultinomialNB(alpha=0.5),
    MultinomialNB(alpha=0.75),
    MultinomialNB(alpha=1.0),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=10),
    KNeighborsClassifier(n_neighbors=15),
    KNeighborsClassifier(n_neighbors=20),
    KNeighborsClassifier(n_neighbors=25)
]

In [51]:
# Выбор моделей для задания
n = 5
my_models2 = random.sample(c_models, n)
print(my_models2)

[MultinomialNB(alpha=0.25), KNeighborsClassifier(n_neighbors=20), KNeighborsClassifier(), LogisticRegression(solver='saga'), LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga')]


In [52]:
# Загрузим данные для задачи классификации
data = pd.read_csv("datasets/zoo2.csv")
data

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,turtle,0,0,1,0,0,1,0,0,1,1,0,0,4,1,1,1,3
1,chameleon,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
2,iguana,0,0,1,0,0,0,1,1,1,1,0,0,4,1,1,1,3
3,lizard,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,3
4,gecko,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
5,python,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,3
6,boa,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,1,3
7,adder,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,3
8,crocodile,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,1,3
9,alligator,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,1,3


In [53]:
#data.drop(['animal_name'], axis='columns', inplace=True)

In [54]:
#data.head()

In [55]:
data['animal_name'] = pd.get_dummies(data['animal_name'])
data.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,0,0,0,1,0,0,1,0,0,1,1,0,0,4,1,1,1,3
1,0,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
2,0,0,0,1,0,0,0,1,1,1,1,0,0,4,1,1,1,3
3,0,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,3
4,0,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3


In [56]:
# Метка класса для всех одна и та же. Параметры выбираем случайнм образом.
columns = list(data.columns)
n_x = 8

y_label = ['class_type']
x_labels = random.sample(columns[:-1], n_x)
all_lb1 =  x_labels + y_label
print(x_labels)

['fins', 'catsize', 'feathers', 'domestic', 'backbone', 'aquatic', 'animal_name', 'milk']


In [57]:
# Преобразуйте значения всех необходимых параметров к отрезку [0,1].
# Решите получившуюся задачу классификации с помощью выбранных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

In [58]:
data=data[list(all_lb1)]
data.head()

Unnamed: 0,fins,catsize,feathers,domestic,backbone,aquatic,animal_name,milk,class_type
0,0,1,0,1,1,1,0,0,3
1,0,0,0,1,1,0,0,0,3
2,0,1,0,1,1,0,0,0,3
3,0,0,0,0,1,0,0,0,3
4,0,0,0,1,1,0,0,0,3


In [59]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(data))
print(val_test_size)

9


In [60]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

25 9 9


In [61]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale = x_labels#all_lb1

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['fins', 'catsize', 'feathers', 'domestic',
                                  'backbone', 'aquatic', 'animal_name',
                                  'milk'])])

In [62]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [63]:
# Устанавливаем названия столбцов
column_names = all_lb1
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [64]:
sc_train

Unnamed: 0,fins,catsize,feathers,domestic,backbone,aquatic,animal_name,milk,class_type
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
2,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
3,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0
4,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,3.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
6,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,4.0
7,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0
8,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0


In [65]:
x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_label])
y_test = np.ravel(sc_test[y_label])
y_val = np.ravel(sc_val[y_label])

In [66]:
# Обучаем модели
for model2 in my_models2:
    model2.fit(x_train, y_train)

In [67]:
# Оценииваем качество работы моделей на валидационной выборке.
f1s = []
for model in my_models2:
    val_pred = model.predict(x_val)
    f1 = f1_score(y_val, val_pred, average='weighted')
    f1s.append(f1)
    print(f1)

0.4444444444444444
0.0808080808080808
0.8412698412698413
0.8412698412698413
0.8412698412698413


In [68]:
# Выбираем лучшую модель # Укажите, какая модель решает задачу лучше других.
i_min = f1s.index(min(f1s))
best_my_models2 = my_models2[i_min]
print('Модель ', best_my_models2, 'решает задачу лучше других.') 

Модель  KNeighborsClassifier(n_neighbors=20) решает задачу лучше других.


In [69]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_my_models2.predict(x_test)
f1 = f1_score(y_test, test_pred, average='weighted')
print(f1)

0.024691358024691357
