In [1]:
# Задание на повторение материала предыдущего семестра

In [2]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error, f1_score, silhouette_score

In [3]:
# Генерируем уникальный seed
my_code = "Магдиев"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [4]:
# Задание №1 - анализ моделей для задачи регрессии
# Общий список моделей
r_models = [
    LinearRegression(),
    Lasso(alpha=1.0),
    Lasso(alpha=0.5),
    Ridge(alpha=1.0),
    Ridge(alpha=0.5),
    ElasticNet(alpha=1.0, l1_ratio=0.5),
    ElasticNet(alpha=1.0, l1_ratio=0.25),
    ElasticNet(alpha=1.0, l1_ratio=0.75),
    ElasticNet(alpha=0.5, l1_ratio=0.5),
    ElasticNet(alpha=0.5, l1_ratio=0.25),
    ElasticNet(alpha=0.5, l1_ratio=0.75),
    KNeighborsRegressor(n_neighbors=5),
    KNeighborsRegressor(n_neighbors=10),
    KNeighborsRegressor(n_neighbors=15),
    KNeighborsRegressor(n_neighbors=20),
    KNeighborsRegressor(n_neighbors=25),
    SVR(kernel='linear'),
    SVR(kernel='poly'),
    SVR(kernel='rbf'),
    SVR(kernel='sigmoid'),
    DecisionTreeRegressor(criterion='squared_error'),
    DecisionTreeRegressor(criterion='friedman_mse'),
    DecisionTreeRegressor(criterion='absolute_error')
]

In [5]:
# Выбор моделей для задания
n = 4
random.seed(my_seed)
my_models = random.sample(r_models, n)
print(my_models)

[Ridge(), SVR(), ElasticNet(alpha=0.5), ElasticNet(alpha=0.5, l1_ratio=0.75)]


In [6]:
# Загрузим данные для задачи регрессии
data = pd.read_csv("datasets/weather.csv", decimal=',')
data

Unnamed: 0,water_level,precipitation,temperature,humidity,visibility,wind,weather,pressure,fire,wl_change,temp_change,pressure_change
0,468.0,0.0,24.9,31.0,100.0,4.0,0.0,993.8,4.0,-3.0,-0.3,-0.4
1,140.0,0.0,27.9,22.0,20.0,1.0,0.0,1004.1,4.0,-2.0,0.6,-1.0
2,464.0,0.0,25.6,28.0,20.0,2.0,1.0,1001.2,4.0,-7.0,-1.8,-0.3
3,467.0,0.0,26.7,26.0,100.0,3.0,0.0,992.3,4.0,-1.0,1.2,-1.3
4,138.0,0.0,29.1,22.0,20.0,1.0,0.0,1001.3,4.0,-2.0,2.5,-1.3
...,...,...,...,...,...,...,...,...,...,...,...,...
1434,474.0,0.1,6.4,95.0,4.0,1.0,3.0,999.9,4.0,-1.0,5.9,1.5
1435,65.0,0.0,6.4,93.0,10.0,4.0,3.0,983.9,4.0,-1.0,5.2,3.1
1436,159.0,0.1,8.5,83.0,10.0,2.0,1.0,969.7,4.0,1.0,1.4,1.9
1437,99.0,0.1,7.5,91.0,10.0,3.0,3.0,995.4,4.0,-4.0,2.7,1.7


In [7]:
# Зависимая переменная для всех одна и та же. Предикторы выбираем случайнм образом.
columns = list(data.columns)
n_x = 5

y_label = 'water_level'
x_labels = random.sample(columns[1:], n_x)

print(x_labels)

['visibility', 'wind', 'humidity', 'pressure', 'wl_change']


In [8]:
data = data.drop(['precipitation','temperature','weather','fire','temp_change','pressure_change','pressure_change'],axis=1)


In [9]:
data

Unnamed: 0,water_level,humidity,visibility,wind,pressure,wl_change
0,468.0,31.0,100.0,4.0,993.8,-3.0
1,140.0,22.0,20.0,1.0,1004.1,-2.0
2,464.0,28.0,20.0,2.0,1001.2,-7.0
3,467.0,26.0,100.0,3.0,992.3,-1.0
4,138.0,22.0,20.0,1.0,1001.3,-2.0
...,...,...,...,...,...,...
1434,474.0,95.0,4.0,1.0,999.9,-1.0
1435,65.0,93.0,10.0,4.0,983.9,-1.0
1436,159.0,83.0,10.0,2.0,969.7,1.0
1437,99.0,91.0,10.0,3.0,995.4,-4.0


In [10]:
# Преобразуйте значения всех необходимых параметров к отрезку [0,1].
# Решите получившуюся задачу регрессии с помощью выбранных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

In [11]:
data.shape

(1439, 6)

In [12]:
val_test_size = round(0.2*len(data))
print(val_test_size)

288


In [13]:
random_state = my_seed
train_val, test = train_test_split(data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

863 288 288


In [14]:
train.head()

Unnamed: 0,water_level,humidity,visibility,wind,pressure,wl_change
1030,126.0,95.0,4.0,1.0,964.8,-1.0
1072,82.0,58.0,20.0,3.0,984.0,-2.0
1130,300.0,84.0,20.0,5.0,991.5,2.0
781,526.0,78.0,100.0,3.0,963.0,6.0
169,143.0,96.0,0.0,1.0,988.1,18.0


In [15]:
columns_to_scale = ['visibility', 'wind', 'humidity', 'pressure', 'wl_change']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['visibility', 'wind', 'humidity', 'pressure',
                                  'wl_change'])])

In [16]:
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [17]:
# Устанавливаем названия столбцов
column_names = columns_to_scale + ['water_level']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [18]:
sc_val

Unnamed: 0,visibility,wind,humidity,pressure,wl_change,water_level
0,0.04,0.0,0.912088,0.755656,0.383495,451.0
1,0.10,0.0,0.923077,0.644796,0.388350,211.0
2,0.10,0.0,0.868132,0.653846,0.344660,41.0
3,0.20,0.1,0.769231,0.767722,0.388350,343.0
4,0.20,0.3,0.263736,0.664404,0.495146,293.0
...,...,...,...,...,...,...
283,0.20,0.3,0.472527,0.773756,0.320388,324.0
284,0.20,0.1,0.758242,0.789593,0.383495,78.0
285,0.20,0.5,0.395604,0.576169,0.373786,41.0
286,0.20,0.4,0.263736,0.720965,0.276699,389.0


In [19]:
r_models = []

# Линейная регрессия
r_models.append(ElasticNet(alpha=0.5))
r_models.append(ElasticNet(alpha=0.5, l1_ratio=0.75))

# Эпсилон-регрессия опорных векторов
r_models.append(SVR())

#Линейная регрессия
r_models.append(Ridge())



In [20]:
r_models

[ElasticNet(alpha=0.5), ElasticNet(alpha=0.5, l1_ratio=0.75), SVR(), Ridge()]

In [21]:
x_labels = column_names[0:-1]
y_labels = ['water_level']

x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [22]:
x_labels

['visibility', 'wind', 'humidity', 'pressure', 'wl_change']

In [23]:
y_labels

['water_level']

In [24]:
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

ElasticNet(alpha=0.5)
ElasticNet(alpha=0.5, l1_ratio=0.75)
SVR()
Ridge()


In [25]:
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

ElasticNet(alpha=0.5) 	 54359.24539331417
ElasticNet(alpha=0.5, l1_ratio=0.75) 	 53923.04627085828
SVR() 	 58601.12053478997
Ridge() 	 53282.813978524086


In [26]:
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

Ridge()

In [27]:
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

43677.709903250696


In [28]:
# Задание №2 - анализ моделей для задачи классификации
# Общий список моделей
c_models = [
    LogisticRegression(penalty='none', solver='saga'),
    LogisticRegression(penalty='l1', solver='saga'),
    LogisticRegression(penalty='l2', solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.25, solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.75, solver='saga'),
    LogisticRegression(),
    MultinomialNB(alpha=0.0),
    MultinomialNB(alpha=0.25),
    MultinomialNB(alpha=0.5),
    MultinomialNB(alpha=0.75),
    MultinomialNB(alpha=1.0),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=10),
    KNeighborsClassifier(n_neighbors=15),
    KNeighborsClassifier(n_neighbors=20),
    KNeighborsClassifier(n_neighbors=25),
    SVC(kernel='linear'),
    SVC(kernel='poly'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
    DecisionTreeClassifier(criterion='gini', splitter='best'),
    DecisionTreeClassifier(criterion='gini', splitter='random'),
    DecisionTreeClassifier(criterion='entropy', splitter='best'),
    DecisionTreeClassifier(criterion='entropy', splitter='random')
]

In [29]:
# Выбор моделей для задания
n = 5
my_models2 = random.sample(c_models, n)
print(my_models2)

[LogisticRegression(penalty='l1', solver='saga'), KNeighborsClassifier(), DecisionTreeClassifier(criterion='entropy'), DecisionTreeClassifier(splitter='random'), SVC(kernel='sigmoid')]


In [30]:
# Загрузим данные для задачи классификации
data2 = pd.read_csv("datasets/zoo2.csv")
data2

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,turtle,0,0,1,0,0,1,0,0,1,1,0,0,4,1,1,1,3
1,chameleon,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
2,iguana,0,0,1,0,0,0,1,1,1,1,0,0,4,1,1,1,3
3,lizard,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,3
4,gecko,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
5,python,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,3
6,boa,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,1,3
7,adder,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,3
8,crocodile,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,1,3
9,alligator,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,1,3


In [31]:
# Метка класса для всех одна и та же. Параметры выбираем случайнм образом.
columns = list(data2.columns)
n_x = 8

y_label2 = 'class_type'
x_labels2 = random.sample(columns[1:-1], n_x)

print(x_labels2)

['fins', 'tail', 'venomous', 'catsize', 'legs', 'backbone', 'toothed', 'breathes']


In [32]:
labels = x_labels2.copy()
labels.append(y_label2)


In [33]:
new_data2=data2[labels]
data2=new_data2

In [34]:
data2


Unnamed: 0,fins,tail,venomous,catsize,legs,backbone,toothed,breathes,class_type
0,0,1,0,1,4,1,0,1,3
1,0,1,0,0,4,1,1,1,3
2,0,1,0,1,4,1,1,1,3
3,0,1,0,0,4,1,1,1,3
4,0,1,0,0,4,1,1,1,3
5,0,1,1,1,0,1,1,1,3
6,0,1,0,1,0,1,1,1,3
7,0,1,1,1,0,1,1,1,3
8,0,1,0,1,4,1,1,1,3
9,0,1,0,1,4,1,1,1,3


In [35]:
# Преобразуйте значения всех необходимых параметров к отрезку [0,1].
# Решите получившуюся задачу классификации с помощью выбранных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

In [36]:
data2.shape

(43, 9)

In [37]:
# Определим размер валидационной и тестовой выборок
val_test_size2 = round(0.2*len(data2))
print(val_test_size2)

9


In [38]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val2, test2 = train_test_split(data2, test_size=val_test_size2, random_state=random_state)
train2, val2 = train_test_split(train_val2, test_size=val_test_size2, random_state=random_state)
print(len(train2), len(val2), len(test2))

25 9 9


In [39]:
train2.head()

Unnamed: 0,fins,tail,venomous,catsize,legs,backbone,toothed,breathes,class_type
24,0,0,0,0,4,1,1,1,5
8,0,1,0,1,4,1,1,1,3
7,0,1,1,1,0,1,1,1,3
3,0,1,0,0,4,1,1,1,3
23,0,0,0,0,4,1,1,1,5


In [40]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale2 = x_labels2

ct2 = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale2)], remainder='passthrough')
ct2.fit(train2)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['fins', 'tail', 'venomous', 'catsize', 'legs',
                                  'backbone', 'toothed', 'breathes'])])

In [41]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train2 = pd.DataFrame(ct2.transform(train2))
sc_test2 = pd.DataFrame(ct2.transform(test2))
sc_val2 = pd.DataFrame(ct2.transform(val2))

In [42]:
# Устанавливаем названия столбцов
column_names2 = columns_to_scale2 + ['class_type']
sc_train2.columns = column_names2
sc_test2.columns = column_names2
sc_val2.columns = column_names2

In [43]:
sc_val2

Unnamed: 0,fins,tail,venomous,catsize,legs,backbone,toothed,breathes,class_type
0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,1.0,6.0
1,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0
2,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0
3,0.0,1.0,0.0,0.0,0.5,1.0,1.0,1.0,3.0
4,0.0,0.0,0.0,0.0,0.75,0.0,0.0,1.0,6.0
5,0.0,1.0,0.0,1.0,0.5,1.0,0.0,1.0,3.0
6,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0
7,0.0,1.0,0.0,1.0,0.5,1.0,1.0,1.0,3.0
8,0.0,1.0,0.0,0.0,0.25,1.0,1.0,1.0,5.0


In [44]:
# Вспоминаем алгоритмы решения задачи классификации
c_models = []

# Логистическая регрессия
# Параметр penalty - тип регуляризации: 'l1', 'l2', 'elasticnet', 'none'}, по умолчанию 'l2'
# Для некоторых типов регуляризации доступны не все алгоритмы (параметр solver)
# Для elasticnet регуляризации необходимо уазывать параметр l1_ratio (0 - l2, 1 - l1)
c_models.append(LogisticRegression(penalty='l1', solver='saga'))


# K ближайших соседей
# Параметр n_neighbors - количество соседей, по умолчания равен 5
c_models.append(KNeighborsClassifier(n_neighbors=5))

# Метод опорных векторов
# Параметр kernel опредеяет вид ядра преобразования
c_models.append(SVC(kernel='sigmoid'))

# Деревья решений
# Параметр criterion - критерий качества ветвления: 'gini', 'entropy'
# Параметр splitter - стартегия ветвления: 'best', 'random'
c_models.append(DecisionTreeClassifier())
c_models.append(DecisionTreeClassifier(splitter='random'))


In [45]:
c_models

[LogisticRegression(penalty='l1', solver='saga'),
 KNeighborsClassifier(),
 SVC(kernel='sigmoid'),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(splitter='random')]

In [46]:
x_labels2 = column_names2[0:-1]
y_labels2 = ['class_type']

x_train2 = sc_train2[x_labels2]
x_test2 = sc_test2[x_labels2]
x_val2 = sc_val2[x_labels2]

y_train2 = np.ravel(sc_train2[y_labels2])
y_test2 = np.ravel(sc_test2[y_labels2])
y_val2 = np.ravel(sc_val2[y_labels2])

In [47]:
x_train2

Unnamed: 0,fins,tail,venomous,catsize,legs,backbone,toothed,breathes
0,0.0,0.0,0.0,0.0,0.5,1.0,1.0,1.0
1,0.0,1.0,0.0,1.0,0.5,1.0,1.0,1.0
2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
3,0.0,1.0,0.0,0.0,0.5,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.5,1.0,1.0,1.0
5,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
6,0.0,1.0,0.0,0.0,0.5,1.0,1.0,1.0
7,0.0,0.0,0.0,0.0,0.75,0.0,0.0,1.0
8,0.0,0.0,0.0,0.0,0.5,1.0,1.0,1.0
9,0.0,0.0,0.0,0.0,0.75,0.0,0.0,1.0


In [48]:
# Обучаем модели
for model2 in c_models:
    print(model2)
    model2.fit(x_train2, y_train2)

LogisticRegression(penalty='l1', solver='saga')
KNeighborsClassifier()
SVC(kernel='sigmoid')
DecisionTreeClassifier()
DecisionTreeClassifier(splitter='random')




In [49]:
# Оценииваем качество работы моделей на валидационной выборке.
f1s = []
for model2 in c_models:
    val_pred2 = model2.predict(x_val2)
    f1 = f1_score(y_val2, val_pred2, average='weighted')
    f1s.append(f1)
    print(model2, '\t', f1)

LogisticRegression(penalty='l1', solver='saga') 	 0.8412698412698413
KNeighborsClassifier() 	 0.4222222222222222
SVC(kernel='sigmoid') 	 0.4222222222222222
DecisionTreeClassifier() 	 0.8412698412698413
DecisionTreeClassifier(splitter='random') 	 0.8412698412698413


In [50]:
# Выбираем лучшую модель
i_max = f1s.index(max(f1s))
best_c_model = c_models[i_max]
best_c_model

LogisticRegression(penalty='l1', solver='saga')

In [51]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred2 = best_c_model.predict(x_test2)
f1 = f1_score(y_test2, test_pred2, average='weighted')
print(f1)

0.7037037037037037


In [52]:
# Вспоминаем алгоритм решения задачи кластеризации - метод k-средних
# Параметр n_clusters - количество кластеров, по умолчанию равен 8
k_models = []
k_models.append(KMeans(n_clusters=5))
k_models.append(KMeans(n_clusters=8))
k_models.append(KMeans(n_clusters=20))
k_models.append(KMeans(n_clusters=50))

In [53]:
# Выделим используемые параметры
x_labels2 = column_names[0:-1]
x = pd.concat([sc_train[x_labels], sc_val[x_labels], sc_test[x_labels]])
x

Unnamed: 0,visibility,wind,humidity,pressure,wl_change
0,0.04,0.0,0.945055,0.512821,0.388350
1,0.20,0.2,0.538462,0.657617,0.383495
2,0.20,0.4,0.824176,0.714178,0.402913
3,1.00,0.2,0.758242,0.499246,0.422330
4,0.00,0.0,0.956044,0.688537,0.480583
...,...,...,...,...,...
283,0.10,0.2,0.208791,0.766968,0.427184
284,1.00,0.3,0.362637,0.454751,0.383495
285,1.00,0.3,0.329670,0.699849,0.349515
286,1.00,0.0,1.000000,0.068627,0.388350


In [54]:
# Произведем кластеризацию
for model in k_models:
    model.fit(x)

In [55]:
# Оценим качество результата
sils = []
for model in k_models:
    cluster_labels = model.predict(x)
    s = silhouette_score(x, cluster_labels)
    sils.append(s)
    print(s)

0.3961767470160005
0.3106223365860281
0.23473793573937513
0.22839590070445687


In [56]:
# Выбираем лучшую модель
i_max = sils.index(max(sils))
best_k_model = k_models[i_max]
print(best_k_model)
print(sils[i_max])

KMeans(n_clusters=5)
0.3961767470160005
