In [1]:
# Задание на повторение материала предыдущего семестра

In [2]:
# Зависимости
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVR, SVC
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.cluster import KMeans

from sklearn.metrics import mean_squared_error, f1_score, silhouette_score

In [3]:
# Генерируем уникальный seed
my_code = "Сулима"
seed_limit = 2 ** 32
my_seed = int.from_bytes(my_code.encode(), "little") % seed_limit

In [4]:
# Данные загружены отсюда: https://www.kaggle.com/dwdkills/russian-demography
# Читаем данные из файла
example_data = pd.read_csv("datasets/russian_demography.csv")

In [5]:
# "year" - год (1990-2017)
# "region" - название региона
# "npg" - естественный прирост населения на 1000 человек
# "birth_rate" - количество рождений на 1000 человек
# "death_rate" - количество смертей на 1000 человек
# "gdw" - коэффициент демографической нагрузки на 100 человек (Отношение числа нетрудоспособных к числу трудоспособных).
# "urbanization" - процент городского населения

example_data.head()

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
0,1990,Republic of Adygea,1.9,14.2,12.3,84.66,52.42
1,1990,Altai Krai,1.8,12.9,11.1,80.24,58.07
2,1990,Amur Oblast,7.6,16.2,8.6,69.55,68.37
3,1990,Arkhangelsk Oblast,3.7,13.5,9.8,73.26,73.63
4,1990,Astrakhan Oblast,4.7,15.1,10.4,77.05,68.01


In [6]:
# Так как список регионов меняется от года к году, в данных есть строки без значений. Удалим их
example_data.dropna(inplace=True)

In [7]:
example_data.shape

(2315, 7)

In [8]:
# Определим размер валидационной и тестовой выборок
val_test_size = round(0.2*len(example_data))
print(val_test_size)

463


In [9]:
# Создадим обучающую, валидационную и тестовую выборки
random_state = my_seed
train_val, test = train_test_split(example_data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

1389 463 463


In [10]:
train.head()

Unnamed: 0,year,region,npg,birth_rate,death_rate,gdw,urbanization
1844,2011,Saint Petersburg,-0.9,11.6,12.5,60.19,100.0
229,1992,Saint Petersburg,-5.9,7.6,13.5,69.65,100.0
1375,2006,Zabaykalsky Krai,-1.6,13.9,15.6,56.4,64.2
1052,2002,Kurgan Oblast,-7.3,10.4,17.7,70.66,56.36
290,1993,Lipetsk Oblast,-6.9,8.7,15.6,80.87,62.68


In [11]:
# Значения в числовых столбцах преобразуем к отрезку [0,1].
# Для настройки скалировщика используем только обучающую выборку.
columns_to_scale = ['year', 'npg', 'birth_rate', 'death_rate', 'gdw', 'urbanization']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['year', 'npg', 'birth_rate', 'death_rate',
                                  'gdw', 'urbanization'])])

In [12]:
# Преобразуем значения, тип данных приводим к DataFrame
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))

In [13]:
# Устанавливаем названия столбцов
column_names = columns_to_scale + ['region']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names

In [14]:
sc_val

Unnamed: 0,year,npg,birth_rate,death_rate,gdw,urbanization,region
0,0.037037,0.41,0.272727,0.382775,0.688473,0.633757,Ulyanovsk Oblast
1,0.777778,0.225,0.17316,0.626794,0.550813,0.61896,Ryazan Oblast
2,0.407407,0.2575,0.108225,0.492823,0.475774,0.585439,Omsk Oblast
3,0.666667,0.095,0.121212,0.803828,0.492041,0.735498,Tula Oblast
4,0.074074,0.7875,0.722944,0.157895,0.999475,0.268299,Republic of Dagestan
...,...,...,...,...,...,...,...
458,0.259259,0.24,0.099567,0.516746,0.73605,0.416525,Kurgan Oblast
459,0.925926,0.36,0.281385,0.488038,0.51513,0.697525,Primorsky Krai
460,0.037037,0.475,0.155844,0.129187,0.299633,0.889747,Murmansk Oblast
461,0.592593,0.2775,0.168831,0.511962,0.444814,0.429095,Stavropol Krai


In [15]:
# Вспоминаем алгоритмы решения задачи регрессии
r_models = []

# Линейная регрессия
# Для использования регуляризации, вместо LinearRegression используем Lasso, Ridge или ElasticNet
# Параметр alpha - коэффициент регуляризации для Lasso и Ridge, по умолчанию равен 1
# Для ElasticNet, если регуляризация иммет вид a*L1+b*L2, то
# параметр alpha = a + b, по умолчанию равен 1
# параметр l1_ratio = a / (a + b), по умолчанию равен 0.5
r_models.append(LinearRegression())
r_models.append(Lasso(alpha=1.0))
r_models.append(Ridge(alpha=1.0))
r_models.append(ElasticNet(alpha=1.0, l1_ratio=0.5))

# K ближайших соседей
# Параметр n_neighbors - количество соседей, по умолчания равен 5
r_models.append(KNeighborsRegressor(n_neighbors=5))
r_models.append(KNeighborsRegressor(n_neighbors=10))
r_models.append(KNeighborsRegressor(n_neighbors=15))

# Метод опорных векторов
# Параметр kernel опредеяет вид ядра преобразования
r_models.append(SVR(kernel='linear'))
r_models.append(SVR(kernel='poly'))
r_models.append(SVR(kernel='rbf'))
r_models.append(SVR(kernel='sigmoid'))

# Деревья решений
# Параметр criterion - критерий качества ветвления: 'mse', 'friedman_mse', 'mae', 'poisson'
r_models.append(DecisionTreeRegressor(criterion='squared_error'))
r_models.append(DecisionTreeRegressor(criterion='friedman_mse'))
r_models.append(DecisionTreeRegressor(criterion='absolute_error'))

In [16]:
r_models

[LinearRegression(),
 Lasso(),
 Ridge(),
 ElasticNet(),
 KNeighborsRegressor(),
 KNeighborsRegressor(n_neighbors=10),
 KNeighborsRegressor(n_neighbors=15),
 SVR(kernel='linear'),
 SVR(kernel='poly'),
 SVR(),
 SVR(kernel='sigmoid'),
 DecisionTreeRegressor(),
 DecisionTreeRegressor(criterion='friedman_mse'),
 DecisionTreeRegressor(criterion='absolute_error')]

In [17]:
# Выделим предикторы и зависимую переменную
x_labels = column_names[0:-2]
y_labels = ['urbanization']

x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [18]:
x_labels

['year', 'npg', 'birth_rate', 'death_rate', 'gdw']

In [19]:
y_labels

['urbanization']

In [20]:
# Обучаем модели
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

LinearRegression()
Lasso()
Ridge()
ElasticNet()
KNeighborsRegressor()
KNeighborsRegressor(n_neighbors=10)
KNeighborsRegressor(n_neighbors=15)
SVR(kernel='linear')
SVR(kernel='poly')
SVR()
SVR(kernel='sigmoid')
DecisionTreeRegressor()
DecisionTreeRegressor(criterion='friedman_mse')
DecisionTreeRegressor(criterion='absolute_error')


In [21]:
# Оценииваем качество работы моделей на валидационной выборке.
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

LinearRegression() 	 0.02168235843460972
Lasso() 	 0.031919615101791775
Ridge() 	 0.02171755092094028
ElasticNet() 	 0.031919615101791775
KNeighborsRegressor() 	 0.011085582285931518
KNeighborsRegressor(n_neighbors=10) 	 0.012331620516011065
KNeighborsRegressor(n_neighbors=15) 	 0.013221158925995865
SVR(kernel='linear') 	 0.021963771079237436
SVR(kernel='poly') 	 0.013253936346964139
SVR() 	 0.011575442423570372
SVR(kernel='sigmoid') 	 30.16369196159337
DecisionTreeRegressor() 	 0.02128330491459234
DecisionTreeRegressor(criterion='friedman_mse') 	 0.021209080649749478
DecisionTreeRegressor(criterion='absolute_error') 	 0.021346697486778487


In [22]:
# Выбираем лучшую модель
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

KNeighborsRegressor()

In [23]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

0.010368096735368435


In [24]:
# Вспоминаем алгоритмы решения задачи классификации
c_models = []

# Логистическая регрессия
# Параметр penalty - тип регуляризации: 'l1', 'l2', 'elasticnet', 'none'}, по умолчанию 'l2'
# Для некоторых типов регуляризации доступны не все алгоритмы (параметр solver)
# Для elasticnet регуляризации необходимо уазывать параметр l1_ratio (0 - l2, 1 - l1)
c_models.append(LogisticRegression(penalty='none', solver='saga', max_iter=500))
c_models.append(LogisticRegression(penalty='l1', solver='saga', max_iter=500))
c_models.append(LogisticRegression(penalty='l2', solver='saga'))
c_models.append(LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga'))
c_models.append(LogisticRegression())

# Наивный байесовский классификатор
# Параметр alpha - параметр сглаживания, по умолчанию равен 1 (сглаживание Лапласа)
c_models.append(MultinomialNB(alpha=0.0))
c_models.append(MultinomialNB(alpha=0.5))
c_models.append(MultinomialNB(alpha=1.0))

# K ближайших соседей
# Параметр n_neighbors - количество соседей, по умолчания равен 5
c_models.append(KNeighborsClassifier(n_neighbors=5))
c_models.append(KNeighborsClassifier(n_neighbors=10))
c_models.append(KNeighborsClassifier(n_neighbors=15))

# Метод опорных векторов
# Параметр kernel опредеяет вид ядра преобразования
c_models.append(SVC(kernel='linear'))
c_models.append(SVC(kernel='poly'))
c_models.append(SVC(kernel='rbf'))
c_models.append(SVC(kernel='sigmoid'))

# Деревья решений
# Параметр criterion - критерий качества ветвления: 'gini', 'entropy'
# Параметр splitter - стартегия ветвления: 'best', 'random'
c_models.append(DecisionTreeClassifier(criterion='gini', splitter='best'))
c_models.append(DecisionTreeClassifier(criterion='gini', splitter='random'))
c_models.append(DecisionTreeClassifier(criterion='entropy', splitter='best'))
c_models.append(DecisionTreeClassifier(criterion='entropy', splitter='random'))

In [25]:
c_models

[LogisticRegression(max_iter=500, penalty='none', solver='saga'),
 LogisticRegression(max_iter=500, penalty='l1', solver='saga'),
 LogisticRegression(solver='saga'),
 LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga'),
 LogisticRegression(),
 MultinomialNB(alpha=0.0),
 MultinomialNB(alpha=0.5),
 MultinomialNB(),
 KNeighborsClassifier(),
 KNeighborsClassifier(n_neighbors=10),
 KNeighborsClassifier(n_neighbors=15),
 SVC(kernel='linear'),
 SVC(kernel='poly'),
 SVC(),
 SVC(kernel='sigmoid'),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(splitter='random'),
 DecisionTreeClassifier(criterion='entropy'),
 DecisionTreeClassifier(criterion='entropy', splitter='random')]

In [26]:
# Выделим предикторы и метки классов
x_labels = column_names[0:-1]
y_labels = ['region']

x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [27]:
x_train

Unnamed: 0,year,npg,birth_rate,death_rate,gdw,urbanization
0,0.777778,0.3575,0.233766,0.440191,0.390764,1.0
1,0.074074,0.2325,0.060606,0.488038,0.556236,1.0
2,0.592593,0.34,0.333333,0.588517,0.324471,0.53123
3,0.444444,0.1975,0.181818,0.688995,0.573902,0.428571
4,0.111111,0.2075,0.108225,0.588517,0.752493,0.511326
...,...,...,...,...,...,...
1384,0.814815,0.2225,0.186147,0.645933,0.467203,0.64122
1385,0.037037,0.425,0.238095,0.315789,0.539619,0.724237
1386,0.925926,0.295,0.25974,0.588517,0.693546,0.760377
1387,0.62963,0.3025,0.220779,0.526316,0.464055,0.381956


In [28]:
# Обучаем модели
for model in c_models:
    print(model)
    model.fit(x_train, y_train)

LogisticRegression(max_iter=500, penalty='none', solver='saga')




LogisticRegression(max_iter=500, penalty='l1', solver='saga')
LogisticRegression(solver='saga')
LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga')
LogisticRegression()
MultinomialNB(alpha=0.0)


  % _ALPHA_MIN


MultinomialNB(alpha=0.5)
MultinomialNB()
KNeighborsClassifier()
KNeighborsClassifier(n_neighbors=10)
KNeighborsClassifier(n_neighbors=15)
SVC(kernel='linear')
SVC(kernel='poly')
SVC()
SVC(kernel='sigmoid')
DecisionTreeClassifier()
DecisionTreeClassifier(splitter='random')
DecisionTreeClassifier(criterion='entropy')
DecisionTreeClassifier(criterion='entropy', splitter='random')


In [29]:
# Оценииваем качество работы моделей на валидационной выборке.
f1s = []
for model in c_models:
    val_pred = model.predict(x_val)
    f1 = f1_score(y_val, val_pred, average='weighted')
    f1s.append(f1)
    print(model, '\t', f1)

LogisticRegression(max_iter=500, penalty='none', solver='saga') 	 0.34055628252764303
LogisticRegression(max_iter=500, penalty='l1', solver='saga') 	 0.1201769762118048
LogisticRegression(solver='saga') 	 0.060884187524038375
LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga') 	 0.08261936185329334
LogisticRegression() 	 0.060884187524038375
MultinomialNB(alpha=0.0) 	 0.012926733552357687
MultinomialNB(alpha=0.5) 	 0.012907595239092809
MultinomialNB() 	 0.012884538549215415
KNeighborsClassifier() 	 0.3148254646114152
KNeighborsClassifier(n_neighbors=10) 	 0.2295884653720547
KNeighborsClassifier(n_neighbors=15) 	 0.2000696711941187
SVC(kernel='linear') 	 0.07618230856977021
SVC(kernel='poly') 	 0.7154113425790269
SVC() 	 0.16015449016641492
SVC(kernel='sigmoid') 	 0.0
DecisionTreeClassifier() 	 0.6281451090306381
DecisionTreeClassifier(splitter='random') 	 0.5839013257755609
DecisionTreeClassifier(criterion='entropy') 	 0.6195453410098777
DecisionTreeClassifier(criteri

In [30]:
# Выбираем лучшую модель
i_max = f1s.index(max(f1s))
best_c_model = c_models[i_max]
best_c_model

SVC(kernel='poly')

In [31]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred = best_c_model.predict(x_test)
f1 = f1_score(y_test, test_pred, average='weighted')
print(f1)

0.655328829302766


In [32]:
# Вспоминаем алгоритм решения задачи кластеризации - метод k-средних
# Параметр n_clusters - количество кластеров, по умолчанию равен 8
k_models = []
k_models.append(KMeans(n_clusters=5))
k_models.append(KMeans(n_clusters=8))
k_models.append(KMeans(n_clusters=20))
k_models.append(KMeans(n_clusters=50))

In [33]:
# Выделим используемые параметры
x_labels = column_names[0:-1]
x = pd.concat([sc_train[x_labels], sc_val[x_labels], sc_test[x_labels]])
x

Unnamed: 0,year,npg,birth_rate,death_rate,gdw,urbanization
0,0.777778,0.3575,0.233766,0.440191,0.390764,1.0
1,0.074074,0.2325,0.060606,0.488038,0.556236,1.0
2,0.592593,0.34,0.333333,0.588517,0.324471,0.53123
3,0.444444,0.1975,0.181818,0.688995,0.573902,0.428571
4,0.111111,0.2075,0.108225,0.588517,0.752493,0.511326
...,...,...,...,...,...,...
458,0.222222,0.2425,0.095238,0.507177,0.633724,0.661254
459,0.592593,0.0875,0.155844,0.870813,0.461081,0.617651
460,0.444444,0.2725,0.220779,0.588517,0.428022,0.602855
461,0.074074,0.4025,0.255411,0.37799,0.708414,0.57575


In [34]:
# Произведем кластеризацию
for model in k_models:
    model.fit(x)

In [35]:
# Оценим качество результата
sils = []
for model in k_models:
    cluster_labels = model.predict(x)
    s = silhouette_score(x, cluster_labels)
    sils.append(s)
    print(s)

0.29467491446026817
0.24307378125705817
0.23771085371691597
0.232409799446959


In [36]:
# Выбираем лучшую модель
i_max = sils.index(max(sils))
best_k_model = k_models[i_max]
print(best_k_model)
print(sils[i_max])

KMeans(n_clusters=5)
0.29467491446026817


In [37]:
# Задание №1 - анализ моделей для задачи регрессии
# Общий список моделей
r_models = [
    LinearRegression(),
    Lasso(alpha=1.0),
    Lasso(alpha=0.5),
    Ridge(alpha=1.0),
    Ridge(alpha=0.5),
    ElasticNet(alpha=1.0, l1_ratio=0.5),
    ElasticNet(alpha=1.0, l1_ratio=0.25),
    ElasticNet(alpha=1.0, l1_ratio=0.75),
    ElasticNet(alpha=0.5, l1_ratio=0.5),
    ElasticNet(alpha=0.5, l1_ratio=0.25),
    ElasticNet(alpha=0.5, l1_ratio=0.75),
    KNeighborsRegressor(n_neighbors=5),
    KNeighborsRegressor(n_neighbors=10),
    KNeighborsRegressor(n_neighbors=15),
    KNeighborsRegressor(n_neighbors=20),
    KNeighborsRegressor(n_neighbors=25),
    SVR(kernel='linear'),
    SVR(kernel='poly'),
    SVR(kernel='rbf'),
    SVR(kernel='sigmoid'),
    DecisionTreeRegressor(criterion='squared_error'),
    DecisionTreeRegressor(criterion='friedman_mse'),
    DecisionTreeRegressor(criterion='absolute_error')
]

In [38]:
# Выбор моделей для задания
n = 4
random.seed(my_seed)
my_models1 = random.sample(r_models, n)
print(my_models1)

[DecisionTreeRegressor(criterion='friedman_mse'), SVR(kernel='poly'), DecisionTreeRegressor(criterion='absolute_error'), ElasticNet(alpha=0.5, l1_ratio=0.75)]


In [39]:
# Загрузим данные для задачи регрессии
data = pd.read_csv("datasets/weather.csv", decimal=',')
data

Unnamed: 0,water_level,precipitation,temperature,humidity,visibility,wind,weather,pressure,fire,wl_change,temp_change,pressure_change
0,468.0,0.0,24.9,31.0,100.0,4.0,0.0,993.8,4.0,-3.0,-0.3,-0.4
1,140.0,0.0,27.9,22.0,20.0,1.0,0.0,1004.1,4.0,-2.0,0.6,-1.0
2,464.0,0.0,25.6,28.0,20.0,2.0,1.0,1001.2,4.0,-7.0,-1.8,-0.3
3,467.0,0.0,26.7,26.0,100.0,3.0,0.0,992.3,4.0,-1.0,1.2,-1.3
4,138.0,0.0,29.1,22.0,20.0,1.0,0.0,1001.3,4.0,-2.0,2.5,-1.3
...,...,...,...,...,...,...,...,...,...,...,...,...
1434,474.0,0.1,6.4,95.0,4.0,1.0,3.0,999.9,4.0,-1.0,5.9,1.5
1435,65.0,0.0,6.4,93.0,10.0,4.0,3.0,983.9,4.0,-1.0,5.2,3.1
1436,159.0,0.1,8.5,83.0,10.0,2.0,1.0,969.7,4.0,1.0,1.4,1.9
1437,99.0,0.1,7.5,91.0,10.0,3.0,3.0,995.4,4.0,-4.0,2.7,1.7


In [40]:
# Зависимая переменная для всех одна и та же. Предикторы выбираем случайнм образом.
columns = list(data.columns)
n_x = 5

y_label = 'water_level'
x_labels = random.sample(columns[1:], n_x)

print(x_labels)

['fire', 'precipitation', 'temperature', 'weather', 'pressure_change']


In [41]:
# Преобразуйте значения всех необходимых параметров к отрезку [0,1].
# Решите получившуюся задачу регрессии с помощью выбранных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

data = data.drop(['precipitation','temperature','visibility','wind','wl_change','temp_change'],axis=1)


In [42]:
data

Unnamed: 0,water_level,humidity,weather,pressure,fire,pressure_change
0,468.0,31.0,0.0,993.8,4.0,-0.4
1,140.0,22.0,0.0,1004.1,4.0,-1.0
2,464.0,28.0,1.0,1001.2,4.0,-0.3
3,467.0,26.0,0.0,992.3,4.0,-1.3
4,138.0,22.0,0.0,1001.3,4.0,-1.3
...,...,...,...,...,...,...
1434,474.0,95.0,3.0,999.9,4.0,1.5
1435,65.0,93.0,3.0,983.9,4.0,3.1
1436,159.0,83.0,1.0,969.7,4.0,1.9
1437,99.0,91.0,3.0,995.4,4.0,1.7


In [43]:
data.shape
val_test_size = round(0.2*len(data))
print(val_test_size)

288


In [44]:
random_state = my_seed
train_val, test = train_test_split(data, test_size=val_test_size, random_state=random_state)
train, val = train_test_split(train_val, test_size=val_test_size, random_state=random_state)
print(len(train), len(val), len(test))

863 288 288


In [45]:
train.head()

Unnamed: 0,water_level,humidity,weather,pressure,fire,pressure_change
1346,123.0,90.0,1.0,1001.1,4.0,0.6
536,217.0,98.0,6.0,1026.8,4.0,0.3
1422,594.0,96.0,6.0,1009.2,4.0,0.7
440,422.0,93.0,6.0,995.8,4.0,0.0
485,132.0,65.0,0.0,1010.1,4.0,0.5


In [46]:
columns_to_scale = ['fire', 'weather', 'pressure', 'pressure_change', 'humidity']

ct = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale)], remainder='passthrough')
ct.fit(train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['fire', 'weather', 'pressure',
                                  'pressure_change', 'humidity'])])

In [47]:
sc_train = pd.DataFrame(ct.transform(train))
sc_test = pd.DataFrame(ct.transform(test))
sc_val = pd.DataFrame(ct.transform(val))
column_names = columns_to_scale + ['water_level']
sc_train.columns = column_names
sc_test.columns = column_names
sc_val.columns = column_names
sc_val

Unnamed: 0,fire,weather,pressure,pressure_change,humidity,water_level
0,0.0,0.250,0.717164,0.511628,0.505618,200.0
1,1.0,0.000,0.757463,0.325581,0.584270,590.0
2,0.0,0.250,0.690299,0.511628,0.775281,384.0
3,0.0,0.250,0.712687,0.279070,0.247191,243.0
4,0.0,0.875,0.955224,0.290698,0.471910,216.0
...,...,...,...,...,...,...
283,0.0,0.125,0.747015,0.104651,0.123596,223.0
284,0.0,0.250,0.238060,0.523256,0.640449,83.0
285,0.0,0.125,0.909701,0.302326,0.460674,216.0
286,1.0,0.250,0.791791,0.372093,0.707865,39.0


In [48]:
r_models = []

r_models.append(ElasticNet(alpha=0.5))
r_models.append(ElasticNet(alpha=0.5, l1_ratio=0.75))

r_models.append(SVR())

r_models.append(Ridge())
r_models

[ElasticNet(alpha=0.5), ElasticNet(alpha=0.5, l1_ratio=0.75), SVR(), Ridge()]

In [49]:
x_labels = column_names[0:-1]
y_labels = ['water_level']

x_train = sc_train[x_labels]
x_test = sc_test[x_labels]
x_val = sc_val[x_labels]

y_train = np.ravel(sc_train[y_labels])
y_test = np.ravel(sc_test[y_labels])
y_val = np.ravel(sc_val[y_labels])

In [50]:
for model in r_models:
    print(model)
    model.fit(x_train, y_train)

ElasticNet(alpha=0.5)
ElasticNet(alpha=0.5, l1_ratio=0.75)
SVR()
Ridge()


In [51]:
mses = []
for model in r_models:
    val_pred = model.predict(x_val)
    mse = mean_squared_error(y_val, val_pred)
    mses.append(mse)
    print(model, '\t', mse)

ElasticNet(alpha=0.5) 	 54253.80178695069
ElasticNet(alpha=0.5, l1_ratio=0.75) 	 53710.30973058152
SVR() 	 58403.483087620516
Ridge() 	 51107.04741923048


In [52]:
i_min = mses.index(min(mses))
best_r_model = r_models[i_min]
best_r_model

Ridge()

In [53]:
test_pred = best_r_model.predict(x_test)
mse = mean_squared_error(y_test, test_pred)
print(mse)

42445.01958501961


In [54]:
# Задание №2 - анализ моделей для задачи классификации
# Общий список моделей
c_models = [
    LogisticRegression(penalty='none', solver='saga'),
    LogisticRegression(penalty='l1', solver='saga'),
    LogisticRegression(penalty='l2', solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.25, solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.5, solver='saga'),
    LogisticRegression(penalty='elasticnet', l1_ratio=0.75, solver='saga'),
    LogisticRegression(),
    MultinomialNB(alpha=0.0),
    MultinomialNB(alpha=0.25),
    MultinomialNB(alpha=0.5),
    MultinomialNB(alpha=0.75),
    MultinomialNB(alpha=1.0),
    KNeighborsClassifier(n_neighbors=5),
    KNeighborsClassifier(n_neighbors=10),
    KNeighborsClassifier(n_neighbors=15),
    KNeighborsClassifier(n_neighbors=20),
    KNeighborsClassifier(n_neighbors=25),
    SVC(kernel='linear'),
    SVC(kernel='poly'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
    DecisionTreeClassifier(criterion='gini', splitter='best'),
    DecisionTreeClassifier(criterion='gini', splitter='random'),
    DecisionTreeClassifier(criterion='entropy', splitter='best'),
    DecisionTreeClassifier(criterion='entropy', splitter='random')
]

In [55]:
# Выбор моделей для задания
n = 5
my_models2 = random.sample(c_models, n)
print(my_models2)

[LogisticRegression(l1_ratio=0.5, penalty='elasticnet', solver='saga'), DecisionTreeClassifier(criterion='entropy', splitter='random'), SVC(kernel='poly'), LogisticRegression(penalty='none', solver='saga'), LogisticRegression()]


In [56]:
# Загрузим данные для задачи классификации
newdata = pd.read_csv("datasets/zoo2.csv")
newdata

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,class_type
0,turtle,0,0,1,0,0,1,0,0,1,1,0,0,4,1,1,1,3
1,chameleon,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
2,iguana,0,0,1,0,0,0,1,1,1,1,0,0,4,1,1,1,3
3,lizard,0,0,1,0,0,0,1,1,1,1,0,0,4,1,0,0,3
4,gecko,0,0,1,0,0,0,0,1,1,1,0,0,4,1,1,0,3
5,python,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,3
6,boa,0,0,1,0,0,0,1,1,1,1,0,0,0,1,0,1,3
7,adder,0,0,1,0,0,0,1,1,1,1,1,0,0,1,0,1,3
8,crocodile,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,1,3
9,alligator,0,0,1,0,0,1,1,1,1,1,0,0,4,1,0,1,3


In [57]:

# Метка класса для всех одна и та же. Параметры выбираем случайнм образом.
columns = list(newdata.columns)
n_x = 8

y_newlabel = 'class_type'
x_newlabels = random.sample(columns[1:-1], n_x)

print(x_newlabels)

['aquatic', 'legs', 'tail', 'catsize', 'feathers', 'airborne', 'toothed', 'venomous']


In [58]:
labels = x_newlabels.copy()
labels.append(y_newlabel)
temp_data = newdata[labels]
sec_data = temp_data

In [59]:
sec_data

Unnamed: 0,aquatic,legs,tail,catsize,feathers,airborne,toothed,venomous,class_type
0,1,4,1,1,0,0,0,0,3
1,0,4,1,0,0,0,1,0,3
2,0,4,1,1,0,0,1,0,3
3,0,4,1,0,0,0,1,0,3
4,0,4,1,0,0,0,1,0,3
5,0,0,1,1,0,0,1,1,3
6,0,0,1,1,0,0,1,0,3
7,0,0,1,1,0,0,1,1,3
8,1,4,1,1,0,0,1,0,3
9,1,4,1,1,0,0,1,0,3


In [60]:
# Преобразуйте значения всех необходимых параметров к отрезку [0,1].
# Решите получившуюся задачу классификации с помощью выбранных моделей и сравните их эффективность.
# Укажите, какая модель решает задачу лучше других.

sec_data.shape
val_test_size2 = round(0.2*len(sec_data))
print(val_test_size2)

9


In [62]:
random_state = my_seed
train_val2, test2 = train_test_split(sec_data, test_size=val_test_size2, random_state=random_state)
sec_train, val2 = train_test_split(train_val2, test_size=val_test_size2, random_state=random_state)
print(len(train2), len(val2), len(test2))

NameError: name 'train2' is not defined

In [63]:
sec_train.head()

Unnamed: 0,aquatic,legs,tail,catsize,feathers,airborne,toothed,venomous,class_type
0,1,4,1,1,0,0,0,0,3
32,0,6,0,0,0,0,0,0,6
24,1,4,0,0,0,0,1,0,5
39,0,0,0,0,0,0,0,0,7
22,1,4,0,0,0,0,1,0,5


In [64]:
columns_to_scale2 = x_newlabels

ct2 = ColumnTransformer(transformers=[('numerical', MinMaxScaler(), columns_to_scale2)], remainder='passthrough')
ct2.fit(sec_train)

ColumnTransformer(remainder='passthrough',
                  transformers=[('numerical', MinMaxScaler(),
                                 ['aquatic', 'legs', 'tail', 'catsize',
                                  'feathers', 'airborne', 'toothed',
                                  'venomous'])])

In [65]:
sc_train2 = pd.DataFrame(ct2.transform(sec_train))
sc_test2 = pd.DataFrame(ct2.transform(test2))
sc_val2 = pd.DataFrame(ct2.transform(val2))
column_names2 = columns_to_scale2 + ['class_type']
sc_train2.columns = column_names2
sc_test2.columns = column_names2
sc_val2.columns = column_names2
sc_val2

Unnamed: 0,aquatic,legs,tail,catsize,feathers,airborne,toothed,venomous,class_type
0,0.0,0.5,1.0,0.0,0.0,0.0,1.0,0.0,3.0
1,0.0,0.75,0.0,0.0,0.0,1.0,0.0,0.0,6.0
2,1.0,0.5,1.0,1.0,0.0,0.0,1.0,0.0,3.0
3,0.0,0.5,1.0,0.0,0.0,0.0,1.0,0.0,3.0
4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0
5,0.0,0.5,1.0,0.0,0.0,0.0,1.0,0.0,3.0
6,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,6.0
7,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,3.0
8,1.0,0.25,1.0,0.0,0.0,0.0,1.0,0.0,5.0


In [66]:
# Вспоминаем алгоритмы решения задачи классификации
c_models = []

# Логистическая регрессия
# Параметр penalty - тип регуляризации: 'l1', 'l2', 'elasticnet', 'none'}, по умолчанию 'l2'
# Для некоторых типов регуляризации доступны не все алгоритмы (параметр solver)
# Для elasticnet регуляризации необходимо уазывать параметр l1_ratio (0 - l2, 1 - l1)
c_models.append(LogisticRegression(penalty='l1', solver='saga'))


# K ближайших соседей
# Параметр n_neighbors - количество соседей, по умолчания равен 5
c_models.append(KNeighborsClassifier(n_neighbors=5))

# Метод опорных векторов
# Параметр kernel опредеяет вид ядра преобразования
c_models.append(SVC(kernel='sigmoid'))

# Деревья решений
# Параметр criterion - критерий качества ветвления: 'gini', 'entropy'
# Параметр splitter - стартегия ветвления: 'best', 'random'
c_models.append(DecisionTreeClassifier())
c_models.append(DecisionTreeClassifier(splitter='random'))
c_models
#[MultinomialNB(), KNeighborsClassifier, DecisionTreeClassifier(), LogisticRegression, KNeighborsClassifier]

[LogisticRegression(penalty='l1', solver='saga'),
 KNeighborsClassifier(),
 SVC(kernel='sigmoid'),
 DecisionTreeClassifier(),
 DecisionTreeClassifier(splitter='random')]

In [78]:
x_labels2 = column_names2[0:-1]
y_labels2 = ['class_type']

x_train2 = sc_train2[x_labels2]
x_test2 = sc_test2[x_labels2]
x_val2 = sc_val2[x_labels2]

y_train2 = np.ravel(sc_train2[y_labels2])
y_test2 = np.ravel(sc_test2[y_labels2])
y_val2 = np.ravel(sc_val2[y_labels2])
x_train2

Unnamed: 0,aquatic,legs,tail,catsize,feathers,airborne,toothed,venomous
0,1.0,0.5,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.75,0.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.75,0.0,0.0,0.0,1.0,0.0,1.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.5,1.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.75,0.0,0.0,0.0,1.0,0.0,0.0


In [79]:
# Обучаем модели
for model2 in c_models:
    print(model2)
    model2.fit(x_train2, y_train2)

LogisticRegression(penalty='l1', solver='saga')
KNeighborsClassifier()
SVC(kernel='sigmoid')
DecisionTreeClassifier()
DecisionTreeClassifier(splitter='random')




In [80]:
# Оценииваем качество работы моделей на валидационной выборке.
f1s = []
for model2 in c_models:
    val_pred2 = model2.predict(x_val2)
    f1 = f1_score(y_val2, val_pred2, average='weighted')
    f1s.append(f1)
    print(model2, '\t', f1)

LogisticRegression(penalty='l1', solver='saga') 	 0.6262626262626262
KNeighborsClassifier() 	 0.5259259259259259
SVC(kernel='sigmoid') 	 1.0
DecisionTreeClassifier() 	 0.8518518518518519
DecisionTreeClassifier(splitter='random') 	 0.8383838383838383


In [81]:
# Выбираем лучшую модель
i_max = f1s.index(max(f1s))
best_c_model = c_models[i_max]
best_c_model

SVC(kernel='sigmoid')

In [82]:
# Выбираем лучшую модель
i_max = f1s.index(max(f1s))
best_c_model = c_models[i_max]
best_c_model

SVC(kernel='sigmoid')

In [83]:
# Вычислим ошибку лучшей модели на тестовой выборке.
test_pred2 = best_c_model.predict(x_test2)
f1 = f1_score(y_test2, test_pred2, average='weighted')
print(f1)


0.6962962962962963


In [84]:
# Вспоминаем алгоритм решения задачи кластеризации - метод k-средних
# Параметр n_clusters - количество кластеров, по умолчанию равен 8
k_models = []
k_models.append(KMeans(n_clusters=5))
k_models.append(KMeans(n_clusters=8))
k_models.append(KMeans(n_clusters=20))
k_models.append(KMeans(n_clusters=40))

In [85]:
# Выделим используемые параметры
x_labels2 = column_names2[0:-1]
x_data2 = pd.concat([sc_train2[x_labels2], sc_val2[x_labels2], sc_test2[x_labels2]])
x_data2

Unnamed: 0,aquatic,legs,tail,catsize,feathers,airborne,toothed,venomous
0,1.0,0.5,1.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.75,0.0,0.0,0.0,1.0,0.0,0.0
6,0.0,0.75,0.0,0.0,0.0,1.0,0.0,1.0
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.5,1.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.75,0.0,0.0,0.0,1.0,0.0,0.0


In [75]:
# Произведем кластеризацию
for model2 in k_models:
    model2.fit(x_data2)

  This is separate from the ipykernel package so we can avoid doing imports until


In [76]:
# Оценим качество результата
sils = []
for model2 in k_models:
    cluster_labels = model2.predict(x_data2)
    s = silhouette_score(x_data2, cluster_labels)
    sils.append(s)
    print(s)

0.46085792451395485
0.587963739011304
0.7209302325581395
0.7209302325581395


In [77]:
# Выбираем лучшую модель
i_max = sils.index(max(sils))
best_k_model = k_models[i_max]
print(best_k_model)
print(sils[i_max])

KMeans(n_clusters=20)
0.7209302325581395
