# Мое решение Задачи 1 "Изучение критической температуры сверхпроводников"

Данные о сверхпроводниках взяты из базы данных сверхпроводящих материалов, собранной Национальным институтом материаловедения Японии NIMS. 

Данные содержат сведения о **21 263 сверхпроводниках**.

*   Тренировочный набор - **17010** строк.
*   Тестовый набор - **4253** строк.

Для каждого сверхпроводника в данных приведены полная химическая формула сверхпроводника, а также 8 основных химических свойств (абсолютное значение, среднее, взвешенное среднее и так далее): атомная масса, энергия ионизации, радиус атома, плотность, удельная теплота плавления, энергия сродства к электрону, теплопроводность, валентность.


In [1]:
# Импортируем модули
import os
import time
import re
import joblib

# уже установленные библиотеки
import numpy as np
import pandas as pd
import sklearn.metrics as sk_metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt
import seaborn as sns

# импорты из LightAutoML
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task

  from .autonotebook import tqdm as notebook_tqdm


## Считывание данных

In [2]:
# Load the training tables: the property-feature table and the formula table.
train = pd.read_csv("data/train.csv")
formula_train = pd.read_csv("data/formula_train.csv")

# Report both shapes with a single call; the emitted text is the same two lines.
print(
    f"Train dataset shape: {train.shape}",
    f"Train_formula dataset shape: {formula_train.shape}",
    sep="\n",
)

Train dataset shape: (17010, 82)
Train_formula dataset shape: (17010, 88)


In [3]:
# Load the test tables: the property-feature table and the formula table.
test = pd.read_csv("data/test.csv")
formula_test = pd.read_csv("data/formula_test.csv")

# Report both shapes with a single call; the emitted text is the same two lines.
print(
    f"Test dataset shape: {test.shape}",
    f"Test_formula dataset shape: {formula_test.shape}",
    sep="\n",
)

Test dataset shape: (4253, 81)
Test_formula dataset shape: (4253, 87)


In [4]:
# Preview the first rows of the training feature table
train.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.1,0.433013,0.440952,22.0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.0


In [5]:
# Preview the first rows of the training formula table
# (one column per chemical element, plus `critical_temp` and `material`)
formula_train.head()

Unnamed: 0,H,He,Li,Be,B,C,N,O,F,Ne,...,Au,Hg,Tl,Pb,Bi,Po,At,Rn,critical_temp,material
0,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,29.0,Ba0.2La1.8Cu1O4
1,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,26.0,Ba0.1La1.9Ag0.1Cu0.9O4
2,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,19.0,Ba0.1La1.9Cu1O4
3,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,22.0,Ba0.15La1.85Cu1O4
4,0.0,0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,23.0,Ba0.3La1.7Cu1O4


## Исследование данных

Прежде чем переходить к непосредственному построению модели, в первую очередь необходимо исследовать имеющийся у вас набор данных:
1. Какую задачу вы решаете (классификация, регрессия) и какая переменная является целевой?
2. Сколько переменных в данных и какого они типа?
3. Имеются ли в данных пропуски?

In [6]:
# Per-column count of missing values for both training tables.
print(
    "Number of missing values in train datasets:",
    train.isna().sum(),
    formula_train.isna().sum(),
    sep="\n",
)

Number of missing values in train datasets:
number_of_elements       0
mean_atomic_mass         0
wtd_mean_atomic_mass     0
gmean_atomic_mass        0
wtd_gmean_atomic_mass    0
                        ..
range_Valence            0
wtd_range_Valence        0
std_Valence              0
wtd_std_Valence          0
critical_temp            0
Length: 82, dtype: int64
H                0
He               0
Li               0
Be               0
B                0
                ..
Po               0
At               0
Rn               0
critical_temp    0
material         0
Length: 88, dtype: int64


In [7]:
# Per-column count of missing values for both test tables.
print(
    "Number of missing values in test datasets:",
    test.isna().sum(),
    formula_test.isna().sum(),
    sep="\n",
)

Number of missing values in test datasets:
number_of_elements       0
mean_atomic_mass         0
wtd_mean_atomic_mass     0
gmean_atomic_mass        0
wtd_gmean_atomic_mass    0
                        ..
wtd_entropy_Valence      0
range_Valence            0
wtd_range_Valence        0
std_Valence              0
wtd_std_Valence          0
Length: 81, dtype: int64
H           0
He          0
Li          0
Be          0
B           0
           ..
Bi          0
Po          0
At          0
Rn          0
material    0
Length: 87, dtype: int64


In [8]:
# Full per-column summary (non-null counts and dtypes) of the training feature table
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17010 entries, 0 to 17009
Data columns (total 82 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   number_of_elements               17010 non-null  int64  
 1   mean_atomic_mass                 17010 non-null  float64
 2   wtd_mean_atomic_mass             17010 non-null  float64
 3   gmean_atomic_mass                17010 non-null  float64
 4   wtd_gmean_atomic_mass            17010 non-null  float64
 5   entropy_atomic_mass              17010 non-null  float64
 6   wtd_entropy_atomic_mass          17010 non-null  float64
 7   range_atomic_mass                17010 non-null  float64
 8   wtd_range_atomic_mass            17010 non-null  float64
 9   std_atomic_mass                  17010 non-null  float64
 10  wtd_std_atomic_mass              17010 non-null  float64
 11  mean_fie                         17010 non-null  float64
 12  wtd_mean_fie      

## Предобработка данных

### **Тренировочные данные.**

В данной задаче и для обучения, и для теста имеется по два датасета; для удобства объединим каждую пару в один DataFrame.

Так как колонка с целевой переменной имеется в обоих датасетах, удалим её из одного, чтобы не дублировать.

In [9]:
# `critical_temp` is present in both tables; keep only the copy that lives in `train`.
formula_train = formula_train.drop('critical_temp', axis='columns')

# Put the two tables side by side (column-wise concatenation, rows matched by index).
train_full = pd.concat((train, formula_train), axis='columns')

print(f"Full Train dataset shape: {train_full.shape}")

Full Train dataset shape: (17010, 169)


In [10]:
# The `material` column is not needed for modelling; remove it in place.
del train_full['material']

In [11]:
# Preview the combined training table after the merge and the `material` drop
train_full.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,Ir,Pt,Au,Hg,Tl,Pb,Bi,Po,At,Rn
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.97598,122.90607,35.741099,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
3,4,88.944468,57.873967,66.361592,36.11956,1.181795,1.022291,122.90607,33.76801,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0


In [12]:
# Hold out 20% of the rows as a validation split; the seed is fixed so the split is repeatable.
tr_data, valid_data = train_test_split(train_full, test_size=0.2, random_state=42)

print(f'Parts sizes: tr_data = {tr_data.shape}, valid_data = {valid_data.shape}')

Parts sizes: tr_data = (13608, 168), valid_data = (3402, 168)


### **Тестовые данные.**

Обработаем аналогичным образом тестовые данные.

In [13]:
# Put the test feature table and the test formula table side by side
# (column-wise concatenation, rows matched by index).
test_full = pd.concat((test, formula_test), axis='columns')

print(f"Full Test dataset shape: {test_full.shape}")

Full Test dataset shape: (4253, 168)


In [14]:
# The `material` column is not needed for modelling; remove it in place.
del test_full['material']

## Обучение модели

In [15]:
# Regression task that uses mean squared error both as the loss and as the metric.
# Task API reference:
# https://lightautoml.readthedocs.io/en/latest/pages/modules/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task
task = Task('reg', loss='mse', metric='mse')

Не похоже, чтобы колонки были категориальными, поэтому оставляем их как есть — пусть LAMA сама разберётся =)
Исключать какие-либо колонки из рассмотрения тоже пока не будем.

In [16]:
# Column roles passed to the AutoML reader: only the target column is declared explicitly.
roles = dict(target='critical_temp')

In [17]:
# LightGBM-only tabular preset with a three-hour time budget.
# NOTE(review): the training log reports a 4-core CPU limit while the reader is
# given n_jobs=12 — confirm this is intended.
automl = TabularAutoML(
    task=task,
    timeout=3600 * 3,  # seconds; 3600 s = 1 hour
    general_params={'use_algos': [['lgb']]},
    lgb_params={'default_params': {'num_iterations': 5000}},
    reader_params={'n_jobs': 12},
    timing_params={'mode': 0},
)

In [18]:
# Fit the preset on the training split. The return value holds the
# out-of-fold predictions for the rows of `tr_data`.
oof_pred = automl.fit_predict(tr_data, roles=roles, verbose=4)

[17:35:17] Stdout logging level is DEBUG.
[17:35:18] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[17:35:18] Task: reg

[17:35:18] Start automl preset with listed constraints:
[17:35:18] - time: 10800.00 seconds
[17:35:18] - CPU: 4 cores
[17:35:18] - memory: 16 GB

[17:35:18] [1mTrain data shape: (13608, 168)[0m

[17:35:19] Feats was rejected during automatic roles guess: []
[17:35:19] Layer [1m1[0m train process start. Time left 10798.53 secs
[17:35:19] Training until validation scores don't improve for 200 rounds




[17:35:19] [100]	valid's l2: 158.801
[17:35:20] [200]	valid's l2: 115.54
[17:35:20] [300]	valid's l2: 106.012
[17:35:20] [400]	valid's l2: 100.991
[17:35:20] [500]	valid's l2: 97.8789
[17:35:20] [600]	valid's l2: 95.9405
[17:35:21] [700]	valid's l2: 94.1282
[17:35:21] [800]	valid's l2: 93.0594
[17:35:21] [900]	valid's l2: 92.3012
[17:35:21] [1000]	valid's l2: 91.728
[17:35:21] [1100]	valid's l2: 91.1265
[17:35:22] [1200]	valid's l2: 90.9752
[17:35:22] [1300]	valid's l2: 90.6905
[17:35:22] [1400]	valid's l2: 90.603
[17:35:22] [1500]	valid's l2: 90.5929
[17:35:22] [1600]	valid's l2: 90.6202
[17:35:23] Early stopping, best iteration is:
[1460]	valid's l2: 90.5074
[17:35:23] [1mSelector_LightGBM[0m fitting and predicting completed
[17:35:23] Start fitting [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m ...
[17:35:23] Training params: {'task': 'train', 'learning_rate': 0.02, 'num_leaves': 32, 'feature_fraction': 0.9, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_a



[17:35:24] [100]	valid's l2: 148.285
[17:35:24] [200]	valid's l2: 110.483
[17:35:24] [300]	valid's l2: 103.46
[17:35:25] [400]	valid's l2: 99.8276
[17:35:25] [500]	valid's l2: 97.3075
[17:35:25] [600]	valid's l2: 95.5466
[17:35:26] [700]	valid's l2: 94.214
[17:35:26] [800]	valid's l2: 93.3279
[17:35:26] [900]	valid's l2: 92.6963
[17:35:26] [1000]	valid's l2: 92.1804
[17:35:27] [1100]	valid's l2: 91.6926
[17:35:27] [1200]	valid's l2: 91.2684
[17:35:27] [1300]	valid's l2: 90.9965
[17:35:28] [1400]	valid's l2: 90.8226
[17:35:28] [1500]	valid's l2: 90.558
[17:35:28] [1600]	valid's l2: 90.4214
[17:35:28] [1700]	valid's l2: 90.4251
[17:35:29] [1800]	valid's l2: 90.3588
[17:35:29] [1900]	valid's l2: 90.2939
[17:35:29] [2000]	valid's l2: 90.3204
[17:35:30] [2100]	valid's l2: 90.251
[17:35:30] Early stopping, best iteration is:
[1954]	valid's l2: 90.2061
[17:35:30] ===== Start working with [1mfold 1[0m for [1mLvl_0_Pipe_0_Mod_0_LightGBM[0m =====
[17:35:30] Training until validation scores d

Тут есть некоторая тонкость использования методов **automl.fit_predict()** и **automl.predict()**.

Когда automl обучает модели методом **fit_predict()**, это происходит с помощью кросс-валидации (в нашем случае на 5 подвыборках из обучающей выборки).

В процессе этого **automl** будет собирать статистику по фолдам, которые в данный момент являются валидационными и записывать ее.

Поэтому и предсказания на валидационных фолдах, и посчитанные по ним метрики получаются именно из метода automl.fit_predict(обучающая_выборка).

automl.predict(данные) использует уже готовый обученный ансамбль ML-моделей, никакого разбиения на подвыборки тут нет. Его правильно использовать на тестовой выборке — на тех данных, которые не участвовали в обучении модели.

Предсказывать через automl.predict(обучающая_выборка) неправильно: так мы измерим метрику на тех же данных, на которых учили модель. Правильно делать это только на валидационных фолдах при кросс-валидации.

### Сохраним полученный ансамбль моделей в файл

In [19]:
# Files where the fitted AutoML object and its out-of-fold predictions are stored
model_path = 'model/lightautoml_model_1.pkl'
oof_pred_path = 'model/lightautoml_model_1_oof_pred_1.pkl'


In [20]:
# Persist the fitted AutoML object together with the out-of-fold predictions
# returned by fit_predict (predictions for the training rows, not for the test set).
# Create the target directories first so the dump does not fail when they are missing.
os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)
os.makedirs(os.path.dirname(oof_pred_path) or '.', exist_ok=True)

joblib.dump(automl, model_path)
joblib.dump(oof_pred, oof_pred_path)


['model/lightautoml_model_1_oof_pred_1.pkl']

### Загрузим сохраненную ранее модель

In [21]:
# NOTE(review): `porog` ("threshold") is not referenced anywhere else in the visible
# notebook and looks like a leftover — confirm before removing.
porog = 0.5

In [22]:
# Restore the fitted AutoML object and its out-of-fold predictions from disk.
# joblib.load unpickles the file, so only load artifacts produced by this notebook.
automl = joblib.load(model_path)
oof_pred = joblib.load(oof_pred_path)

In [85]:
# Check the fitted ensemble on the held-out validation split
valid_pred = automl.predict(valid_data)

In [96]:
# Take the first column of the prediction array as a flat vector.
y_pred = valid_pred.data[:, 0]

In [94]:
# Ground-truth target values of the validation split as a NumPy array.
y_test = valid_data['critical_temp'].to_numpy()

In [97]:
# Score the validation predictions with MAE, MSE and R2.
# The functions are taken from the `sk_metrics` module alias imported at the top.
print('Mean Absolute Error:', sk_metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', sk_metrics.mean_squared_error(y_test, y_pred))
print('R2 score:', sk_metrics.r2_score(y_test, y_pred))

Mean Absolute Error: 5.308605516434257
Mean Squared Error: 81.30423649828397
R2 score: 0.9285450322630299


## Обучение модели на всей обучающей выборке и предсказание на тестовых данных

In [98]:
# Regression task that uses mean squared error both as the loss and as the metric.
# Task API reference:
# https://lightautoml.readthedocs.io/en/latest/pages/modules/generated/lightautoml.tasks.base.Task.html#lightautoml.tasks.base.Task
task = Task('reg', loss='mse', metric='mse')

In [99]:
# Column roles passed to the AutoML reader: only the target column is declared explicitly.
roles = dict(target='critical_temp')

In [None]:
# Fresh LightGBM-only preset for the final fit, with a three-hour time budget.
# NOTE(review): the training log reports a 4-core CPU limit while the reader is
# given n_jobs=12 — confirm this is intended.
automl = TabularAutoML(
    task=task,
    timeout=3600 * 3,  # seconds; 3600 s = 1 hour
    general_params={'use_algos': [['lgb']]},
    lgb_params={'default_params': {'num_iterations': 5000}},
    reader_params={'n_jobs': 12},
    timing_params={'mode': 0},
)

In [100]:
# Refit the same configuration on all available training rows. The return value
# holds the out-of-fold predictions for `train_full`.
oof_pred = automl.fit_predict(train_full, roles=roles, verbose=4)

[18:44:36] Stdout logging level is DEBUG.
[18:44:36] Task: reg

[18:44:36] Start automl preset with listed constraints:
[18:44:36] - time: 10800.00 seconds
[18:44:36] - CPU: 4 cores
[18:44:36] - memory: 16 GB

[18:44:36] [1mTrain data shape: (17010, 168)[0m

[18:44:38] Feats was rejected during automatic roles guess: []
[18:44:38] Layer [1m1[0m train process start. Time left 10798.32 secs
[18:44:38] Training until validation scores don't improve for 200 rounds




[18:44:39] [100]	valid's l2: 153.841
[18:44:39] [200]	valid's l2: 111.656
[18:44:39] [300]	valid's l2: 101.169
[18:44:39] [400]	valid's l2: 95.5813
[18:44:40] [500]	valid's l2: 92.0799
[18:44:40] [600]	valid's l2: 89.2782
[18:44:40] [700]	valid's l2: 87.2193
[18:44:40] [800]	valid's l2: 85.4849
[18:44:40] [900]	valid's l2: 84.0722
[18:44:41] [1000]	valid's l2: 83.0192
[18:44:41] [1100]	valid's l2: 82.2485
[18:44:41] [1200]	valid's l2: 81.4096
[18:44:41] [1300]	valid's l2: 80.7825
[18:44:41] [1400]	valid's l2: 80.1134
[18:44:42] [1500]	valid's l2: 79.6846
[18:44:42] [1600]	valid's l2: 79.2601
[18:44:42] [1700]	valid's l2: 78.9817
[18:44:42] [1800]	valid's l2: 78.6754
[18:44:42] [1900]	valid's l2: 78.4759
[18:44:43] [2000]	valid's l2: 78.3071
[18:44:43] [2100]	valid's l2: 78.1007
[18:44:43] [2200]	valid's l2: 78.0154
[18:44:43] [2300]	valid's l2: 77.7761
[18:44:44] [2400]	valid's l2: 77.7198
[18:44:44] [2500]	valid's l2: 77.575
[18:44:44] [2600]	valid's l2: 77.533
[18:44:44] [2700]	valid



[18:44:47] [100]	valid's l2: 147.898
[18:44:48] [200]	valid's l2: 113.033
[18:44:48] [300]	valid's l2: 104.653
[18:44:48] [400]	valid's l2: 100.258
[18:44:49] [500]	valid's l2: 97.1549
[18:44:49] [600]	valid's l2: 94.877
[18:44:49] [700]	valid's l2: 93.3393
[18:44:50] [800]	valid's l2: 91.7408
[18:44:50] [900]	valid's l2: 90.4431
[18:44:50] [1000]	valid's l2: 89.4647
[18:44:50] [1100]	valid's l2: 88.4667
[18:44:51] [1200]	valid's l2: 87.9961
[18:44:51] [1300]	valid's l2: 87.3115
[18:44:51] [1400]	valid's l2: 86.7987
[18:44:52] [1500]	valid's l2: 86.4094
[18:44:52] [1600]	valid's l2: 86.1542
[18:44:52] [1700]	valid's l2: 85.7433
[18:44:53] [1800]	valid's l2: 85.4066
[18:44:53] [1900]	valid's l2: 85.2315
[18:44:53] [2000]	valid's l2: 85.0053
[18:44:53] [2100]	valid's l2: 84.8408
[18:44:54] [2200]	valid's l2: 84.7798
[18:44:54] [2300]	valid's l2: 84.5689
[18:44:54] [2400]	valid's l2: 84.5066
[18:44:55] [2500]	valid's l2: 84.3265
[18:44:55] [2600]	valid's l2: 84.206
[18:44:55] [2700]	valid

### Сохраним полученный ансамбль моделей в файл

In [101]:
# Files where the final AutoML object and its out-of-fold predictions are stored
model_path = 'model/lightautoml_model_1_final.pkl'
oof_pred_path = 'model/lightautoml_model_1_oof_pred_final.pkl'

In [102]:
# Persist the final AutoML object together with the out-of-fold predictions
# returned by fit_predict (predictions for the training rows, not for the test set).
# Create the target directories first so the dump does not fail when they are missing.
os.makedirs(os.path.dirname(model_path) or '.', exist_ok=True)
os.makedirs(os.path.dirname(oof_pred_path) or '.', exist_ok=True)

joblib.dump(automl, model_path)
joblib.dump(oof_pred, oof_pred_path)

['model/lightautoml_model_1_oof_pred_final.pkl']

### Загрузим сохранённую ранее модель из файла

In [103]:
# Restore the final AutoML object and its out-of-fold predictions from disk.
# joblib.load unpickles the file, so only load artifacts produced by this notebook.
automl = joblib.load(model_path)
oof_pred = joblib.load(oof_pred_path)

In [106]:
# Number of rows in the out-of-fold prediction array.
oof_pred.data.shape[0]

17010

In [107]:
# Predict on the test data with the fitted ensemble
test_pred = automl.predict(test_full)

In [110]:
# Take the first column of the prediction array as a flat vector;
# the trailing bare name displays it in the notebook.
preds = test_pred.data[:, 0]
preds

array([28.115297 , 27.130775 , 88.709335 , ...,  6.5541735,  1.6765075,
        1.6765075], dtype=float32)

## Предсказание ответа для тестового набора данных

In [111]:
# Build the submission table. reset_index() turns the default row index into an
# `index` column, so the file has two columns: `index` and `critical_temp`.
y_pred_test = pd.DataFrame(preds, columns=['critical_temp'])
y_pred_test = y_pred_test.reset_index()

# Create the output directory first so the write does not fail when it is missing.
os.makedirs("submission", exist_ok=True)
y_pred_test.to_csv("submission/solution.csv", index=False)