In [None]:
from pprint import pprint
from pathlib import Path
import numpy as np  # для чисел и вычислений
import pandas as pd  # для таблиц (как Excel в Python)
import matplotlib.pyplot as plt  # для графиков
import seaborn as sns  # для красивых графиков

from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from clearing import DatasetCleaner  # soft link


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler


In [4]:
MAIN_FOLDER = "/home/arman/it/AI_work/machine/melting_point"
plt.style.use('ggplot')
sns.set_palette("husl")

In [5]:
# Читаем файлы
train_data = pd.read_csv(f"{MAIN_FOLDER}/data/train.csv")  # данные для обучения
test_data = pd.read_csv(f"{MAIN_FOLDER}/data/test.csv")  # данные для теста
sample_sub = pd.read_csv(f"{MAIN_FOLDER}/data/sample_submission.csv")  # шаблон для ответа

# Посмотрим что у нас есть
print(f"Размер обучающих данных: {train_data.shape}")
print(f"Размер тестовых данных: {test_data.shape}")
print("Первые 3 строки обучающих данных:")
print(train_data.head(3))
print("Колонки в данных:")
print(train_data.columns.tolist()[:10], "...")  # первые 10 колонок

Размер обучающих данных: (2662, 427)
Размер тестовых данных: (666, 426)
Первые 3 строки обучающих данных:
     id                       SMILES      Tm  Group 1  Group 2  Group 3  \
0  2175        FC1=C(F)C(F)(F)C1(F)F  213.15        0        0        0   
1  1222  c1ccc2c(c1)ccc3Nc4ccccc4c23  407.15        0        0        0   
2  2994          CCN1C(C)=Nc2ccccc12  324.15        2        1        0   

   Group 4  Group 5  Group 6  Group 7  ...  Group 415  Group 416  Group 417  \
0        0        0        0        0  ...          0          0          0   
1        0        0        0        0  ...          0          0          0   
2        0        0        0        0  ...          0          0          0   

   Group 418  Group 419  Group 420  Group 421  Group 422  Group 423  Group 424  
0          0          0          0          0          0          0          0  
1          0          0          0          0          0          0          0  
2          0          0          

In [6]:
train_cleaner = DatasetCleaner(train_data)

In [21]:
cols_to_remove = ["SMILES"]

for col in train_cleaner.current_df.columns.to_list():
    t = train_cleaner.count_missing_and_zeros(column=col)
    if t[1] == t[2]:
        cols_to_remove.append(col)
        print("DELETE", col, t)
# нулей нет

In [8]:
print(cols_to_remove)

['id', 'SMILES', 'Group 12', 'Group 28', 'Group 46', 'Group 67', 'Group 73', 'Group 74', 'Group 75', 'Group 84', 'Group 85', 'Group 88', 'Group 90', 'Group 101', 'Group 102', 'Group 104', 'Group 150', 'Group 152', 'Group 155', 'Group 158', 'Group 160', 'Group 167', 'Group 183', 'Group 194', 'Group 198', 'Group 206', 'Group 207', 'Group 208', 'Group 209', 'Group 212', 'Group 213', 'Group 214', 'Group 215', 'Group 216', 'Group 217', 'Group 218', 'Group 245', 'Group 247', 'Group 248', 'Group 250', 'Group 252', 'Group 253', 'Group 264', 'Group 280', 'Group 281', 'Group 282', 'Group 285', 'Group 294', 'Group 303', 'Group 306', 'Group 307', 'Group 308', 'Group 309', 'Group 312', 'Group 313', 'Group 316', 'Group 317', 'Group 340', 'Group 342', 'Group 345', 'Group 347', 'Group 348', 'Group 349', 'Group 350', 'Group 352', 'Group 355', 'Group 356', 'Group 357', 'Group 358', 'Group 360', 'Group 363', 'Group 371', 'Group 376', 'Group 377', 'Group 383', 'Group 384', 'Group 385', 'Group 390', 'Group

In [9]:
train_cleaner.find_duplicates()
# дубликатов нет

Unnamed: 0,id,SMILES,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,...,Group 415,Group 416,Group 417,Group 418,Group 419,Group 420,Group 421,Group 422,Group 423,Group 424


In [10]:
for col in cols_to_remove:
    train_cleaner.remove_row_or_column(column=col)
train_cleaner.current_df.head()

Unnamed: 0,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,Group 9,...,Group 406,Group 407,Group 408,Group 409,Group 410,Group 412,Group 414,Group 415,Group 416,Group 418
0,213.15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,407.15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,324.15,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,351.15,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,126.15,2,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
def df_algorithm(df):
    df_cleaner = DatasetCleaner(df)
    for col in cols_to_remove:
        df_cleaner.remove_row_or_column(column=col)
    return df_cleaner.current_df

In [12]:
print(len(cols_to_remove), cols_to_remove)
train_cleaner.save_to_csv(f"{MAIN_FOLDER}/data/prepared_train.csv")

89 ['id', 'SMILES', 'Group 12', 'Group 28', 'Group 46', 'Group 67', 'Group 73', 'Group 74', 'Group 75', 'Group 84', 'Group 85', 'Group 88', 'Group 90', 'Group 101', 'Group 102', 'Group 104', 'Group 150', 'Group 152', 'Group 155', 'Group 158', 'Group 160', 'Group 167', 'Group 183', 'Group 194', 'Group 198', 'Group 206', 'Group 207', 'Group 208', 'Group 209', 'Group 212', 'Group 213', 'Group 214', 'Group 215', 'Group 216', 'Group 217', 'Group 218', 'Group 245', 'Group 247', 'Group 248', 'Group 250', 'Group 252', 'Group 253', 'Group 264', 'Group 280', 'Group 281', 'Group 282', 'Group 285', 'Group 294', 'Group 303', 'Group 306', 'Group 307', 'Group 308', 'Group 309', 'Group 312', 'Group 313', 'Group 316', 'Group 317', 'Group 340', 'Group 342', 'Group 345', 'Group 347', 'Group 348', 'Group 349', 'Group 350', 'Group 352', 'Group 355', 'Group 356', 'Group 357', 'Group 358', 'Group 360', 'Group 363', 'Group 371', 'Group 376', 'Group 377', 'Group 383', 'Group 384', 'Group 385', 'Group 390', 'Gr

In [13]:
train_data = pd.read_csv(f"{MAIN_FOLDER}/data/prepared_train.csv")  # данные для обучения
train_cleaner = DatasetCleaner(train_data)
train_cleaner.current_df.head()

Unnamed: 0,Tm,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,Group 9,...,Group 406,Group 407,Group 408,Group 409,Group 410,Group 412,Group 414,Group 415,Group 416,Group 418
0,213.15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,407.15,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,324.15,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,351.15,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,126.15,2,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
train_train_data_x, train_test_data_x, train_train_data_y, train_test_data_y = train_cleaner.split_data(target_column="Tm")

In [15]:
print(type(train_train_data_x), type(train_train_data_y))
pprint(train_train_data_x.head())
pprint(train_train_data_y.head())

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.series.Series'>
      Group 1  Group 2  Group 3  Group 4  Group 5  Group 6  Group 7  Group 8  \
874         2        8        0        0        0        0        0        0   
2130        2       25        0        0        0        0        0        0   
1882        1        0        0        0        0        0        0        0   
2066        2        4        0        1        0        0        0        0   
1614        0        2        1        0        0        0        0        0   

      Group 9  Group 10  ...  Group 406  Group 407  Group 408  Group 409  \
874         0         0  ...          0          0          0          0   
2130        0         0  ...          0          0          0          0   
1882        0         0  ...          0          0          0          0   
2066        0         0  ...          0          0          0          0   
1614        0         0  ...          0          0          0    

In [16]:
# №5 базовые модели

print("" + "="*50)
print("ШАГ 5: Пробуем разные модели (как разные рабочие)")
print("="*50)

# Создадим несколько моделей
models = {
    'Ridge': Ridge(alpha=1.0, max_iter=10000, random_state=42),
    'Lasso': Lasso(alpha=0.01, max_iter=10000, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=1000, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=1000, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=1000, random_state=42),
}

# Будем хранить результаты
results = {}

print("Начинаем обучение моделей...")
print("-" * 50)

for name, model in models.items():
    print(f"Обучаем модель: {name}")
    
    # Обучаем модель
    model.fit(train_train_data_x, train_train_data_y)
    
    # Делаем предсказания
    train_test_data_y_pred = model.predict(train_test_data_x)
    
    # Оцениваем качество
    mae = mean_absolute_error(train_test_data_y, train_test_data_y_pred)
    r2 = r2_score(train_test_data_y, train_test_data_y_pred)
    
    print(f"  Ошибка на обучении (MAE): {mae:.2f} K")
    print(f"  R² на проверке: {r2:.3f}")




ШАГ 5: Пробуем разные модели (как разные рабочие)
Начинаем обучение моделей...
--------------------------------------------------
Обучаем модель: Ridge


  Ошибка на обучении (MAE): 37.14 K
  R² на проверке: 0.601
Обучаем модель: Lasso
  Ошибка на обучении (MAE): 36.28 K
  R² на проверке: 0.611
Обучаем модель: Random Forest
  Ошибка на обучении (MAE): 36.60 K
  R² на проверке: 0.585
Обучаем модель: Gradient Boosting
  Ошибка на обучении (MAE): 35.75 K
  R² на проверке: 0.633
Обучаем модель: XGBoost
  Ошибка на обучении (MAE): 33.49 K
  R² на проверке: 0.609


In [17]:
test_data = pd.read_csv(f"{MAIN_FOLDER}/data/test.csv")  # данные для теста
test_ids = list(test_data["id"])
print(test_ids)
test_data = df_algorithm(test_data)
test_data.head()

[1022, 1146, 79, 2279, 1342, 2082, 29, 515, 2309, 1177, 553, 179, 1441, 1457, 1085, 2344, 848, 176, 3282, 2743, 2308, 2648, 71, 498, 366, 193, 18, 9, 684, 558, 2831, 2443, 2795, 714, 2626, 613, 2380, 2459, 1619, 2176, 1599, 2992, 1715, 2689, 873, 1039, 3213, 3066, 2702, 1339, 2332, 1758, 2198, 1324, 1386, 1322, 846, 1145, 992, 416, 243, 1197, 2519, 1269, 2632, 1070, 2410, 3037, 2868, 3130, 283, 1891, 2553, 1700, 1719, 2880, 2731, 752, 434, 126, 3252, 2132, 51, 780, 3269, 804, 2028, 555, 3183, 770, 501, 623, 499, 1896, 224, 1620, 2952, 2092, 1602, 290, 1687, 1153, 957, 2468, 3314, 1224, 2517, 921, 964, 1740, 2361, 1660, 2522, 519, 2156, 2159, 3050, 453, 2595, 2704, 436, 130, 81, 1027, 2797, 2669, 2223, 2031, 397, 3010, 2767, 396, 1693, 1432, 1422, 3132, 1247, 1965, 703, 678, 1221, 1638, 3311, 1329, 610, 3198, 2805, 2586, 2635, 1868, 2301, 1994, 1472, 1944, 3107, 1856, 2185, 3312, 1686, 1087, 304, 159, 1782, 582, 423, 101, 3321, 2638, 455, 1910, 1402, 2364, 3047, 2516, 215, 1471, 2806, 2

Unnamed: 0,Group 1,Group 2,Group 3,Group 4,Group 5,Group 6,Group 7,Group 8,Group 9,Group 10,...,Group 406,Group 407,Group 408,Group 409,Group 410,Group 412,Group 414,Group 415,Group 416,Group 418
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,7,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
def make_result(model):
    result = pd.DataFrame({"id": [], "Tm": []})

    for i in range(0, len(test_data)):
        current_id = test_ids[i]
        # Slice the DataFrame
        test_slice = test_data.iloc[i:i+1]
        y = model.predict(test_slice)[0]
        adding_pd = pd.DataFrame({"id": [current_id], "Tm": [y]})
        result = pd.concat([result, adding_pd])

    pprint(result.head())
    result['id'] = result['id'].astype(int)
    DatasetCleaner(result).save_to_csv(f"{MAIN_FOLDER}/data/submission.csv")


In [19]:
make_result(models["Gradient Boosting"])

       id          Tm
0  1022.0  343.272942
0  1146.0  300.060197
0    79.0  206.973832
0  2279.0  206.450729
0  1342.0  235.710930
