<a href="https://colab.research.google.com/github/napchick/mos_real_estate_bot/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
import time
import pickle
from scipy import stats

In [None]:
!pip install optuna
import optuna



# Датасет: https://www.kaggle.com/datasets/egorkainov/moscow-housing-price-dataset

In [3]:
data2 = pd.read_csv('data 2.csv')

In [4]:
data2

Unnamed: 0,Price,Apartment type,Metro station,Minutes to metro,Region,Number of rooms,Area,Living area,Kitchen area,Floor,Number of floors,Renovation
0,6300000.0,Secondary,Опалиха,6.0,Moscow region,1.0,30.60,11.1,8.5,25.0,25,Cosmetic
1,9000000.0,Secondary,Павшино,2.0,Moscow region,1.0,49.20,20.0,10.0,6.0,15,European-style renovation
2,11090000.0,Secondary,Мякинино,14.0,Moscow region,1.0,44.70,16.2,13.1,10.0,25,Cosmetic
3,8300000.0,Secondary,Строгино,8.0,Moscow region,1.0,35.10,16.0,11.0,12.0,33,European-style renovation
4,6450000.0,Secondary,Опалиха,6.0,Moscow region,1.0,37.70,15.2,4.0,5.0,5,Without renovation
...,...,...,...,...,...,...,...,...,...,...,...,...
22671,4768792.0,New building,Котельники,8.0,Moscow region,0.0,31.75,13.0,5.0,4.0,17,Cosmetic
22672,5900000.0,New building,Домодедовская,25.0,Moscow region,1.0,31.60,10.1,12.2,11.0,15,Cosmetic
22673,3700000.0,New building,Котельники,30.0,Moscow region,0.0,18.00,15.0,8.1,17.0,17,Cosmetic
22674,5768869.0,New building,Жулебино,14.0,Moscow region,2.0,36.39,22.0,6.6,12.0,14,Cosmetic


# Работа с данными

In [None]:
# проверка на наличие пропусков
data2.isna().sum()

Unnamed: 0,0
Price,0
Apartment type,0
Metro station,0
Minutes to metro,0
Region,0
Number of rooms,0
Area,0
Living area,0
Kitchen area,0
Floor,0


In [5]:
# переименуем столбцы для более удобной работы
data2 = data2.rename(columns = {'Apartment type' : 'is_new', 'Metro station' : 'metro', 'Minutes to metro' : 'minutes', 'Number of rooms' : 'rooms', 'Living area' : 'liv_area',
                              'Kitchen area' : 'kit_area', 'Number of floors' : 'num_of_floors', 'Price' : 'price', 'Region':'is_moscow', 'Floor':'floor', 'Renovation':'renovation', 'Area' : 'area'})

# выбросим дубликаты
data2 = data2.drop_duplicates()
# закодируем столбцы, которые имеют только 2 значения
data2['is_new'] = data2['is_new'].apply(lambda x: 1 if x == 'New building' else 0)
data2['is_moscow'] = data2['is_moscow'].apply(lambda x: 1 if x == 'Moscow' else 0)

In [None]:
# уберем выбросы
data2 = data2[(np.abs(stats.zscore(data2['price'])) < 3)]

In [None]:
data2['metro'] = data2['metro'].str.lower().str.strip()

In [None]:
# создадим списки административных округов москвы и заполним их станциями метро
vao = ["aвиамоторная", "андроновка", "белокаменная", "бульвар рокоссовского", "выхино", "измайлово", "измайловская", "кусоково", "локомотив", "лухмановская", "митьково", "новогиреево", "новокосино", "партизанская", "первомайская", "перово", "преображенская площадь", "семеновская", "соколиная гора", "сокольники", "улица дмитриевского", "черкизовская", "шоссе энтузиастов", "щелковская", "электрозаводская", "щёлковская"]
zao = ["филёвский парк", "немчиновка", "тропарёво", "аминьевская", "аэропорт внуково", "багратионовская", "боровское шоссе", "давыдково", "киевская", "кропоткинская", "крылатское", "кунцевская", "кутузовская", "ломоносовский проспект", "матвеевская", "мещерская", "минская", "мичуринский проспект", "молодежная", "молодёжная", "новаторская", "новопеределкино", "озёрная", "очаково", "парк победы", "переделкино", "пионерская", "поклонная", "проспект вернадского", "рабочий посёлок", "раменки", "сетунь", "славянский бульвар", "солнечная", "солнцево", "студенческая", "тропарево", "филевский парк", "фили", "юго-западная"]
nmao = ["коммунарка", "внуково", "говорово", "кокошкино", "крёкшино", "мичиуринец", "новомосковская", "ольховая", "остафьево", "потапово", "прокшино", "пыхтино", "рассказовка", "румянцево", "саларьево", "санино", "толстопальцево", "тютчевская", "филатов луг", "щербинка"]
sao = ["новодачная","аэропорт", "балтийская", "беговая", "беломорская", "бескудниково", "верхние лихоборы", "водный стадион", "войковская", "гражданская", "грачёвская", "дегунино", "динамо", "дмитровская", "зорге", "коптево", "красный балтиец", "лихоборы", "марк", "молжаниново", "москова-сити", "моссельмаш", "новоподрезково", "окружная", "панфиловская", "петровский парк", "петровско-разумовская", "полежаевская", "речной вокзал", "савеловская", "селигерская", "сокол", "стрешнево", "тимирязевская", "ховрино", "хорошёво", "хорошево", "хорошёвская", "цска", "яхромская"]
svao = ["марьина роща (шереметьевская)","алексеевская", "алтуфьево", "бабушкинская", "бибирево", "ботанический сад", "бутырская", "вднх", "владыкино", "выставочный центр", "лианозово", "марьина роща", "медведково", "окружная", "останкино", "отрадное", "ростокино", "савеловская", "савёловская", "свиблово", "телецентр", "улица академика королёва", "улица милашенкова", "улица сергея эйзенштейна", "физтех", "фонвизинская"]
szao = ["нахабино", "аникеевка", "мякинино", "красногорская","павшино", "опалиха", "волоколамская", "митино", "мнёвники", "народное ополчение", "октябрьское поле", "пенягино", "планерная", "пятницкое шоссе", "спартак", "строгино", "сходненская", "терехово", "трикотажная", "тушинская", "щукинская", "карамышевская"]
cao = ["международная", "деловой центр","александровский сад", "арбатская", "баррикадная", "бауманская", "беговая", "белорусская", "библиотека имени ленина", "боровицкая", "воробьевы горы", "выставочная", "добрынинская", "достоевская", "китай-город", "комсомольская", "краснопресненская", "красносельская", "красные ворота", "крестьянская застава", "кузнецкий мост", "курская", "лубянка", "лужники", "марксистская", "маяковская", "менделеевская", "москва-сити", "новокузнецкая", "новослободская", "октябрьская", "охотный ряд", "павелецкая", "парк культуры", "площадь ильича", "площадь революции", "площадь трёх вокзалов", "полянка", "пролетарская", "проспект мира", "пушкинская", "рижская", "римская", "серп и молот", "серпуховская", "смоленская", "спортивная", "сретенский бульвар", "сухаревская", "таганская", "тверская", "театральная", "тестовская", "третьяковская", "трубная", "тургеневская", "улица 1905 года", "фрунзенская", "цветной бульвар", "чеховская", "чистые пруды", "чкаловская", "шелепиха", "электрозаводская", "воробьёвы горы", "библиотека им. ленина", "библиотека и ленина"]
uvao = ["авиамоторная", "андроновка", "братиславская", "битца", "волгоградский проспект", "волжская", "выхино", "депо", "дубровка", "жулебино", "калитники", "кожуховская", "косино", "котельники", "кузьминки", "курьяново", "лермонтовский проспект", "лефортово", "люблино", "марьино", "москва-товарная", "некрасовка", "нижегородская", "новохохловская", "окская", "перерва", "печатники", "плющево", "рязанский проспект", "сортировочная", "стахановская", "текстильщики", "угрешская", "чухлинка", "юго-восточная"]
uzao = ["тёплый стан", "силикатная","академическая", "беляево", "битцевский парк", "бульвар адмирала ушакова", "бульвар дмитрия донского", "бунинская аллея", "бутово", "воронцовская", "генерала тюленева", "зюзино", "калужская", "каховская", "коньково", "нахимовский проспект", "новаторская", "новоясеневская", "новые черемушки", "площадь гагарина", "профсоюзная", "севастопольская", "теплый стан", "улица горчакова", "улица скобелевская", "улица старокачаловская", "университет", "университет дружбы народов", "ясенево", "новые черёмушки"]
uao = ["варшавская (коломенское)", "автозаводская", "автозаводская", "алма-атинская", "аннино", "борисово", "варшавская", "верхние котлы", "домодедовская", "зил", "зябликово", "кантемировская", "каширская", "каширская", "кленовый бульвар", "коломенская", "красногвардейская", "красный строитель", "крымская", "ленинский проспект", "лесопарковая", "москворечье", "нагатинская", "нагатинский затон", "нагорная", "орехово", "покровское", "пражская", "технопарк", "тульская", "улица академика янгеля", "царицыно", "царицыно", "чертановская", "шаболовская", "шипиловская", "южная"]





In [None]:
# сделаем словарь с административными районами
rayons = {'vao' : vao, 'zao' : zao, 'nmao' : nmao, 'sao' : sao, 'svao' : svao,
          'szao' : szao, 'cao' : cao, 'uvao' : uvao, 'uzao' : uzao, 'uao' : uao}

In [None]:
# создадим новый столбец для определения административного района квартиры
data2['district'] = '-'

In [None]:
# функция, которая по метро определяет ее АО
def find(value):
  for ray in rayons:
    if value in rayons[ray]:
      return ray

data2['district'] = data2['metro'].apply(find)

In [None]:
# удалим столбец с живой площадью, тк он линейно зависим с общей площадью
data2.drop(columns=['liv_area'], inplace=True)

In [None]:
data2

Unnamed: 0,price,is_new,metro,minutes,is_moscow,rooms,area,kit_area,floor,num_of_floors,renovation,district
0,6300000.0,0,опалиха,6.0,0,1.0,30.60,8.5,25.0,25,Cosmetic,szao
1,9000000.0,0,павшино,2.0,0,1.0,49.20,10.0,6.0,15,European-style renovation,szao
2,11090000.0,0,мякинино,14.0,0,1.0,44.70,13.1,10.0,25,Cosmetic,szao
3,8300000.0,0,строгино,8.0,0,1.0,35.10,11.0,12.0,33,European-style renovation,szao
4,6450000.0,0,опалиха,6.0,0,1.0,37.70,4.0,5.0,5,Without renovation,szao
...,...,...,...,...,...,...,...,...,...,...,...,...
22670,5884024.0,1,котельники,8.0,0,1.0,44.17,10.3,4.0,17,Cosmetic,uvao
22672,5900000.0,1,домодедовская,25.0,0,1.0,31.60,12.2,11.0,15,Cosmetic,uao
22673,3700000.0,1,котельники,30.0,0,0.0,18.00,8.1,17.0,17,Cosmetic,uvao
22674,5768869.0,1,жулебино,14.0,0,2.0,36.39,6.6,12.0,14,Cosmetic,uvao


In [None]:
#в москве
data2[data2['is_moscow'] == 1].groupby(['district'])['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
cao,2118.0,84761070.0,67928590.0,2390000.0,29000000.0,64427206.5,126429202.0,272402200.0
nmao,664.0,13450370.0,7332338.0,2650000.0,8487500.0,11495000.0,15912500.0,85000000.0
sao,1530.0,29651080.0,32411520.0,2600000.0,11500000.0,18990000.0,33597725.0,250000000.0
svao,764.0,17885290.0,16176760.0,2700000.0,8512067.5,13800000.0,21000000.0,165000000.0
szao,627.0,33958520.0,41312890.0,3600000.0,13495000.0,19900000.0,35999500.0,258000000.0
uao,1908.0,22256610.0,19775130.0,1150000.0,12202617.5,16479744.0,24000000.0,230000000.0
uvao,3426.0,10726980.0,5156017.0,1420000.0,7968030.0,10016580.0,12001582.5,56000000.0
uzao,759.0,23694800.0,27824090.0,1750000.0,10000000.0,15200000.0,24920000.0,223807250.0
vao,758.0,15736520.0,13657720.0,2799000.0,8699250.0,12500000.0,17500000.0,103891410.0
zao,1798.0,55329530.0,57827000.0,2900000.0,16402500.0,32250000.0,69900000.0,270000000.0


Подели Москву на 3 района:

- zao, cao, szao
- sao, uao, uzao
- nmao, uvao, vao, svao


In [None]:
#в области
data2[data2['is_moscow'] == 0].groupby(['district'])['price'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
district,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
szao,634.0,10789640.0,4432776.0,4200000.0,8006426.0,10195000.0,12385955.25,53000000.0
uao,3339.0,6542585.0,2200947.0,1939125.0,5182880.5,6326844.0,7662490.0,18877250.0
uvao,1986.0,6724485.0,2174600.0,2261523.0,5261100.0,6251150.0,7821967.5,24340000.0
uzao,109.0,6596419.0,2059595.0,3386249.0,5428951.0,6071913.0,7413824.0,16500000.0


In [None]:
# Делим наши данные на районы
moscow_high = data2[(data2['is_moscow'] == 1) & ((data2['district'] == 'zao') | (data2['district'] == 'cao'))]
moscow_medium = data2[(data2['is_moscow'] == 1) & ((data2['district'] == 'sao') | (data2['district'] == 'szao') | (data2['district'] == 'uzao') | (data2['district'] == 'uao'))]
moscow_low = data2[(data2['is_moscow'] == 1) & ((data2['district'] == 'nmao') | (data2['district'] == 'uvao') | (data2['district'] == 'vao') | (data2['district'] == 'svao'))]
region = data2[data2['is_moscow'] == 0]


In [None]:
# удаляем столбцы индекс москвы и. метро, тк они больше не нужны
moscow_high.drop(columns=['is_moscow', 'metro'], inplace=True)
moscow_medium.drop(columns=['is_moscow', 'metro'], inplace=True)
moscow_low.drop(columns=['is_moscow', 'metro'], inplace=True)
region.drop(columns=['is_moscow', 'metro', 'district'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  moscow_high.drop(columns=['is_moscow', 'metro'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  moscow_medium.drop(columns=['is_moscow', 'metro'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  moscow_low.drop(columns=['is_moscow', 'metro'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

In [None]:
prices = pd.concat([moscow_high['price'].describe(), moscow_medium['price'].describe(), moscow_low['price'].describe(), region['price'].describe() ], axis=1)

In [None]:
# сравним цены в разных районах
prices.columns = ['high', 'medium', 'low', 'region']
prices

Unnamed: 0,high,medium,low,region
count,3916.0,4824.0,5612.0,6068.0
mean,71247810.0,26349110.0,12700340.0,7046829.0
std,65155130.0,29147520.0,9522664.0,2824807.0
min,2390000.0,1150000.0,1420000.0,1939125.0
25%,21175000.0,11900000.0,8059320.0,5325677.0
50%,47900000.0,17000000.0,10765880.0,6580620.0
75%,100000000.0,28027500.0,14000000.0,8112109.0
max,272402200.0,258000000.0,165000000.0,53000000.0


In [None]:
moscow_high.to_csv('moscow_high.csv')
moscow_medium.to_csv('moscow_medium.csv')
moscow_low.to_csv('moscow_low.csv')
region.to_csv('region.csv')

# Catboost для moscow_high

In [None]:
!pip uninstall catboost
!pip install catboost

[0mCollecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import catboost
from catboost import *

In [None]:
X_ch = moscow_high.drop(columns=['price'])
y_ch = moscow_high.price

X_train_high_c, X_test_high_c, y_train_high_c, y_test_high_c = train_test_split(X_ch, y_ch, train_size = 0.7, shuffle=True, random_state=13)
cat_features = [7, 8]

In [None]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    loss_function='Quantile:alpha=0.25',
    use_best_model=True,
    random_seed=13
)
model.fit(
    X_train_high_c, y_train_high_c,
    cat_features=cat_features,
    eval_set=(X_test_high_c, y_test_high_c),
    verbose=False
)
print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())

Model is fitted: True
Model params:
{'iterations': 1000, 'learning_rate': 0.1, 'loss_function': 'Quantile:alpha=0.25', 'random_seed': 13, 'use_best_model': True}


In [None]:
# функция, которая обучает модель и выводит ошибку
def mlf_c(alg, X_train, X_test, y_train, y_test, cat_features, early_stopping_rounds=50):

  #Fit the algorithm on the data
  alg.fit(
    X_train, y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    verbose=False
  )

  #Predict training set:
  dtrain_predictions = alg.predict(X_train)

  #Print model report:
  print("\nModel Report")
  print(f"RMSE for train: {metrics.root_mean_squared_error(y_train, dtrain_predictions)}")
  print(f"r2 for train: {metrics.r2_score(y_train, dtrain_predictions)}, \n")

  #Predict on testing data:
  dtest_predictions = alg.predict(X_test)
  print(f"RMSE for test: {metrics.root_mean_squared_error(y_test, dtest_predictions)}")
  print(f"r2 for test: {metrics.r2_score(y_test, dtest_predictions)}")

In [None]:
mlf_c(model, X_train_high_c, X_test_high_c, y_train_high_c, y_test_high_c, cat_features)


Model Report
RMSE for train: 34464417.160647705
r2 for train: 0.7215839238111983, 

RMSE for test: 38121996.72611432
r2 for test: 0.653248603561732


In [None]:
# функция библиотеки optuna, которая позволяет перебирать различные показатели параметров и находить лучшее сочетание
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 500, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "depth": trial.suggest_int("depth", 7, 13),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 45),
    }

    model = CatBoostRegressor(**params, loss_function='Quantile:alpha=0.7', silent=True, random_seed=13)
    model.fit(X_train_high_c, y_train_high_c, cat_features=cat_features, eval_set=(X_test_high_c, y_test_high_c))
    predictions = model.predict(X_test_high_c)
    rmse = metrics.mean_squared_error(y_test_high_c, predictions, squared=False)
    return rmse

In [None]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

[I 2024-12-10 15:26:14,789] A new study created in memory with name: no-name-80029f44-bd38-4f62-9ad8-00116ff2b84c
[I 2024-12-10 15:26:20,054] Trial 0 finished with value: 33488972.249540295 and parameters: {'iterations': 723, 'learning_rate': 0.014031421737107834, 'depth': 8, 'subsample': 0.7337361427745361, 'colsample_bylevel': 0.9637541778906107, 'min_data_in_leaf': 26}. Best is trial 0 with value: 33488972.249540295.
[I 2024-12-10 15:26:54,860] Trial 1 finished with value: 35798956.43484274 and parameters: {'iterations': 897, 'learning_rate': 0.0014843787882878462, 'depth': 13, 'subsample': 0.7459788645410093, 'colsample_bylevel': 0.65961666024923, 'min_data_in_leaf': 33}. Best is trial 0 with value: 33488972.249540295.
[I 2024-12-10 15:27:57,767] Trial 2 finished with value: 32865122.38397221 and parameters: {'iterations': 981, 'learning_rate': 0.002404553507865596, 'depth': 13, 'subsample': 0.6553589995246654, 'colsample_bylevel': 0.971477544172124, 'min_data_in_leaf': 17}. Best i

In [None]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

Best hyperparameters: {'iterations': 634, 'learning_rate': 0.060779009831171696, 'depth': 13, 'subsample': 0.782229238657602, 'colsample_bylevel': 0.6153297786329257, 'min_data_in_leaf': 10}
Best RMSE: 30056297.482409194


In [None]:
################# itog ########################
model2 = CatBoostRegressor(iterations=1187, learning_rate=0.02819599939563895, depth=13, colsample_bylevel=0.9493374291903622,
                          subsample = 0.7306108384522604, min_data_in_leaf=17, loss_function='Quantile:alpha=0.7',
                           use_best_model=True, random_seed=13)
mlf_c(model2, X_train_high_c, X_test_high_c, y_train_high_c, y_test_high_c, cat_features)


Model Report
RMSE for train: 12971487.376599036
r2 for train: 0.960560446787156, 

RMSE for test: 30027005.21001366
r2 for test: 0.7848749531348375


In [None]:
# сохраняем модель в файл
filename = 'catboost_high'
pickle.dump(model2, open(filename, 'wb'))

In [None]:
# проверка работы файла с моделью
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_high_c)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_high_c, y_pred)))

root mean squared error :  30027005.21001366


#Catboost для moscow_medium

In [None]:
X_cm = moscow_medium.drop(columns=['price'])
y_cm = moscow_medium.price

X_train_medium_c, X_test_medium_c, y_train_medium_c, y_test_medium_c = train_test_split(X_cm, y_cm, train_size = 0.7, shuffle=True, random_state=13)
cat_features = [7, 8]

In [None]:
model_cm = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    loss_function='Quantile:alpha=0.75',
    use_best_model=True,
    random_seed=13
)
model_cm.fit(
    X_train_medium_c, y_train_medium_c,
    cat_features=cat_features,
    eval_set=(X_test_medium_c, y_test_medium_c),
    verbose=False
)
print('Model is fitted: ' + str(model_cm.is_fitted()))
print('Model params:')
print(model_cm.get_params())

Model is fitted: True
Model params:
{'iterations': 1000, 'learning_rate': 0.1, 'loss_function': 'Quantile:alpha=0.75', 'random_seed': 13, 'use_best_model': True}


In [None]:
mlf_c(model_cm, X_train_medium_c, X_test_medium_c, y_train_medium_c, y_test_medium_c, cat_features)


Model Report
RMSE for train: 10521008.840880152
r2 for train: 0.930705072679851, 

RMSE for test: 14514808.326629099
r2 for test: 0.8535167624101687


In [None]:
def objective_cm(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "depth": trial.suggest_int("depth", 3, 15),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 55),
    }

    model = CatBoostRegressor(**params, loss_function='Quantile:alpha=0.7', silent=True,  random_seed=13)
    model.fit(X_train_medium_c, y_train_medium_c, cat_features=cat_features, eval_set=(X_test_medium_c, y_test_medium_c))
    predictions = model.predict(X_test_medium_c)
    rmse = metrics.mean_squared_error(y_test_medium_c, predictions, squared=False)
    return rmse

In [None]:
study_cm = optuna.create_study(direction='minimize')
study_cm.optimize(objective_cm, n_trials=30)

[I 2024-12-10 19:24:00,064] A new study created in memory with name: no-name-3152bf0f-f508-4c86-a4c0-575db698b38e
[I 2024-12-10 19:25:44,017] Trial 0 finished with value: 12269975.81640367 and parameters: {'iterations': 428, 'learning_rate': 0.09439360147381169, 'depth': 14, 'subsample': 0.6620871162499453, 'colsample_bylevel': 0.7571423432296305, 'min_data_in_leaf': 13}. Best is trial 0 with value: 12269975.81640367.
[I 2024-12-10 19:25:45,068] Trial 1 finished with value: 23646371.061598014 and parameters: {'iterations': 433, 'learning_rate': 0.0005090892720085384, 'depth': 3, 'subsample': 0.8818828220714919, 'colsample_bylevel': 0.71301094418426, 'min_data_in_leaf': 2}. Best is trial 0 with value: 12269975.81640367.
[I 2024-12-10 19:25:57,888] Trial 2 finished with value: 24771437.093034305 and parameters: {'iterations': 1071, 'learning_rate': 0.00010119649590268733, 'depth': 9, 'subsample': 0.7109311181233828, 'colsample_bylevel': 0.695292049361063, 'min_data_in_leaf': 5}. Best is 

In [None]:
print('Best hyperparameters:', study_cm.best_params)
print('Best RMSE:', study_cm.best_value)

Best hyperparameters: {'iterations': 933, 'learning_rate': 0.020555797321524513, 'depth': 11, 'subsample': 0.993428256907132, 'colsample_bylevel': 0.6615245306141675, 'min_data_in_leaf': 19}
Best RMSE: 10804278.551622849


In [None]:
################# itog ########################
model_cm1 = CatBoostRegressor(iterations=933, learning_rate=0.020555797321524513, depth=11, colsample_bylevel=0.6615245306141675,
                          subsample = 0.993428256907132, min_data_in_leaf=19, loss_function='Quantile:alpha=0.7',
                             use_best_model=True, random_seed=13)
mlf_c(model_cm1, X_train_medium_c, X_test_medium_c, y_train_medium_c, y_test_medium_c, cat_features)


Model Report
RMSE for train: 6694789.273850664
r2 for train: 0.950812142841678, 

RMSE for test: 10804278.551622849
r2 for test: 0.8342309826192591


In [None]:
filename = 'catboost_medium'
pickle.dump(model_cm1, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_medium_c)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_medium_c, y_pred)))

root mean squared error :  10804278.551622849


# Catboost для moscow_low

In [None]:
X_cl = moscow_low.drop(columns=['price'])
y_cl = moscow_low.price

X_train_low_c, X_test_low_c, y_train_low_c, y_test_low_c = train_test_split(X_cl, y_cl, train_size = 0.7, shuffle=True, random_state=13)
cat_features = [7, 8]

In [None]:
model_cl = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    use_best_model=True,
    random_seed=13
)
model_cl.fit(
    X_train_low_c, y_train_low_c,
    cat_features=cat_features,
    eval_set=(X_test_low_c, y_test_low_c),
    verbose=False
)
print('Model is fitted: ' + str(model_cl.is_fitted()))
print('Model params:')
print(model_cl.get_params())

Model is fitted: True
Model params:
{'iterations': 1000, 'learning_rate': 0.1, 'loss_function': 'RMSE', 'random_seed': 13, 'use_best_model': True}


In [None]:
mlf_c(model_cl, X_train_low_c, X_test_low_c, y_train_low_c, y_test_low_c, cat_features)

NameError: name 'model_cl' is not defined

In [None]:
def objective_cl(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 300, 650),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 5, 11),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.7, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 25),
    }

    model = CatBoostRegressor(**params, loss_function='Quantile:alpha=0.7', silent=True, random_seed=13)
    model.fit(X_train_low_c, y_train_low_c, cat_features=cat_features, eval_set=(X_test_low_c, y_test_low_c))
    predictions = model.predict(X_test_low_c)
    rmse = metrics.mean_squared_error(y_test_low_c, predictions, squared=False)
    return rmse

In [None]:
study_cl = optuna.create_study(direction='minimize')
study_cl.optimize(objective_cl, n_trials=30)

[I 2024-12-10 20:33:45,595] A new study created in memory with name: no-name-5da96df7-ab9f-436f-ad9e-4c389fc43ca3
[I 2024-12-10 20:33:53,716] Trial 0 finished with value: 5270124.45834105 and parameters: {'iterations': 416, 'learning_rate': 0.003594737322683029, 'depth': 9, 'subsample': 0.8576353980884295, 'colsample_bylevel': 0.9457291707149595, 'min_data_in_leaf': 6}. Best is trial 0 with value: 5270124.45834105.
[I 2024-12-10 20:33:58,530] Trial 1 finished with value: 3431535.0947035034 and parameters: {'iterations': 354, 'learning_rate': 0.019898070899084533, 'depth': 9, 'subsample': 0.6980919859945106, 'colsample_bylevel': 0.9278624059993491, 'min_data_in_leaf': 6}. Best is trial 1 with value: 3431535.0947035034.
[I 2024-12-10 20:34:04,270] Trial 2 finished with value: 3037490.819390141 and parameters: {'iterations': 378, 'learning_rate': 0.052998521211085066, 'depth': 10, 'subsample': 0.8067644006212005, 'colsample_bylevel': 0.900255049901469, 'min_data_in_leaf': 14}. Best is tri

In [None]:
print('Best hyperparameters:', study_cl.best_params)
print('Best RMSE:', study_cl.best_value)

Best hyperparameters: {'iterations': 511, 'learning_rate': 0.0554777497089571, 'depth': 7, 'subsample': 0.7485132388324797, 'colsample_bylevel': 0.8968616894114306, 'min_data_in_leaf': 15}
Best RMSE: 2955016.5138541143


In [None]:
################# itog ########################
model_cl1 = CatBoostRegressor(iterations=382, learning_rate=0.08473905607043009, depth=7, colsample_bylevel=0.9086604158965087,
                          subsample = 0.7101117118400826, min_data_in_leaf=7, loss_function='Quantile:alpha=0.7',
                             use_best_model=True, random_seed=13)
mlf_c(model_cl1, X_train_low_c, X_test_low_c, y_train_low_c, y_test_low_c, cat_features)


Model Report
RMSE for train: 2022651.3334671925
r2 for train: 0.953799243207533, 

RMSE for test: 2866820.124187761
r2 for test: 0.9139270653296961


In [None]:
filename = 'catboost_low'
pickle.dump(model_cl1, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_low_c)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_low_c, y_pred)))

root mean squared error :  2866820.124187761


# Catboost для region

In [None]:
X_c = region.drop(columns=['price'])
y_c = region.price

X_train_region_c, X_test_region_c, y_train_region_c, y_test_region_c = train_test_split(X_c, y_c, train_size = 0.7, shuffle=True, random_state=13)
cat_features = [7]

In [None]:
model_r = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    use_best_model=True,
    random_seed=13
)
model_r.fit(
    X_train_region_c, y_train_region_c,
    cat_features=cat_features,
    eval_set=(X_test_region_c, y_test_region_c),
    verbose=False
)
print('Model is fitted: ' + str(model_r.is_fitted()))
print('Model params:')
print(model_r.get_params())

Model is fitted: True
Model params:
{'iterations': 1000, 'learning_rate': 0.1, 'loss_function': 'RMSE', 'random_seed': 13, 'use_best_model': True}


In [None]:
mlf_c(model_r, X_train_region_c, X_test_region_c, y_train_region_c, y_test_region_c, cat_features)


Model Report
RMSE for train: 460707.21664547356
r2 for train: 0.9725563829041587, 

RMSE for test: 778353.0279273406
r2 for test: 0.9290729293799344


In [None]:
def objective_cr(trial):
    params = {
        "iterations": trial.suggest_int('iterations', 300, 800),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "depth": trial.suggest_int("depth", 3, 12),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 0.9),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 15, 60),
    }

    model = CatBoostRegressor(**params,loss_function='Quantile:alpha=0.7', silent=True, random_seed=13)
    model.fit(X_train_region_c, y_train_region_c, cat_features=cat_features, eval_set=(X_test_region_c, y_test_region_c))
    predictions = model.predict(X_test_region_c)
    rmse = metrics.mean_squared_error(y_test_region_c, predictions, squared=False)
    return rmse

In [None]:
study_cr = optuna.create_study(direction='minimize')
study_cr.optimize(objective_cr, n_trials=30)

[I 2024-12-11 10:16:13,949] A new study created in memory with name: no-name-bead46c7-51d2-4315-95b9-581e8b226e85
[I 2024-12-11 10:16:15,491] Trial 0 finished with value: 1210291.2157128486 and parameters: {'iterations': 748, 'learning_rate': 0.0418901865009603, 'depth': 3, 'subsample': 0.6059974960283089, 'colsample_bylevel': 0.6240731342041167, 'min_data_in_leaf': 25}. Best is trial 0 with value: 1210291.2157128486.
[I 2024-12-11 10:16:16,999] Trial 1 finished with value: 1254052.415735851 and parameters: {'iterations': 658, 'learning_rate': 0.018690310621020063, 'depth': 4, 'subsample': 0.833011980026489, 'colsample_bylevel': 0.526594178082215, 'min_data_in_leaf': 56}. Best is trial 0 with value: 1210291.2157128486.
[I 2024-12-11 10:16:18,171] Trial 2 finished with value: 1497928.0331928143 and parameters: {'iterations': 597, 'learning_rate': 0.006326864786582099, 'depth': 3, 'subsample': 0.8415567526939907, 'colsample_bylevel': 0.5557313792817732, 'min_data_in_leaf': 39}. Best is t

In [None]:
print('Best hyperparameters:', study_cr.best_params)
print('Best RMSE:', study_cr.best_value)

Best hyperparameters: {'iterations': 642, 'learning_rate': 0.09929711100490832, 'depth': 11, 'subsample': 0.5901319897111447, 'colsample_bylevel': 0.7657985346856296, 'min_data_in_leaf': 15}
Best RMSE: 949262.0563354484


In [None]:
################# itog ########################
model_r2 = CatBoostRegressor(iterations=489, learning_rate=0.07030189179042852, depth=10, colsample_bylevel=0.8247620784036073,
                          subsample = 0.754756387433371, min_data_in_leaf=24, loss_function='Quantile:alpha=0.7',
                            use_best_model=True, random_seed=13)
mlf_c(model_r2, X_train_region_c, X_test_region_c, y_train_region_c, y_test_region_c, cat_features)


Model Report
RMSE for train: 604162.6960539407
r2 for train: 0.952804658446056, 

RMSE for test: 925287.234766578
r2 for test: 0.8997667237109657


In [None]:
filename = 'catboost_region'
pickle.dump(model_r2, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_region_c)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_region_c, y_pred)))

root mean squared error :  925287.234766578


# Lightgbm для moscow_high

In [None]:
!pip install lightgbm
!arch -arm64 brew install libomp

arch: invalid option -- 'a'
Try 'arch --help' for more information.


In [None]:
import lightgbm as lgb
from lightgbm import LGBMRegressor

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
moscow_high_l = pd.get_dummies(moscow_high, columns=['renovation', 'district'], drop_first=True, dtype='int64')
X_lh = moscow_high_l.drop(columns=['price'])
y_lh = moscow_high_l.price

X_train_high_l, X_test_high_l, y_train_high_l, y_test_high_l = train_test_split(X_lh, y_lh, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
X_lh

Unnamed: 0,is_new,minutes,rooms,area,kit_area,floor,num_of_floors,renovation_Designer,renovation_European-style renovation,renovation_Without renovation,district_zao
314,0,5.0,3.0,108.8,15.6,3.0,3,0,0,1,0
315,0,9.0,3.0,96.9,11.1,2.0,5,0,0,1,0
316,0,6.0,3.0,63.2,11.9,40.0,41,1,0,0,1
317,0,6.0,3.0,145.0,18.6,3.0,6,1,0,0,0
318,0,9.0,3.0,103.8,15.0,6.0,6,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
13125,0,7.0,0.0,31.6,9.3,2.0,6,0,0,0,0
13132,0,5.0,0.0,32.5,6.0,4.0,42,0,0,1,1
13135,0,11.0,0.0,20.0,8.3,15.0,17,0,0,0,0
13138,0,7.0,0.0,23.2,7.0,4.0,23,0,0,0,1


In [None]:
gbm = LGBMRegressor(num_leaves=31,
                    learning_rate=0.05,
                    n_estimators=20,
                    random_state=13)
gbm.fit(X_train_high_l, y_train_high_l,
        eval_set=[(X_test_high_l, y_test_high_l)],
        eval_metric='rmse')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000433 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 689
[LightGBM] [Info] Number of data points in the train set: 3007, number of used features: 12
[LightGBM] [Info] Start training from score 106357113.690057


In [None]:
def mlf_l(alg, X_train, X_test, y_train, y_test, early_stopping_rounds=50):

  #Fit the algorithm on the data
  alg.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    eval_metric='rmse'
  )

  #Predict training set:
  dtrain_predictions = alg.predict(X_train)

  #Print model report:
  print("\nModel Report")
  print(f"RMSE for train: {metrics.root_mean_squared_error(y_train, dtrain_predictions)}")
  print(f"r2 for train: {metrics.r2_score(y_train, dtrain_predictions)}, \n")

  #Predict on testing data:
  dtest_predictions = alg.predict(X_test)
  print(f"RMSE for test: {metrics.root_mean_squared_error(y_test, dtest_predictions)}")
  print(f"r2 for test: {metrics.r2_score(y_test, dtest_predictions)}")

In [None]:
mlf_l(gbm, X_train_high_l, X_test_high_l, y_train_high_l, y_test_high_l)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 689
[LightGBM] [Info] Number of data points in the train set: 3007, number of used features: 12
[LightGBM] [Info] Start training from score 106357113.690057

Model Report
RMSE for train: 91378849.26046628
r2 for train: 0.6118481388509368, 

RMSE for test: 94698490.45665522
r2 for test: 0.6093891236685187


In [None]:
def objective_lh(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 700),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 13),
        "num_leaves": trial.suggest_int("num_leaves", 55, 90),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 3, 30),
    }

    model = LGBMRegressor(**params, objective='quantile', alpha=0.8, random_state=13)
    model.fit(X_train_high_l, y_train_high_l, eval_set=(X_test_high_l, y_test_high_l), eval_metric='rmse')
    predictions = model.predict(X_test_high_l)
    rmse = metrics.mean_squared_error(y_test_high_l, predictions, squared=False)
    return rmse

In [None]:
study_lh = optuna.create_study(direction='minimize')
study_lh.optimize(objective_lh, n_trials=30)

[I 2024-12-11 10:32:08,591] A new study created in memory with name: no-name-fa0ea380-0222-4412-b171-8e201d8efc39


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000372 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:09,051] Trial 0 finished with value: 54326434.432775326 and parameters: {'n_estimators': 350, 'learning_rate': 0.0020172165820408366, 'max_depth': 5, 'num_leaves': 87, 'subsample': 0.9749142203064184, 'colsample_bytree': 0.6499199289758831, 'min_data_in_leaf': 13}. Best is trial 0 with value: 54326434.432775326.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000403 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:09,739] Trial 1 finished with value: 34158037.35822899 and parameters: {'n_estimators': 410, 'learning_rate': 0.06665484202898861, 'max_depth': 10, 'num_leaves': 58, 'subsample': 0.7762366913513351, 'colsample_bytree': 0.96207816475981, 'min_data_in_leaf': 12}. Best is trial 1 with value: 34158037.35822899.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000385 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:10,721] Trial 2 finished with value: 32492173.39171572 and parameters: {'n_estimators': 594, 'learning_rate': 0.013291538702280531, 'max_depth': 12, 'num_leaves': 67, 'subsample': 0.9937487098616684, 'colsample_bytree': 0.6788837701400263, 'min_data_in_leaf': 5}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:11,781] Trial 3 finished with value: 35176157.77572324 and parameters: {'n_estimators': 564, 'learning_rate': 0.0072012017433819376, 'max_depth': 7, 'num_leaves': 76, 'subsample': 0.6030166137876053, 'colsample_bytree': 0.6221130312487456, 'min_data_in_leaf': 4}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000434 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:13,708] Trial 4 finished with value: 36828427.37996587 and parameters: {'n_estimators': 648, 'learning_rate': 0.0043616150213854135, 'max_depth': 11, 'num_leaves': 87, 'subsample': 0.8764148536515404, 'colsample_bytree': 0.8413595314117626, 'min_data_in_leaf': 21}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000461 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:15,401] Trial 5 finished with value: 34416279.17096976 and parameters: {'n_estimators': 686, 'learning_rate': 0.058810405556545674, 'max_depth': 8, 'num_leaves': 68, 'subsample': 0.6352720069457244, 'colsample_bytree': 0.9155626067436471, 'min_data_in_leaf': 15}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000412 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:16,157] Trial 6 finished with value: 38263697.69310861 and parameters: {'n_estimators': 471, 'learning_rate': 0.005805302065825816, 'max_depth': 6, 'num_leaves': 66, 'subsample': 0.9604173138583908, 'colsample_bytree': 0.6661772677575445, 'min_data_in_leaf': 16}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:17,277] Trial 7 finished with value: 46122972.404621564 and parameters: {'n_estimators': 639, 'learning_rate': 0.0016712078255072542, 'max_depth': 8, 'num_leaves': 83, 'subsample': 0.6234554655544685, 'colsample_bytree': 0.9020335266220474, 'min_data_in_leaf': 26}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:17,908] Trial 8 finished with value: 32838416.17894039 and parameters: {'n_estimators': 421, 'learning_rate': 0.028429042117743848, 'max_depth': 12, 'num_leaves': 59, 'subsample': 0.9817635368335431, 'colsample_bytree': 0.684941962518294, 'min_data_in_leaf': 7}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000425 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:18,558] Trial 9 finished with value: 38762447.958610296 and parameters: {'n_estimators': 472, 'learning_rate': 0.008162568053150744, 'max_depth': 5, 'num_leaves': 74, 'subsample': 0.644945007439105, 'colsample_bytree': 0.7840240145041216, 'min_data_in_leaf': 4}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000419 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:19,566] Trial 10 finished with value: 33067491.02424167 and parameters: {'n_estimators': 569, 'learning_rate': 0.01998603120825564, 'max_depth': 13, 'num_leaves': 65, 'subsample': 0.7773560243560274, 'colsample_bytree': 0.7561731737376574, 'min_data_in_leaf': 9}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:20,073] Trial 11 finished with value: 33823666.807454 and parameters: {'n_estimators': 320, 'learning_rate': 0.02189704540843544, 'max_depth': 13, 'num_leaves': 55, 'subsample': 0.8940902905104016, 'colsample_bytree': 0.704334860936583, 'min_data_in_leaf': 8}. Best is trial 2 with value: 32492173.39171572.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:20,949] Trial 12 finished with value: 31992098.76680873 and parameters: {'n_estimators': 549, 'learning_rate': 0.026792073991114974, 'max_depth': 11, 'num_leaves': 61, 'subsample': 0.9973663493564104, 'colsample_bytree': 0.7254454677834484, 'min_data_in_leaf': 3}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:21,859] Trial 13 finished with value: 32757512.86562403 and parameters: {'n_estimators': 551, 'learning_rate': 0.015472028066790358, 'max_depth': 10, 'num_leaves': 62, 'subsample': 0.8943190004656726, 'colsample_bytree': 0.7339051509022392, 'min_data_in_leaf': 3}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000639 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:23,200] Trial 14 finished with value: 34117754.789009094 and parameters: {'n_estimators': 526, 'learning_rate': 0.03839251982940526, 'max_depth': 11, 'num_leaves': 70, 'subsample': 0.8362223728154292, 'colsample_bytree': 0.8151497781128358, 'min_data_in_leaf': 23}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000457 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:26,021] Trial 15 finished with value: 35983654.65430298 and parameters: {'n_estimators': 585, 'learning_rate': 0.013489578894151937, 'max_depth': 12, 'num_leaves': 76, 'subsample': 0.9364481190834816, 'colsample_bytree': 0.6055700976536676, 'min_data_in_leaf': 30}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:27,555] Trial 16 finished with value: 38317404.21648186 and parameters: {'n_estimators': 618, 'learning_rate': 0.003030292731428035, 'max_depth': 10, 'num_leaves': 63, 'subsample': 0.7001215352236241, 'colsample_bytree': 0.7203588176428625, 'min_data_in_leaf': 10}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000451 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:29,482] Trial 17 finished with value: 33156836.28793923 and parameters: {'n_estimators': 697, 'learning_rate': 0.040257771755148934, 'max_depth': 11, 'num_leaves': 71, 'subsample': 0.9310301670276668, 'colsample_bytree': 0.7736970973057082, 'min_data_in_leaf': 19}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000393 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:30,424] Trial 18 finished with value: 32307106.54013114 and parameters: {'n_estimators': 506, 'learning_rate': 0.0964109871527755, 'max_depth': 9, 'num_leaves': 80, 'subsample': 0.9949867905836275, 'colsample_bytree': 0.8477267304380411, 'min_data_in_leaf': 6}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:31,368] Trial 19 finished with value: 32627536.05502799 and parameters: {'n_estimators': 500, 'learning_rate': 0.09959240991658852, 'max_depth': 9, 'num_leaves': 80, 'subsample': 0.8360855017203594, 'colsample_bytree': 0.8563143227464308, 'min_data_in_leaf': 7}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000484 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:32,193] Trial 20 finished with value: 34125093.98162071 and parameters: {'n_estimators': 437, 'learning_rate': 0.07944129590598785, 'max_depth': 9, 'num_leaves': 82, 'subsample': 0.7260805694474849, 'colsample_bytree': 0.8737095807542891, 'min_data_in_leaf': 12}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:33,201] Trial 21 finished with value: 32954095.368919343 and parameters: {'n_estimators': 605, 'learning_rate': 0.011939779222536587, 'max_depth': 12, 'num_leaves': 61, 'subsample': 0.9986288337535042, 'colsample_bytree': 0.8090825588143244, 'min_data_in_leaf': 5}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000424 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:34,203] Trial 22 finished with value: 33264772.46866654 and parameters: {'n_estimators': 522, 'learning_rate': 0.04153774587811105, 'max_depth': 8, 'num_leaves': 90, 'subsample': 0.9379045741928005, 'colsample_bytree': 0.741037047920235, 'min_data_in_leaf': 6}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000418 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:35,074] Trial 23 finished with value: 56960195.79822284 and parameters: {'n_estimators': 542, 'learning_rate': 0.0010479547105757428, 'max_depth': 11, 'num_leaves': 66, 'subsample': 0.999257130339718, 'colsample_bytree': 0.697001283398429, 'min_data_in_leaf': 3}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000472 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:35,852] Trial 24 finished with value: 33375046.071374465 and parameters: {'n_estimators': 498, 'learning_rate': 0.027045511939732047, 'max_depth': 10, 'num_leaves': 55, 'subsample': 0.9551194049253806, 'colsample_bytree': 0.6429211582814089, 'min_data_in_leaf': 10}. Best is trial 12 with value: 31992098.76680873.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000463 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:37,117] Trial 25 finished with value: 31607421.426193506 and parameters: {'n_estimators': 595, 'learning_rate': 0.051067566668186605, 'max_depth': 13, 'num_leaves': 78, 'subsample': 0.912565586917581, 'colsample_bytree': 0.982330538696137, 'min_data_in_leaf': 5}. Best is trial 25 with value: 31607421.426193506.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000084 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:38,491] Trial 26 finished with value: 31589136.339236822 and parameters: {'n_estimators': 642, 'learning_rate': 0.046864984610650244, 'max_depth': 13, 'num_leaves': 79, 'subsample': 0.9014847512413454, 'colsample_bytree': 0.9798119220405977, 'min_data_in_leaf': 8}. Best is trial 26 with value: 31589136.339236822.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000408 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:40,492] Trial 27 finished with value: 32649147.461744107 and parameters: {'n_estimators': 647, 'learning_rate': 0.052108226014394354, 'max_depth': 13, 'num_leaves': 77, 'subsample': 0.8559823079070205, 'colsample_bytree': 0.987458980049003, 'min_data_in_leaf': 9}. Best is trial 26 with value: 31589136.339236822.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000449 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:42,677] Trial 28 finished with value: 32878380.5844362 and parameters: {'n_estimators': 667, 'learning_rate': 0.029845969417824515, 'max_depth': 13, 'num_leaves': 73, 'subsample': 0.9057214208437635, 'colsample_bytree': 0.9361641722757978, 'min_data_in_leaf': 13}. Best is trial 26 with value: 31589136.339236822.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 122000016.000000


[I 2024-12-11 10:32:44,114] Trial 29 finished with value: 32958641.23352144 and parameters: {'n_estimators': 621, 'learning_rate': 0.05078818177602985, 'max_depth': 12, 'num_leaves': 85, 'subsample': 0.8085801784432103, 'colsample_bytree': 0.9922283959827507, 'min_data_in_leaf': 11}. Best is trial 26 with value: 31589136.339236822.




In [None]:
print('Best hyperparameters:', study_lh.best_params)
print('Best RMSE:', study_lh.best_value)

Best hyperparameters: {'n_estimators': 642, 'learning_rate': 0.046864984610650244, 'max_depth': 13, 'num_leaves': 79, 'subsample': 0.9014847512413454, 'colsample_bytree': 0.9798119220405977, 'min_data_in_leaf': 8}
Best RMSE: 31589136.339236822


In [None]:
################# itog ########################
gbm_lh = LGBMRegressor(n_estimators = 511, learning_rate=0.049641450239224616, max_depth=10, num_leaves=99,
                    subsample=0.9145013355037311, colsample_bytree=0.8215877247105206, min_data_in_leaf=5,
                      objective='quantile', alpha=0.7, random_state=13)

In [None]:
mlf_l(gbm_lh, X_train_high_l, X_test_high_l, y_train_high_l, y_test_high_l)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000413 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 674
[LightGBM] [Info] Number of data points in the train set: 2741, number of used features: 10
[LightGBM] [Info] Start training from score 85000000.000000

Model Report
RMSE for train: 13779384.642789673
r2 for train: 0.95549466458413, 

RMSE for test: 29643657.124104757
r2 for test: 0.7903327968670837


In [None]:
filename = 'lightgbm_high'
pickle.dump(gbm_lh, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_high_l)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_high_l, y_pred)))

root mean squared error :  29643657.124104757


# Lightgbm для moscow_medium

In [None]:
moscow_medium_l = pd.get_dummies(moscow_medium, columns=['renovation', 'district'], drop_first=True, dtype='int64')
X_lm = moscow_medium_l.drop(columns=['price'])
y_lm = moscow_medium_l.price

X_train_medium_l, X_test_medium_l, y_train_medium_l, y_test_medium_l = train_test_split(X_lm, y_lm, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
X_lm

Unnamed: 0,is_new,minutes,rooms,area,kit_area,floor,num_of_floors,renovation_Designer,renovation_European-style renovation,renovation_Without renovation,district_szao,district_uao,district_uzao
326,0,1.0,3.0,104.00,20.0,3.0,15,1,0,0,0,0,0
333,0,7.0,3.0,75.70,5.0,7.0,17,1,0,0,0,0,0
341,0,5.0,3.0,90.53,14.1,10.0,21,0,1,0,0,0,0
345,0,17.0,3.0,82.20,7.6,37.0,41,0,0,1,0,0,0
346,0,15.0,3.0,91.40,7.1,10.0,41,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22641,1,7.0,2.0,85.00,13.7,5.0,22,0,0,0,0,1,0
22642,1,7.0,2.0,79.90,13.3,6.0,22,0,0,0,0,1,0
22643,1,7.0,2.0,80.20,13.3,8.0,22,0,0,0,0,1,0
22644,1,7.0,2.0,75.70,12.9,5.0,22,0,0,0,0,1,0


In [None]:
gbm_m = LGBMRegressor(num_leaves=31,
                    learning_rate=0.05,
                    n_estimators=20,
                    random_state=13)
gbm_m.fit(X_train_medium_l, y_train_medium_l,
        eval_set=[(X_test_medium_l, y_test_medium_l)],
        eval_metric='rmse')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 3396, number of used features: 13
[LightGBM] [Info] Start training from score 28672997.079800


In [None]:
mlf_l(gbm_m, X_train_medium_l, X_test_medium_l, y_train_medium_l, y_test_medium_l)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 658
[LightGBM] [Info] Number of data points in the train set: 3396, number of used features: 13
[LightGBM] [Info] Start training from score 28672997.079800

Model Report
RMSE for train: 21119754.822563875
r2 for train: 0.7207684951098188, 

RMSE for test: 20009792.017417457
r2 for test: 0.7216120268724964


In [None]:
def objective_lm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 800, 1300),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 13),
        "num_leaves": trial.suggest_int("num_leaves", 60, 110),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 10),
    }

    model = LGBMRegressor(**params, objective='quantile', alpha=0.8, random_state=13)
    model.fit(X_train_medium_l, y_train_medium_l, eval_set=(X_test_medium_l, y_test_medium_l), eval_metric='rmse')
    predictions = model.predict(X_test_medium_l)
    rmse = metrics.mean_squared_error(y_test_medium_l, predictions, squared=False)
    return rmse

In [None]:
study_lm = optuna.create_study(direction='minimize')
study_lm.optimize(objective_lm, n_trials=30)

[I 2024-12-11 10:55:42,157] A new study created in memory with name: no-name-fa687c58-9377-437d-abc2-0c8bddbe89a8


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000563 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:55:43,866] Trial 0 finished with value: 11012553.210001545 and parameters: {'n_estimators': 977, 'learning_rate': 0.08161135060721446, 'max_depth': 10, 'num_leaves': 61, 'subsample': 0.9272430111364891, 'colsample_bytree': 0.7053531890263639, 'min_data_in_leaf': 2}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000681 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:55:46,372] Trial 1 finished with value: 15152984.041811388 and parameters: {'n_estimators': 1157, 'learning_rate': 0.0007909757125232242, 'max_depth': 9, 'num_leaves': 82, 'subsample': 0.8621250354413414, 'colsample_bytree': 0.9514076718391156, 'min_data_in_leaf': 10}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000534 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:55:48,579] Trial 2 finished with value: 11176837.011467526 and parameters: {'n_estimators': 1086, 'learning_rate': 0.0031404924338961117, 'max_depth': 12, 'num_leaves': 79, 'subsample': 0.7097774070164811, 'colsample_bytree': 0.7935738958729789, 'min_data_in_leaf': 1}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000589 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:55:51,005] Trial 3 finished with value: 12058490.074341971 and parameters: {'n_estimators': 1049, 'learning_rate': 0.02492627046169229, 'max_depth': 6, 'num_leaves': 102, 'subsample': 0.7790433635628344, 'colsample_bytree': 0.7102668972838323, 'min_data_in_leaf': 9}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:55:53,108] Trial 4 finished with value: 15764698.955075657 and parameters: {'n_estimators': 920, 'learning_rate': 0.0009021924613885351, 'max_depth': 6, 'num_leaves': 71, 'subsample': 0.7743834839857932, 'colsample_bytree': 0.8385083374229114, 'min_data_in_leaf': 8}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000114 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:55:54,392] Trial 5 finished with value: 25443327.86540732 and parameters: {'n_estimators': 825, 'learning_rate': 0.0001331296970029078, 'max_depth': 11, 'num_leaves': 64, 'subsample': 0.852667050573602, 'colsample_bytree': 0.7445738696706217, 'min_data_in_leaf': 8}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:55:57,124] Trial 6 finished with value: 12813251.807361647 and parameters: {'n_estimators': 1208, 'learning_rate': 0.0012678139234485698, 'max_depth': 10, 'num_leaves': 96, 'subsample': 0.9014920967928047, 'colsample_bytree': 0.7997944673004542, 'min_data_in_leaf': 8}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000522 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:00,806] Trial 7 finished with value: 11512840.032650014 and parameters: {'n_estimators': 1134, 'learning_rate': 0.021855511662443563, 'max_depth': 10, 'num_leaves': 80, 'subsample': 0.7306830313475077, 'colsample_bytree': 0.9351206992882788, 'min_data_in_leaf': 8}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:08,174] Trial 8 finished with value: 11237985.418639565 and parameters: {'n_estimators': 1011, 'learning_rate': 0.012697686538474183, 'max_depth': 12, 'num_leaves': 93, 'subsample': 0.8630934823380414, 'colsample_bytree': 0.8728379125913656, 'min_data_in_leaf': 7}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000652 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:14,190] Trial 9 finished with value: 11167448.851969043 and parameters: {'n_estimators': 1115, 'learning_rate': 0.03050436798487058, 'max_depth': 13, 'num_leaves': 103, 'subsample': 0.7385968879489662, 'colsample_bytree': 0.8091468237016238, 'min_data_in_leaf': 10}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:16,631] Trial 10 finished with value: 11228733.122951966 and parameters: {'n_estimators': 1298, 'learning_rate': 0.0979520128090392, 'max_depth': 9, 'num_leaves': 61, 'subsample': 0.9996814747172065, 'colsample_bytree': 0.7006960834332694, 'min_data_in_leaf': 2}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:20,644] Trial 11 finished with value: 11392991.465937864 and parameters: {'n_estimators': 958, 'learning_rate': 0.07858926988439423, 'max_depth': 13, 'num_leaves': 110, 'subsample': 0.9513820341960535, 'colsample_bytree': 0.7790460941929844, 'min_data_in_leaf': 4}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:23,352] Trial 12 finished with value: 11696298.132367384 and parameters: {'n_estimators': 886, 'learning_rate': 0.008714018925915116, 'max_depth': 8, 'num_leaves': 90, 'subsample': 0.9305415695243242, 'colsample_bytree': 0.8834713537090065, 'min_data_in_leaf': 5}. Best is trial 0 with value: 11012553.210001545.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002944 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:26,831] Trial 13 finished with value: 10891154.841327075 and parameters: {'n_estimators': 997, 'learning_rate': 0.04558854074604726, 'max_depth': 13, 'num_leaves': 110, 'subsample': 0.8009243994548775, 'colsample_bytree': 0.7497011371399147, 'min_data_in_leaf': 3}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:28,614] Trial 14 finished with value: 11449569.021709858 and parameters: {'n_estimators': 984, 'learning_rate': 0.05083027625091357, 'max_depth': 8, 'num_leaves': 71, 'subsample': 0.9989215749844973, 'colsample_bytree': 0.7528516038173606, 'min_data_in_leaf': 3}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000523 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:31,214] Trial 15 finished with value: 11025506.003754534 and parameters: {'n_estimators': 851, 'learning_rate': 0.006688113892420072, 'max_depth': 11, 'num_leaves': 110, 'subsample': 0.7926090202804823, 'colsample_bytree': 0.7358802237537008, 'min_data_in_leaf': 1}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:34,008] Trial 16 finished with value: 11066127.917311383 and parameters: {'n_estimators': 932, 'learning_rate': 0.0036245584225666674, 'max_depth': 12, 'num_leaves': 73, 'subsample': 0.812685521754005, 'colsample_bytree': 0.98449858949774, 'min_data_in_leaf': 3}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000661 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:35,770] Trial 17 finished with value: 11713415.23203031 and parameters: {'n_estimators': 1019, 'learning_rate': 0.04265779986432229, 'max_depth': 7, 'num_leaves': 89, 'subsample': 0.8927935964407766, 'colsample_bytree': 0.7553844926962806, 'min_data_in_leaf': 5}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:37,761] Trial 18 finished with value: 11237479.960141575 and parameters: {'n_estimators': 1070, 'learning_rate': 0.016914577826903934, 'max_depth': 11, 'num_leaves': 66, 'subsample': 0.8200464927873105, 'colsample_bytree': 0.7205122227455764, 'min_data_in_leaf': 3}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:40,471] Trial 19 finished with value: 21561485.9584057 and parameters: {'n_estimators': 1199, 'learning_rate': 0.0002691244785352277, 'max_depth': 13, 'num_leaves': 99, 'subsample': 0.9546070568359807, 'colsample_bytree': 0.8349039595248166, 'min_data_in_leaf': 6}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000513 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:42,066] Trial 20 finished with value: 11246238.99441489 and parameters: {'n_estimators': 885, 'learning_rate': 0.0613664970926234, 'max_depth': 8, 'num_leaves': 76, 'subsample': 0.8925104094066096, 'colsample_bytree': 0.7716078860609997, 'min_data_in_leaf': 2}. Best is trial 13 with value: 10891154.841327075.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000493 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:44,193] Trial 21 finished with value: 10852708.290675597 and parameters: {'n_estimators': 807, 'learning_rate': 0.006442852134764578, 'max_depth': 10, 'num_leaves': 109, 'subsample': 0.804126043423449, 'colsample_bytree': 0.7299712614509771, 'min_data_in_leaf': 1}. Best is trial 21 with value: 10852708.290675597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000621 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:47,149] Trial 22 finished with value: 11934364.906451251 and parameters: {'n_estimators': 803, 'learning_rate': 0.00253882809933606, 'max_depth': 10, 'num_leaves': 104, 'subsample': 0.829091912993811, 'colsample_bytree': 0.7297253916161698, 'min_data_in_leaf': 2}. Best is trial 21 with value: 10852708.290675597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000635 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:49,736] Trial 23 finished with value: 11020490.065387202 and parameters: {'n_estimators': 972, 'learning_rate': 0.007364117406270575, 'max_depth': 9, 'num_leaves': 107, 'subsample': 0.8008513188771011, 'colsample_bytree': 0.7030698400119999, 'min_data_in_leaf': 1}. Best is trial 21 with value: 10852708.290675597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000519 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:51,964] Trial 24 finished with value: 11061381.731110418 and parameters: {'n_estimators': 876, 'learning_rate': 0.033763893760010724, 'max_depth': 11, 'num_leaves': 97, 'subsample': 0.7651450078750328, 'colsample_bytree': 0.7767085164711467, 'min_data_in_leaf': 4}. Best is trial 21 with value: 10852708.290675597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000524 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:53,949] Trial 25 finished with value: 11228091.915889531 and parameters: {'n_estimators': 925, 'learning_rate': 0.013830624700434137, 'max_depth': 10, 'num_leaves': 86, 'subsample': 0.8386225097352731, 'colsample_bytree': 0.7289820435326164, 'min_data_in_leaf': 2}. Best is trial 21 with value: 10852708.290675597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:56,717] Trial 26 finished with value: 11231442.031128442 and parameters: {'n_estimators': 1032, 'learning_rate': 0.09392571338999277, 'max_depth': 12, 'num_leaves': 106, 'subsample': 0.921371096390842, 'colsample_bytree': 0.7609096190607744, 'min_data_in_leaf': 4}. Best is trial 21 with value: 10852708.290675597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000558 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:56:59,633] Trial 27 finished with value: 10639473.021825818 and parameters: {'n_estimators': 988, 'learning_rate': 0.052270480528158726, 'max_depth': 9, 'num_leaves': 100, 'subsample': 0.747371649459583, 'colsample_bytree': 0.8194621140327527, 'min_data_in_leaf': 1}. Best is trial 27 with value: 10639473.021825818.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:57:01,988] Trial 28 finished with value: 11173350.579342334 and parameters: {'n_estimators': 839, 'learning_rate': 0.010992227489785612, 'max_depth': 7, 'num_leaves': 100, 'subsample': 0.7504868246448503, 'colsample_bytree': 0.8147503873783805, 'min_data_in_leaf': 1}. Best is trial 27 with value: 10639473.021825818.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000572 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000


[I 2024-12-11 10:57:04,623] Trial 29 finished with value: 11258111.728826413 and parameters: {'n_estimators': 977, 'learning_rate': 0.004350788812355168, 'max_depth': 9, 'num_leaves': 107, 'subsample': 0.7094224231821096, 'colsample_bytree': 0.863135711505625, 'min_data_in_leaf': 3}. Best is trial 27 with value: 10639473.021825818.


In [None]:
print('Best hyperparameters:', study_lm.best_params)
print('Best RMSE:', study_lm.best_value)

Best hyperparameters: {'n_estimators': 988, 'learning_rate': 0.052270480528158726, 'max_depth': 9, 'num_leaves': 100, 'subsample': 0.747371649459583, 'colsample_bytree': 0.8194621140327527, 'min_data_in_leaf': 1}
Best RMSE: 10639473.021825818


In [None]:
################# itog ########################
gbm_lm = LGBMRegressor(n_estimators = 988, learning_rate=0.052270480528158726, max_depth=9, num_leaves=100,
                    subsample=0.747371649459583, colsample_bytree=0.8194621140327527, min_data_in_leaf=1,
                       objective='quantile', alpha=0.8, random_state=13)
mlf_l(gbm_lm, X_train_medium_l, X_test_medium_l, y_train_medium_l, y_test_medium_l)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 650
[LightGBM] [Info] Number of data points in the train set: 3376, number of used features: 13
[LightGBM] [Info] Start training from score 34000000.000000

Model Report
RMSE for train: 3587634.7691926407
r2 for train: 0.9858746073661454, 

RMSE for test: 10639473.021825818
r2 for test: 0.8392496032219563


In [None]:
filename = 'lightgbm_medium'
pickle.dump(gbm_lm, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_medium_l)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_medium_l, y_pred)))

root mean squared error :  10639473.021825818


# Lightgbm для moscow_low

In [None]:
moscow_low_l = pd.get_dummies(moscow_low, columns=['renovation', 'district'], drop_first=True, dtype='int64')
X_ll = moscow_low_l.drop(columns=['price'])
y_ll = moscow_low_l.price

X_train_low_l, X_test_low_l, y_train_low_l, y_test_low_l = train_test_split(X_ll, y_ll, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
X_ll

Unnamed: 0,is_new,minutes,rooms,area,kit_area,floor,num_of_floors,renovation_Designer,renovation_European-style renovation,renovation_Without renovation,district_svao,district_uvao,district_vao
322,0,11.0,3.0,92.0,20.0,4.0,21,0,0,0,1,0,0
323,0,13.0,3.0,109.0,25.0,35.0,58,1,0,0,1,0,0
325,0,7.0,3.0,83.5,12.5,16.0,22,0,0,1,1,0,0
327,0,10.0,3.0,77.1,13.5,22.0,25,0,0,1,0,0,0
371,0,7.0,3.0,90.8,12.6,2.0,30,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22267,1,13.0,2.0,32.1,3.8,15.0,11,0,0,0,0,1,0
22269,1,11.0,1.0,34.0,11.0,14.0,23,0,0,0,0,1,0
22272,1,15.0,2.0,65.6,11.5,25.0,16,0,0,0,0,1,0
22301,1,13.0,3.0,79.0,9.0,9.0,16,0,0,0,0,1,0


In [None]:
gbm_l = LGBMRegressor(num_leaves=31,
                    learning_rate=0.05,
                    n_estimators=20,
                    random_state=13)
gbm_l.fit(X_train_low_l, y_train_low_l,
        eval_set=[(X_test_low_l, y_test_low_l)],
        eval_metric='rmse')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 12580659.904022


In [None]:
mlf_l(gbm_l, X_train_low_l, X_test_low_l, y_train_low_l, y_test_low_l)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000221 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 12580659.904022

Model Report
RMSE for train: 4843361.490809972
r2 for train: 0.7350885021993419, 

RMSE for test: 5288790.251956653
r2 for test: 0.7070601808667554


In [None]:
def objective_ll(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 600, 1000),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 11),
        "num_leaves": trial.suggest_int("num_leaves", 70, 90),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 8),
    }

    model = LGBMRegressor(**params, objective='quantile', alpha=0.8, random_state=13)
    model.fit(X_train_low_l, y_train_low_l, eval_set=(X_test_low_l, y_test_low_l), eval_metric='rmse')
    predictions = model.predict(X_test_low_l)
    rmse = metrics.mean_squared_error(y_test_low_l, predictions, squared=False)
    return rmse

In [None]:
study_ll = optuna.create_study(direction='minimize')
study_ll.optimize(objective_ll, n_trials=30)

[I 2024-12-11 11:11:13,692] A new study created in memory with name: no-name-77fdb05d-3ac4-44b5-9d75-3357b0e3e991


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000770 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:15,934] Trial 0 finished with value: 3008480.817399717 and parameters: {'n_estimators': 858, 'learning_rate': 0.020380362309439883, 'max_depth': 7, 'num_leaves': 73, 'subsample': 0.7062313323991941, 'colsample_bytree': 0.7871407718986974, 'min_data_in_leaf': 5}. Best is trial 0 with value: 3008480.817399717.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:17,459] Trial 1 finished with value: 3112707.0474470966 and parameters: {'n_estimators': 720, 'learning_rate': 0.007657129844634705, 'max_depth': 7, 'num_leaves': 74, 'subsample': 0.8703542148945032, 'colsample_bytree': 0.9572587664751145, 'min_data_in_leaf': 8}. Best is trial 0 with value: 3008480.817399717.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:19,185] Trial 2 finished with value: 3027966.796260124 and parameters: {'n_estimators': 891, 'learning_rate': 0.01701051852127047, 'max_depth': 7, 'num_leaves': 79, 'subsample': 0.6603780072154671, 'colsample_bytree': 0.8680827758390737, 'min_data_in_leaf': 5}. Best is trial 0 with value: 3008480.817399717.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:21,139] Trial 3 finished with value: 3198079.7729532295 and parameters: {'n_estimators': 798, 'learning_rate': 0.008426431115936987, 'max_depth': 6, 'num_leaves': 75, 'subsample': 0.7816898899715545, 'colsample_bytree': 0.901428901175674, 'min_data_in_leaf': 5}. Best is trial 0 with value: 3008480.817399717.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000737 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:23,396] Trial 4 finished with value: 3538171.906904527 and parameters: {'n_estimators': 651, 'learning_rate': 0.00383128617128667, 'max_depth': 7, 'num_leaves': 90, 'subsample': 0.9715863991291525, 'colsample_bytree': 0.7617526347274824, 'min_data_in_leaf': 6}. Best is trial 0 with value: 3008480.817399717.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:24,968] Trial 5 finished with value: 3037944.9085737695 and parameters: {'n_estimators': 771, 'learning_rate': 0.015787859348695563, 'max_depth': 9, 'num_leaves': 72, 'subsample': 0.6012918281445564, 'colsample_bytree': 0.8084949387908281, 'min_data_in_leaf': 7}. Best is trial 0 with value: 3008480.817399717.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:26,949] Trial 6 finished with value: 2965712.1741146026 and parameters: {'n_estimators': 849, 'learning_rate': 0.025015932853823763, 'max_depth': 10, 'num_leaves': 89, 'subsample': 0.9049972939150359, 'colsample_bytree': 0.9587845591591335, 'min_data_in_leaf': 4}. Best is trial 6 with value: 2965712.1741146026.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000556 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:28,194] Trial 7 finished with value: 4719095.265882698 and parameters: {'n_estimators': 636, 'learning_rate': 0.002179010965842336, 'max_depth': 11, 'num_leaves': 77, 'subsample': 0.8440513390027093, 'colsample_bytree': 0.7945206730484035, 'min_data_in_leaf': 3}. Best is trial 6 with value: 2965712.1741146026.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:29,675] Trial 8 finished with value: 2980640.906909784 and parameters: {'n_estimators': 928, 'learning_rate': 0.08316764438950211, 'max_depth': 7, 'num_leaves': 75, 'subsample': 0.9051006409172662, 'colsample_bytree': 0.8961631213990273, 'min_data_in_leaf': 8}. Best is trial 6 with value: 2965712.1741146026.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:31,516] Trial 9 finished with value: 4083055.6524564335 and parameters: {'n_estimators': 924, 'learning_rate': 0.0020158495610264927, 'max_depth': 9, 'num_leaves': 82, 'subsample': 0.9492412533729107, 'colsample_bytree': 0.9627709442393113, 'min_data_in_leaf': 8}. Best is trial 6 with value: 2965712.1741146026.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000579 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:34,293] Trial 10 finished with value: 2939635.3017602726 and parameters: {'n_estimators': 999, 'learning_rate': 0.0637533066215987, 'max_depth': 11, 'num_leaves': 90, 'subsample': 0.7971448219252075, 'colsample_bytree': 0.9774529717879259, 'min_data_in_leaf': 1}. Best is trial 10 with value: 2939635.3017602726.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:37,683] Trial 11 finished with value: 3022385.1674295245 and parameters: {'n_estimators': 997, 'learning_rate': 0.06472106985747661, 'max_depth': 11, 'num_leaves': 90, 'subsample': 0.7629128608878797, 'colsample_bytree': 0.9958417571597985, 'min_data_in_leaf': 1}. Best is trial 10 with value: 2939635.3017602726.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:40,279] Trial 12 finished with value: 2890195.1337653617 and parameters: {'n_estimators': 997, 'learning_rate': 0.03440731740544383, 'max_depth': 10, 'num_leaves': 85, 'subsample': 0.8237303046605043, 'colsample_bytree': 0.7133363183190475, 'min_data_in_leaf': 1}. Best is trial 12 with value: 2890195.1337653617.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000533 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:43,138] Trial 13 finished with value: 3046490.685673832 and parameters: {'n_estimators': 986, 'learning_rate': 0.039499510178983795, 'max_depth': 10, 'num_leaves': 84, 'subsample': 0.8187610167566611, 'colsample_bytree': 0.7142223529869477, 'min_data_in_leaf': 1}. Best is trial 12 with value: 2890195.1337653617.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:45,357] Trial 14 finished with value: 2884705.029607355 and parameters: {'n_estimators': 953, 'learning_rate': 0.041973033068122965, 'max_depth': 10, 'num_leaves': 86, 'subsample': 0.7195378696702368, 'colsample_bytree': 0.7071077149384, 'min_data_in_leaf': 2}. Best is trial 14 with value: 2884705.029607355.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:47,837] Trial 15 finished with value: 2943360.6801636424 and parameters: {'n_estimators': 943, 'learning_rate': 0.03970879499787914, 'max_depth': 10, 'num_leaves': 86, 'subsample': 0.7232861918279969, 'colsample_bytree': 0.7074451482005332, 'min_data_in_leaf': 3}. Best is trial 14 with value: 2884705.029607355.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000693 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:50,892] Trial 16 finished with value: 2880387.710042417 and parameters: {'n_estimators': 955, 'learning_rate': 0.03677679201036004, 'max_depth': 9, 'num_leaves': 85, 'subsample': 0.7058600720418606, 'colsample_bytree': 0.744578604726088, 'min_data_in_leaf': 2}. Best is trial 16 with value: 2880387.710042417.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:52,756] Trial 17 finished with value: 3035558.0797037506 and parameters: {'n_estimators': 849, 'learning_rate': 0.004887736476484722, 'max_depth': 8, 'num_leaves': 87, 'subsample': 0.6522179888017327, 'colsample_bytree': 0.7502988089132778, 'min_data_in_leaf': 3}. Best is trial 16 with value: 2880387.710042417.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000240 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:54,320] Trial 18 finished with value: 2964214.485021777 and parameters: {'n_estimators': 723, 'learning_rate': 0.09692264854219595, 'max_depth': 9, 'num_leaves': 82, 'subsample': 0.7345003666676132, 'colsample_bytree': 0.8313249950748255, 'min_data_in_leaf': 2}. Best is trial 16 with value: 2880387.710042417.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000588 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:56,144] Trial 19 finished with value: 2795304.7425341597 and parameters: {'n_estimators': 888, 'learning_rate': 0.012156668812175945, 'max_depth': 8, 'num_leaves': 70, 'subsample': 0.6732368841233553, 'colsample_bytree': 0.7548790074614093, 'min_data_in_leaf': 2}. Best is trial 19 with value: 2795304.7425341597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000578 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:11:57,976] Trial 20 finished with value: 2936901.412391935 and parameters: {'n_estimators': 904, 'learning_rate': 0.010145588812087184, 'max_depth': 8, 'num_leaves': 70, 'subsample': 0.6664285817126642, 'colsample_bytree': 0.7482272487577157, 'min_data_in_leaf': 4}. Best is trial 19 with value: 2795304.7425341597.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:00,062] Trial 21 finished with value: 2844062.160511083 and parameters: {'n_estimators': 944, 'learning_rate': 0.012470200405132586, 'max_depth': 8, 'num_leaves': 82, 'subsample': 0.6893991724988904, 'colsample_bytree': 0.7418042745514781, 'min_data_in_leaf': 2}. Best is trial 19 with value: 2795304.7425341597.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000555 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:02,787] Trial 22 finished with value: 2793649.75009527 and parameters: {'n_estimators': 877, 'learning_rate': 0.010677947774717648, 'max_depth': 8, 'num_leaves': 80, 'subsample': 0.6032484653619365, 'colsample_bytree': 0.773500846060697, 'min_data_in_leaf': 2}. Best is trial 22 with value: 2793649.75009527.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000768 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:05,126] Trial 23 finished with value: 2818669.9092341354 and parameters: {'n_estimators': 879, 'learning_rate': 0.012316472558075628, 'max_depth': 8, 'num_leaves': 79, 'subsample': 0.6076936162321491, 'colsample_bytree': 0.7770802155654976, 'min_data_in_leaf': 2}. Best is trial 22 with value: 2793649.75009527.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:06,886] Trial 24 finished with value: 3042634.234256461 and parameters: {'n_estimators': 862, 'learning_rate': 0.005534217226083516, 'max_depth': 8, 'num_leaves': 78, 'subsample': 0.6076032095394056, 'colsample_bytree': 0.8370298943453864, 'min_data_in_leaf': 3}. Best is trial 22 with value: 2793649.75009527.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000574 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:08,517] Trial 25 finished with value: 3300635.819336968 and parameters: {'n_estimators': 816, 'learning_rate': 0.003263073167547498, 'max_depth': 8, 'num_leaves': 80, 'subsample': 0.6240837838105406, 'colsample_bytree': 0.7722823509890329, 'min_data_in_leaf': 2}. Best is trial 22 with value: 2793649.75009527.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:10,251] Trial 26 finished with value: 3100050.6973321526 and parameters: {'n_estimators': 888, 'learning_rate': 0.006809969296057899, 'max_depth': 6, 'num_leaves': 70, 'subsample': 0.6422967585000123, 'colsample_bytree': 0.8189101338199216, 'min_data_in_leaf': 4}. Best is trial 22 with value: 2793649.75009527.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:11,838] Trial 27 finished with value: 5233558.109932735 and parameters: {'n_estimators': 820, 'learning_rate': 0.0013233858423601723, 'max_depth': 8, 'num_leaves': 77, 'subsample': 0.6213288552009714, 'colsample_bytree': 0.8582655651801875, 'min_data_in_leaf': 3}. Best is trial 22 with value: 2793649.75009527.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:13,499] Trial 28 finished with value: 2806580.9251526105 and parameters: {'n_estimators': 775, 'learning_rate': 0.01136498970354155, 'max_depth': 9, 'num_leaves': 80, 'subsample': 0.6774437806450634, 'colsample_bytree': 0.7806738848762675, 'min_data_in_leaf': 2}. Best is trial 22 with value: 2793649.75009527.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000


[I 2024-12-11 11:12:15,714] Trial 29 finished with value: 2853793.368346183 and parameters: {'n_estimators': 760, 'learning_rate': 0.024546740736930688, 'max_depth': 9, 'num_leaves': 73, 'subsample': 0.682599454401896, 'colsample_bytree': 0.793596447942576, 'min_data_in_leaf': 6}. Best is trial 22 with value: 2793649.75009527.


In [None]:
print('Best hyperparameters:', study_ll.best_params)
print('Best RMSE:', study_ll.best_value)

Best hyperparameters: {'n_estimators': 877, 'learning_rate': 0.010677947774717648, 'max_depth': 8, 'num_leaves': 80, 'subsample': 0.6032484653619365, 'colsample_bytree': 0.773500846060697, 'min_data_in_leaf': 2}
Best RMSE: 2793649.75009527


In [None]:
################# itog ########################
gbm_l = LGBMRegressor(n_estimators = 787, learning_rate=0.040453498263892836, max_depth=7, num_leaves=81,
                    subsample=0.6929403320408208, colsample_bytree=0.9367609150417758, min_data_in_leaf=2,
                      objective='quantile', alpha=0.8, random_state=13)
mlf_l(gbm_l, X_train_low_l, X_test_low_l, y_train_low_l, y_test_low_l)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000760 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 577
[LightGBM] [Info] Number of data points in the train set: 3928, number of used features: 13
[LightGBM] [Info] Start training from score 15000000.000000

Model Report
RMSE for train: 1485806.6582551189
r2 for train: 0.9750694626288353, 

RMSE for test: 2788123.8816274684
r2 for test: 0.9185877318311384


In [None]:
filename = 'lightgbm_low'
pickle.dump(gbm_l, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_low_l)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_low_l, y_pred)))

root mean squared error :  2788123.8816274684


# Lightgbm для region

In [None]:
region_l = pd.get_dummies(region, columns=['renovation'], drop_first=True, dtype='int64')
X_lr = region_l.drop(columns=['price'])
y_lr = region_l.price

X_train_region_l, X_test_region_l, y_train_region_l, y_test_region_l = train_test_split(X_lr, y_lr, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
X_lr

Unnamed: 0,is_new,minutes,rooms,area,kit_area,floor,num_of_floors,renovation_Designer,renovation_European-style renovation,renovation_Without renovation
0,0,6.0,1.0,30.60,8.5,25.0,25,0,0,0
1,0,2.0,1.0,49.20,10.0,6.0,15,0,1,0
2,0,14.0,1.0,44.70,13.1,10.0,25,0,0,0
3,0,8.0,1.0,35.10,11.0,12.0,33,0,1,0
4,0,6.0,1.0,37.70,4.0,5.0,5,0,0,1
...,...,...,...,...,...,...,...,...,...,...
22670,1,8.0,1.0,44.17,10.3,4.0,17,0,0,0
22672,1,25.0,1.0,31.60,12.2,11.0,15,0,0,0
22673,1,30.0,0.0,18.00,8.1,17.0,17,0,0,0
22674,1,14.0,2.0,36.39,6.6,12.0,14,0,0,0


In [None]:
gbm_r = LGBMRegressor(num_leaves=31,
                    learning_rate=0.05,
                    n_estimators=20,
                    random_state=13)
gbm_r.fit(X_train_region_l, y_train_region_l,
        eval_set=[(X_test_region_l, y_test_region_l)],
        eval_metric='rmse')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 7019104.048269


In [None]:
mlf_l(gbm_r, X_train_region_l, X_test_region_l, y_train_region_l, y_test_region_l)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 7019104.048269

Model Report
RMSE for train: 1447077.312460486
r2 for train: 0.729246056946959, 

RMSE for test: 1590988.4764482966
r2 for test: 0.7036582419970967


In [None]:
def objective_lr(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 500, 900),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 7, 15),
        "num_leaves": trial.suggest_int("num_leaves", 70, 120),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 10),
    }

    model = LGBMRegressor(**params, objective='quantile', alpha=0.8, random_state=13)
    model.fit(X_train_region_l, y_train_region_l, eval_set=(X_test_region_l, y_test_region_l), eval_metric='rmse')
    predictions = model.predict(X_test_region_l)
    rmse = metrics.mean_squared_error(y_test_region_l, predictions, squared=False)
    return rmse

In [None]:
study_lr = optuna.create_study(direction='minimize')
study_lr.optimize(objective_lr, n_trials=30)

[I 2024-12-11 11:18:48,895] A new study created in memory with name: no-name-f9cb6a3c-116a-4cb0-a128-8a8caacf0b29


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000535 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:18:50,022] Trial 0 finished with value: 2286145.9382904107 and parameters: {'n_estimators': 526, 'learning_rate': 0.0013571028485059808, 'max_depth': 9, 'num_leaves': 93, 'subsample': 0.7723619374063218, 'colsample_bytree': 0.8875589538153335, 'min_data_in_leaf': 7}. Best is trial 0 with value: 2286145.9382904107.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:18:51,303] Trial 1 finished with value: 2173524.508211178 and parameters: {'n_estimators': 681, 'learning_rate': 0.001301898020143746, 'max_depth': 13, 'num_leaves': 72, 'subsample': 0.7844260090310763, 'colsample_bytree': 0.732211071937954, 'min_data_in_leaf': 6}. Best is trial 1 with value: 2173524.508211178.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:18:52,403] Trial 2 finished with value: 915630.6695404957 and parameters: {'n_estimators': 526, 'learning_rate': 0.026170242468492883, 'max_depth': 12, 'num_leaves': 71, 'subsample': 0.627604990493773, 'colsample_bytree': 0.9268557796320513, 'min_data_in_leaf': 2}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000543 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:18:53,923] Trial 3 finished with value: 1902562.3099301937 and parameters: {'n_estimators': 775, 'learning_rate': 0.001573019573174409, 'max_depth': 8, 'num_leaves': 72, 'subsample': 0.7854181548349319, 'colsample_bytree': 0.8106833938139513, 'min_data_in_leaf': 10}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000711 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:18:55,307] Trial 4 finished with value: 936051.2439485551 and parameters: {'n_estimators': 545, 'learning_rate': 0.09061294325420811, 'max_depth': 12, 'num_leaves': 96, 'subsample': 0.9084060705519763, 'colsample_bytree': 0.9389774183324178, 'min_data_in_leaf': 6}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:18:57,202] Trial 5 finished with value: 1091542.5406834648 and parameters: {'n_estimators': 742, 'learning_rate': 0.012846354211787632, 'max_depth': 13, 'num_leaves': 90, 'subsample': 0.6518731630090285, 'colsample_bytree': 0.9598426358093216, 'min_data_in_leaf': 7}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000547 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:00,254] Trial 6 finished with value: 1050940.189980692 and parameters: {'n_estimators': 803, 'learning_rate': 0.05576608165621931, 'max_depth': 14, 'num_leaves': 94, 'subsample': 0.7316745822871011, 'colsample_bytree': 0.704150219940482, 'min_data_in_leaf': 10}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000829 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:02,171] Trial 7 finished with value: 1047791.6699709188 and parameters: {'n_estimators': 713, 'learning_rate': 0.010766873657581714, 'max_depth': 14, 'num_leaves': 90, 'subsample': 0.8746959243901695, 'colsample_bytree': 0.7675256200199001, 'min_data_in_leaf': 6}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000816 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:03,366] Trial 8 finished with value: 1265232.3162778984 and parameters: {'n_estimators': 587, 'learning_rate': 0.005945369276479698, 'max_depth': 8, 'num_leaves': 79, 'subsample': 0.8832606479330232, 'colsample_bytree': 0.7820748348668218, 'min_data_in_leaf': 6}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000538 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:04,878] Trial 9 finished with value: 1075967.5902308542 and parameters: {'n_estimators': 678, 'learning_rate': 0.00934859237060898, 'max_depth': 7, 'num_leaves': 88, 'subsample': 0.8703832007439724, 'colsample_bytree': 0.9205970714818283, 'min_data_in_leaf': 1}. Best is trial 2 with value: 915630.6695404957.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:07,265] Trial 10 finished with value: 872725.7174049517 and parameters: {'n_estimators': 893, 'learning_rate': 0.026451410594000878, 'max_depth': 10, 'num_leaves': 114, 'subsample': 0.6009328657906645, 'colsample_bytree': 0.8631260140727859, 'min_data_in_leaf': 1}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000500 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:09,281] Trial 11 finished with value: 897711.4056035923 and parameters: {'n_estimators': 900, 'learning_rate': 0.031094189429139833, 'max_depth': 10, 'num_leaves': 116, 'subsample': 0.6014443014888956, 'colsample_bytree': 0.9971680082492316, 'min_data_in_leaf': 1}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000521 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:11,998] Trial 12 finished with value: 928338.034129834 and parameters: {'n_estimators': 900, 'learning_rate': 0.027800976024009744, 'max_depth': 10, 'num_leaves': 118, 'subsample': 0.6073147349985527, 'colsample_bytree': 0.9970635032286138, 'min_data_in_leaf': 3}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000809 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:15,263] Trial 13 finished with value: 918376.2746545529 and parameters: {'n_estimators': 891, 'learning_rate': 0.031294327046985176, 'max_depth': 10, 'num_leaves': 119, 'subsample': 0.6940335861191165, 'colsample_bytree': 0.8567300129978231, 'min_data_in_leaf': 3}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000802 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:17,674] Trial 14 finished with value: 1029806.3778071301 and parameters: {'n_estimators': 845, 'learning_rate': 0.004855630382843549, 'max_depth': 11, 'num_leaves': 109, 'subsample': 0.9935053655984513, 'colsample_bytree': 0.8577763417236192, 'min_data_in_leaf': 1}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:19,677] Trial 15 finished with value: 898957.3401606272 and parameters: {'n_estimators': 834, 'learning_rate': 0.04495494737172743, 'max_depth': 10, 'num_leaves': 108, 'subsample': 0.6911231303578265, 'colsample_bytree': 0.8186090021684124, 'min_data_in_leaf': 4}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000546 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:22,085] Trial 16 finished with value: 928117.1959798172 and parameters: {'n_estimators': 860, 'learning_rate': 0.018746806138149844, 'max_depth': 11, 'num_leaves': 110, 'subsample': 0.6599191930777889, 'colsample_bytree': 0.997664311441908, 'min_data_in_leaf': 4}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000379 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:23,277] Trial 17 finished with value: 932493.3478140652 and parameters: {'n_estimators': 609, 'learning_rate': 0.09911562476555695, 'max_depth': 9, 'num_leaves': 101, 'subsample': 0.6064742002416149, 'colsample_bytree': 0.8852153743419284, 'min_data_in_leaf': 1}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:26,004] Trial 18 finished with value: 1336111.9505035684 and parameters: {'n_estimators': 794, 'learning_rate': 0.003063188993695191, 'max_depth': 15, 'num_leaves': 115, 'subsample': 0.7194623328773206, 'colsample_bytree': 0.9591753447559492, 'min_data_in_leaf': 2}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000806 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:28,704] Trial 19 finished with value: 935131.0202241879 and parameters: {'n_estimators': 866, 'learning_rate': 0.0506276773983546, 'max_depth': 9, 'num_leaves': 104, 'subsample': 0.6606888483849152, 'colsample_bytree': 0.8860647802060285, 'min_data_in_leaf': 4}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:30,530] Trial 20 finished with value: 977633.5967342395 and parameters: {'n_estimators': 825, 'learning_rate': 0.015971846706115296, 'max_depth': 7, 'num_leaves': 114, 'subsample': 0.834214557407301, 'colsample_bytree': 0.8188133987824179, 'min_data_in_leaf': 2}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000492 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:32,772] Trial 21 finished with value: 892978.2529466012 and parameters: {'n_estimators': 896, 'learning_rate': 0.035400612075092326, 'max_depth': 10, 'num_leaves': 107, 'subsample': 0.6930968327308429, 'colsample_bytree': 0.8244591244634336, 'min_data_in_leaf': 4}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:34,913] Trial 22 finished with value: 897529.9342286113 and parameters: {'n_estimators': 891, 'learning_rate': 0.036092813225479074, 'max_depth': 10, 'num_leaves': 102, 'subsample': 0.602115649683956, 'colsample_bytree': 0.7767540529019915, 'min_data_in_leaf': 3}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000532 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:36,879] Trial 23 finished with value: 878454.981956446 and parameters: {'n_estimators': 767, 'learning_rate': 0.061482432770584704, 'max_depth': 11, 'num_leaves': 101, 'subsample': 0.6425705757795244, 'colsample_bytree': 0.7717381259744459, 'min_data_in_leaf': 3}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000507 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:39,000] Trial 24 finished with value: 903607.0290190524 and parameters: {'n_estimators': 749, 'learning_rate': 0.06928413336840392, 'max_depth': 12, 'num_leaves': 99, 'subsample': 0.742235149315841, 'colsample_bytree': 0.748879364992116, 'min_data_in_leaf': 5}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000807 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:41,717] Trial 25 finished with value: 904024.7735324232 and parameters: {'n_estimators': 657, 'learning_rate': 0.05833949021011799, 'max_depth': 11, 'num_leaves': 106, 'subsample': 0.6889675069455009, 'colsample_bytree': 0.8366758355263726, 'min_data_in_leaf': 3}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000969 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:43,734] Trial 26 finished with value: 1018840.0773374089 and parameters: {'n_estimators': 772, 'learning_rate': 0.019742600655760858, 'max_depth': 8, 'num_leaves': 111, 'subsample': 0.639238493341455, 'colsample_bytree': 0.8005922718121588, 'min_data_in_leaf': 5}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:45,478] Trial 27 finished with value: 911581.3216281224 and parameters: {'n_estimators': 815, 'learning_rate': 0.07921238896705542, 'max_depth': 9, 'num_leaves': 113, 'subsample': 0.6760858639279871, 'colsample_bytree': 0.8502528298694827, 'min_data_in_leaf': 2}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:47,685] Trial 28 finished with value: 919197.2780628641 and parameters: {'n_estimators': 864, 'learning_rate': 0.044243804582338515, 'max_depth': 11, 'num_leaves': 105, 'subsample': 0.7112602238852423, 'colsample_bytree': 0.7938118343690709, 'min_data_in_leaf': 4}. Best is trial 10 with value: 872725.7174049517.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000528 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000


[I 2024-12-11 11:19:49,879] Trial 29 finished with value: 1052961.3479290633 and parameters: {'n_estimators': 746, 'learning_rate': 0.022325611661098067, 'max_depth': 12, 'num_leaves': 120, 'subsample': 0.7576648341588667, 'colsample_bytree': 0.7528640783692536, 'min_data_in_leaf': 9}. Best is trial 10 with value: 872725.7174049517.


In [None]:
print('Best hyperparameters:', study_lr.best_params)
print('Best RMSE:', study_lr.best_value)

Best hyperparameters: {'n_estimators': 893, 'learning_rate': 0.026451410594000878, 'max_depth': 10, 'num_leaves': 114, 'subsample': 0.6009328657906645, 'colsample_bytree': 0.8631260140727859, 'min_data_in_leaf': 1}
Best RMSE: 872725.7174049517


In [None]:
################# itog ########################
gbm_r = LGBMRegressor(n_estimators = 572, learning_rate=0.09246489912147637, max_depth=13, num_leaves=115,
                    subsample=0.7197009670544868, colsample_bytree=0.7530099102589622, min_data_in_leaf=2,
                      objective='quantile', alpha=0.8, random_state=13)
mlf_l(gbm_r, X_train_region_l, X_test_region_l, y_train_region_l, y_test_region_l)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000879 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 513
[LightGBM] [Info] Number of data points in the train set: 4247, number of used features: 10
[LightGBM] [Info] Start training from score 8714711.000000

Model Report
RMSE for train: 465337.0944511471
r2 for train: 0.9720020219699647, 

RMSE for test: 857215.9796258013
r2 for test: 0.913972104012247


In [None]:
filename = 'lightgbm_region'
pickle.dump(gbm_r, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_region_l)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_region_l, y_pred)))

root mean squared error :  857215.9796258013


# XGBoost для moscow_high

In [None]:
from matplotlib import pylab as plot
import matplotlib.pyplot as plt
%pylab inline

import xgboost as xgb
from xgboost.sklearn import XGBRegressor

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [None]:
moscow_high_x = pd.get_dummies(moscow_high, columns=['renovation', 'district'], drop_first=True, dtype='int64')

In [None]:
high_xgb = XGBRegressor(learning_rate=0.1,
                          n_estimators=150,
                          objective= 'reg:quantile',
                          alpha=0.7,
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)

In [None]:
X = moscow_high_x.drop(columns=['price'])
y = moscow_high_x.price

X_train_high, X_test_high, y_train_high, y_test_high = train_test_split(X, y, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
def modelfit(alg, X_train, X_test, y_train, y_test, early_stopping_rounds=50):

  #Fit the algorithm on the data
  alg.fit(
    X_train, y_train
  )

  #Predict training set:
  dtrain_predictions = alg.predict(X_train)

  #Print model report:
  print("\nModel Report")
  print(f"RMSE for train: {metrics.root_mean_squared_error(y_train, dtrain_predictions)}")
  print(f"r2 for train: {metrics.r2_score(y_train, dtrain_predictions)}, \n")

  #Predict on testing data:
  dtest_predictions = alg.predict(X_test)
  print(f"RMSE for test: {metrics.root_mean_squared_error(y_test, dtest_predictions)}")
  print(f"r2 for test: {metrics.r2_score(y_test, dtest_predictions)}")

In [None]:
modelfit(high_xgb, X_train_high, X_test_high, y_train_high, y_test_high)

Parameters: { "scale_pos_weight" } are not used.




Model Report
RMSE for train: 80166930.82377683
r2 for train: 0.7012548814888053, 

RMSE for test: 82934224.72159183
r2 for test: 0.700411045213249


In [None]:
def objective_xh(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 900, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 6, 13),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 12, 35),
        "gamma": trial.suggest_float("gamma", 1e-3, 0.1),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 5),
    }

    model = XGBRegressor(**params, objective= 'reg:quantileerror', quantile_alpha=0.8, seed=13)
    model.fit(X_train_high, y_train_high)
    predictions = model.predict(X_test_high)
    rmse = metrics.mean_squared_error(y_test_high, predictions, squared=False)
    return rmse

In [None]:
study_xh = optuna.create_study(direction='minimize')
study_xh.optimize(objective_xh, n_trials=30)

[I 2024-12-11 12:08:22,118] A new study created in memory with name: no-name-48820485-dbda-4a88-a96f-54e17cac797c
[I 2024-12-11 12:08:27,399] Trial 0 finished with value: 37917005.191754535 and parameters: {'n_estimators': 1068, 'learning_rate': 0.056406907502952366, 'max_depth': 13, 'subsample': 0.8232316671290382, 'colsample_bytree': 0.9758758746909955, 'min_child_weight': 19, 'gamma': 0.09896458837933378, 'reg_alpha': 3.806935354569255}. Best is trial 0 with value: 37917005.191754535.
[I 2024-12-11 12:08:29,017] Trial 1 finished with value: 37759493.48960055 and parameters: {'n_estimators': 1141, 'learning_rate': 0.00612244322345043, 'max_depth': 8, 'subsample': 0.9623257735530721, 'colsample_bytree': 0.8438467079369252, 'min_child_weight': 28, 'gamma': 0.037038290178781415, 'reg_alpha': 0.5378223506713689}. Best is trial 1 with value: 37759493.48960055.
[I 2024-12-11 12:08:30,043] Trial 2 finished with value: 38903720.60522562 and parameters: {'n_estimators': 905, 'learning_rate': 

In [None]:
print('Best hyperparameters:', study_xh.best_params)
print('Best RMSE:', study_xh.best_value)

Best hyperparameters: {'n_estimators': 1025, 'learning_rate': 0.04237557420025514, 'max_depth': 12, 'subsample': 0.8035433824565256, 'colsample_bytree': 0.66269751398511, 'min_child_weight': 18, 'gamma': 0.06802170691764002, 'reg_alpha': 0.5476227463850738}
Best RMSE: 33194800.554730598


In [None]:
################# itog ########################
high_xgb = XGBRegressor(learning_rate=0.05441611087463926,
                          n_estimators=1073,
                          max_depth=9,
                          subsample = 0.9639152578524065,
                          colsample_bytree = 0.7068144966129117,
                          min_child_weight=22,
                          gamma=0.043008454856764015,
                          reg_alpha=0.5665267752449539,
                          objective= 'reg:quantileerror',
                          quantile_alpha=0.7,
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)
modelfit(high_xgb, X_train_high, X_test_high, y_train_high, y_test_high)

Parameters: { "scale_pos_weight" } are not used.




Model Report
RMSE for train: 20075112.91101773
r2 for train: 0.9055355446399047, 

RMSE for test: 31061735.35824868
r2 for test: 0.7697930824082315


In [None]:
filename = 'xgboost_high'
pickle.dump(high_xgb, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_high)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_high, y_pred)))

root mean squared error :  31061735.35824868


# XGBoost для moscow_medium

In [None]:
moscow_medium_x = pd.get_dummies(moscow_medium, columns=['renovation', 'district'], drop_first=True, dtype='int64')

In [None]:
medium_xgb = XGBRegressor(learning_rate=0.1,
                          n_estimators=150,
                          objective= 'reg:squarederror',
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)

In [None]:
X_medium = moscow_medium_x.drop(columns=['price'])
y_medium = moscow_medium_x.price

X_train_medium, X_test_medium, y_train_medium, y_test_medium = train_test_split(X_medium, y_medium, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
def objective_xm(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 750, 1100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 13),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 3, 15),
        "gamma": trial.suggest_float("gamma", 1e-4, 0.5),
        "reg_alpha": trial.suggest_float("reg_alpha", 1, 10),
    }

    model = XGBRegressor(**params, objective= 'reg:quantileerror', quantile_alpha=0.7,  seed=13)
    model.fit(X_train_medium, y_train_medium)
    predictions = model.predict(X_test_medium)
    rmse = metrics.mean_squared_error(y_test_medium, predictions, squared=False)
    return rmse

In [None]:
study_xm = optuna.create_study(direction='minimize')
study_xm.optimize(objective_xm, n_trials=30)

[I 2024-12-11 12:21:42,183] A new study created in memory with name: no-name-e9085f4f-e706-48ad-880b-51e83557b663
[I 2024-12-11 12:21:43,292] Trial 0 finished with value: 11688120.767250719 and parameters: {'n_estimators': 989, 'learning_rate': 0.04777945957582733, 'max_depth': 10, 'subsample': 0.7093734388393069, 'colsample_bytree': 0.919024579897937, 'min_child_weight': 10, 'gamma': 0.41198959583940614, 'reg_alpha': 4.079775211040029}. Best is trial 0 with value: 11688120.767250719.
[I 2024-12-11 12:21:44,147] Trial 1 finished with value: 12181925.004115973 and parameters: {'n_estimators': 866, 'learning_rate': 0.08672622963746336, 'max_depth': 10, 'subsample': 0.6348486040490005, 'colsample_bytree': 0.8200018753111029, 'min_child_weight': 10, 'gamma': 0.19456246526883497, 'reg_alpha': 8.9560534800139}. Best is trial 0 with value: 11688120.767250719.
[I 2024-12-11 12:21:45,296] Trial 2 finished with value: 21686960.730247702 and parameters: {'n_estimators': 946, 'learning_rate': 0.00

In [None]:
print('Best hyperparameters:', study_xm.best_params)
print('Best RMSE:', study_xm.best_value)

Best hyperparameters: {'n_estimators': 847, 'learning_rate': 0.017125662668407857, 'max_depth': 13, 'subsample': 0.824224997184994, 'colsample_bytree': 0.7706963460573254, 'min_child_weight': 4, 'gamma': 0.2913469373707511, 'reg_alpha': 1.0452863874219096}
Best RMSE: 10805201.315830743


In [None]:
################# itog ########################
medium_xgb = XGBRegressor(**study_xm.best_params,
                          objective= 'reg:quantileerror',
                          quantile_alpha=0.7,
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)
modelfit(medium_xgb, X_train_medium, X_test_medium, y_train_medium, y_test_medium)

Parameters: { "scale_pos_weight" } are not used.




Model Report
RMSE for train: 6894027.172829182
r2 for train: 0.9478409033035132, 

RMSE for test: 10805201.315830743
r2 for test: 0.8342026656432612


In [None]:
filename = 'xgboost_medium'
pickle.dump(medium_xgb, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_medium)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_medium, y_pred)))

root mean squared error :  10805201.315830743


# XGBoost для moscow_low

In [None]:
moscow_low_x = pd.get_dummies(moscow_low, columns=['renovation', 'district'], drop_first=True, dtype='int64')

In [None]:
moscow_low_x

Unnamed: 0,price,is_new,minutes,rooms,area,kit_area,floor,num_of_floors,renovation_Designer,renovation_European-style renovation,renovation_Without renovation,district_svao,district_uvao,district_vao
322,52500000.0,0,11.0,3.0,92.0,20.0,4.0,21,0,0,0,1,0,0
323,38000000.0,0,13.0,3.0,109.0,25.0,35.0,58,1,0,0,1,0,0
325,22450000.0,0,7.0,3.0,83.5,12.5,16.0,22,0,0,1,1,0,0
327,17500000.0,0,10.0,3.0,77.1,13.5,22.0,25,0,0,1,0,0,0
371,33800000.0,0,7.0,3.0,90.8,12.6,2.0,30,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22267,7986480.0,1,13.0,2.0,32.1,3.8,15.0,11,0,0,0,0,1,0
22269,9530000.0,1,11.0,1.0,34.0,11.0,14.0,23,0,0,0,0,1,0
22272,15700000.0,1,15.0,2.0,65.6,11.5,25.0,16,0,0,0,0,1,0
22301,13000000.0,1,13.0,3.0,79.0,9.0,9.0,16,0,0,0,0,1,0


In [None]:
low_xgb = XGBRegressor(learning_rate=0.1,
                          n_estimators=150,
                          objective= 'reg:squarederror',
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)

In [None]:
X_low = moscow_low_x.drop(columns=['price'])
y_low = moscow_low_x.price

X_train_low, X_test_low, y_train_low, y_test_low = train_test_split(X_low, y_low, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
X_train_low

Unnamed: 0,is_new,minutes,rooms,area,kit_area,floor,num_of_floors,renovation_Designer,renovation_European-style renovation,renovation_Without renovation,district_svao,district_uvao,district_vao
8219,0,16.0,3.0,68.80,9.5,19.0,22,0,0,0,0,1,0
16797,1,19.0,2.0,62.41,11.8,2.0,11,0,0,0,0,1,0
1775,0,15.0,1.0,34.90,9.0,11.0,12,0,0,1,0,0,1
18231,1,13.0,1.0,40.16,10.0,13.0,16,0,0,0,0,1,0
8834,0,23.0,3.0,60.00,6.0,9.0,9,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1735,0,6.0,1.0,39.00,10.1,18.0,33,0,1,0,0,0,1
5072,0,12.0,1.0,39.00,10.0,6.0,22,0,0,0,0,1,0
12217,0,25.0,0.0,17.00,2.0,1.0,12,0,0,1,0,0,1
1519,0,13.0,0.0,28.00,9.0,5.0,10,1,0,0,1,0,0


In [None]:
def objective_xl(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 600, 1300),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 13),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.65, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "gamma": trial.suggest_float("gamma", 1e-4, 0.1),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 5),
    }

    model = XGBRegressor(**params, objective= 'reg:quantileerror', quantile_alpha=0.7, seed=13)
    model.fit(X_train_low, y_train_low)
    predictions = model.predict(X_test_low)
    rmse = metrics.mean_squared_error(y_test_low, predictions, squared=False)
    return rmse

In [None]:
study_xl = optuna.create_study(direction='minimize')
study_xl.optimize(objective_xl, n_trials=30)

[I 2024-12-11 12:26:43,117] A new study created in memory with name: no-name-7a3e5036-06b7-42d1-ac6b-16102a4be1a2
[I 2024-12-11 12:26:44,904] Trial 0 finished with value: 8827188.22423794 and parameters: {'n_estimators': 1106, 'learning_rate': 0.0002083113972241228, 'max_depth': 7, 'subsample': 0.8782318458061835, 'colsample_bytree': 0.8899458821296686, 'min_child_weight': 19, 'gamma': 0.07896107670090872, 'reg_alpha': 4.3282300857595795}. Best is trial 0 with value: 8827188.22423794.
[I 2024-12-11 12:26:45,979] Trial 1 finished with value: 9084989.24584516 and parameters: {'n_estimators': 686, 'learning_rate': 0.00024611389696217506, 'max_depth': 13, 'subsample': 0.7576969262628398, 'colsample_bytree': 0.8217755392642958, 'min_child_weight': 14, 'gamma': 0.04216347070439773, 'reg_alpha': 4.034924011969716}. Best is trial 0 with value: 8827188.22423794.
[I 2024-12-11 12:26:47,633] Trial 2 finished with value: 7787450.460858502 and parameters: {'n_estimators': 1023, 'learning_rate': 0.0

In [None]:
print('Best hyperparameters:', study_xl.best_params)
print('Best RMSE:', study_xl.best_value)

Best hyperparameters: {'n_estimators': 953, 'learning_rate': 0.02224206149711645, 'max_depth': 7, 'subsample': 0.8882692978303469, 'colsample_bytree': 0.7744320991174012, 'min_child_weight': 1, 'gamma': 0.020333307651097664, 'reg_alpha': 2.8844848067273956}
Best RMSE: 2854155.228616525


In [None]:
################# itog ########################
low_xgb = XGBRegressor(**study_xl.best_params,
                          objective= 'reg:quantileerror',
                          quantile_alpha=0.7,
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)
modelfit(low_xgb, X_train_low, X_test_low, y_train_low, y_test_low)

Parameters: { "scale_pos_weight" } are not used.




Model Report
RMSE for train: 1969597.0751123421
r2 for train: 0.9561911532580758, 

RMSE for test: 2854155.228616525
r2 for test: 0.9146858829580954


In [None]:
filename = 'xgboost_low'
pickle.dump(low_xgb, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_low)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_low, y_pred)))

root mean squared error :  2854155.228616525


#XGBoost для region

In [None]:
region_r = pd.get_dummies(region, columns=['renovation'], drop_first=True, dtype='int64')

In [None]:
region_xgb = XGBRegressor(learning_rate=0.1,
                          n_estimators=150,
                          objective= 'reg:squarederror',
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)

In [None]:
X_region = region_r.drop(columns=['price'])
y_region = region_r.price

X_train_region, X_test_region, y_train_region, y_test_region = train_test_split(X_region, y_region, train_size = 0.7, shuffle=True, random_state=13)

In [None]:
X_region

Unnamed: 0,is_new,minutes,rooms,area,kit_area,floor,num_of_floors,renovation_Designer,renovation_European-style renovation,renovation_Without renovation
0,0,6.0,1.0,30.60,8.5,25.0,25,0,0,0
1,0,2.0,1.0,49.20,10.0,6.0,15,0,1,0
2,0,14.0,1.0,44.70,13.1,10.0,25,0,0,0
3,0,8.0,1.0,35.10,11.0,12.0,33,0,1,0
4,0,6.0,1.0,37.70,4.0,5.0,5,0,0,1
...,...,...,...,...,...,...,...,...,...,...
22670,1,8.0,1.0,44.17,10.3,4.0,17,0,0,0
22672,1,25.0,1.0,31.60,12.2,11.0,15,0,0,0
22673,1,30.0,0.0,18.00,8.1,17.0,17,0,0,0
22674,1,14.0,2.0,36.39,6.6,12.0,14,0,0,0


In [None]:
def objective_xr(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 300, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "subsample": trial.suggest_float("subsample", 0.4, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 25),
        "gamma": trial.suggest_float("gamma", 1e-5, 0.1),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 15),
    }

    model = XGBRegressor(**params, objective= 'reg:quantileerror', quantile_alpha=0.7, seed=13)
    model.fit(X_train_region, y_train_region)
    predictions = model.predict(X_test_region)
    rmse = metrics.mean_squared_error(y_test_region, predictions, squared=False)
    return rmse

In [None]:
study_xr = optuna.create_study(direction='minimize')
study_xr.optimize(objective_xr, n_trials=30)

[I 2024-12-11 12:29:29,343] A new study created in memory with name: no-name-daf5c580-ce0b-4611-9526-a15217e5f3d8
[I 2024-12-11 12:29:34,871] Trial 0 finished with value: 1127999.5453456293 and parameters: {'n_estimators': 1158, 'learning_rate': 0.030919705717016084, 'max_depth': 9, 'subsample': 0.8445675712778209, 'colsample_bytree': 0.9378829869205688, 'min_child_weight': 1, 'gamma': 0.07697914386482482, 'reg_alpha': 14.409914447241654}. Best is trial 0 with value: 1127999.5453456293.
[I 2024-12-11 12:29:37,844] Trial 1 finished with value: 1639976.1240772018 and parameters: {'n_estimators': 1080, 'learning_rate': 0.0020763107078374163, 'max_depth': 15, 'subsample': 0.9246193727308987, 'colsample_bytree': 0.482464845129338, 'min_child_weight': 6, 'gamma': 0.08226157755187355, 'reg_alpha': 7.198017720255522}. Best is trial 0 with value: 1127999.5453456293.
[I 2024-12-11 12:29:46,248] Trial 2 finished with value: 2794067.932817222 and parameters: {'n_estimators': 756, 'learning_rate': 

In [None]:
print('Best hyperparameters:', study_xr.best_params)
print('Best RMSE:', study_xr.best_value)

Best hyperparameters: {'n_estimators': 304, 'learning_rate': 0.04941122277172724, 'max_depth': 11, 'subsample': 0.7129840105610414, 'colsample_bytree': 0.8753435728872978, 'min_child_weight': 1, 'gamma': 0.018531035846643527, 'reg_alpha': 1.8502753441168096}
Best RMSE: 868759.3162333597


In [None]:
################# itog ########################
region_xgb = XGBRegressor(**study_xr.best_params,
                          objective= 'reg:quantileerror',
                          quantile_alpha=0.7,
                          nthread=-1,
                          scale_pos_weight=1,
                          seed=13)
modelfit(region_xgb, X_train_region, X_test_region, y_train_region, y_test_region)

Parameters: { "scale_pos_weight" } are not used.




Model Report
RMSE for train: 484767.28208474413
r2 for train: 0.9696150922390273, 

RMSE for test: 868759.3162333597
r2 for test: 0.9116395875361131


In [None]:
modelfit(region_xgb, X_train_region, X_test_region, y_train_region, y_test_region)

Parameters: { "scale_pos_weight" } are not used.




Model Report
RMSE for train: 484767.28208474413
r2 for train: 0.9696150922390273, 

RMSE for test: 868759.3162333597
r2 for test: 0.9116395875361131


In [None]:
filename = 'xgboost_region'
pickle.dump(region_xgb, open(filename, 'wb'))

In [None]:
load_model = pickle.load(open(filename, 'rb'))

y_pred = load_model.predict(X_test_region)
print('root mean squared error : ', np.sqrt(
    metrics.mean_squared_error(y_test_region, y_pred)))

root mean squared error :  868759.3162333597
