In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.report.report_deco import ReportDeco
from lightautoml.tasks import Task

In [3]:
def mape(y_true, y_pred):
    '''
    Mean absolute percentage error, returned as a fraction (multiply by 100
    to get percent).

    Both inputs are converted to NumPy float arrays before the arithmetic, so
    plain Python lists are accepted and pandas objects are compared position
    by position instead of being aligned on their index labels (label
    alignment of two differently indexed Series would produce NaN).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values. A zero entry makes the corresponding ratio
        infinite or NaN, exactly as in plain NumPy division.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    numpy.float64
        mean(|y_pred - y_true| / |y_true|)
    '''
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs((y_pred - y_true) / y_true))

In [4]:
# Read the full dataset, then shuffle it and hold out 10% of the rows.
# `data` is the list returned by train_test_split: data[0] is the training
# part and data[1] is the held-out part.
train_val_data = pd.read_csv('../datasets/prod_data_all.csv')
data = train_test_split(
    train_val_data,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

data

[          price  region  mileage  engine_capacity  transmission  engine_power  \
 36251   2249900     134        0              1.5             2           113   
 8466     550000     182    15400              1.6             3           115   
 11562   3500000     143    49000              2.0             0           249   
 71171   3700000     181   165512              1.1             3            50   
 28399  19350000     148        1              3.5             1           273   
 ...         ...     ...      ...              ...           ...           ...   
 37194   2949900     299        0              1.5             2           147   
 6265    1189000     262   101569              1.5             1           106   
 54886    895000     264    30000              1.6             3            90   
 860     1129000     358   148996              4.2             1           350   
 15795   3959990     284        0              2.0             1           200   
 
        body_t

In [5]:
# Rescale the target on every split: divide price by 100 and drop the
# fractional part. Casting a float column to an integer dtype truncates
# toward zero, which matches what int(x / 100) did row by row, but runs
# vectorised and avoids repeating the statement per split.
# NOTE: this overwrites `price` in place, so re-running the cell divides again.
for split in data:
    split['price'] = (split['price'] / 100).astype('int64')
data

[        price  region  mileage  engine_capacity  transmission  engine_power  \
 36251   22499     134        0              1.5             2           113   
 8466     5500     182    15400              1.6             3           115   
 11562   35000     143    49000              2.0             0           249   
 71171   37000     181   165512              1.1             3            50   
 28399  193500     148        1              3.5             1           273   
 ...       ...     ...      ...              ...           ...           ...   
 37194   29499     299        0              1.5             2           147   
 6265    11890     262   101569              1.5             1           106   
 54886    8950     264    30000              1.6             3            90   
 860     11290     358   148996              4.2             1           350   
 15795   39599     284        0              2.0             1           200   
 
        body_type  wheel  fuel_type  m

In [9]:
# Replace the target with its natural logarithm on both splits.
# NOTE: the column is overwritten in place, so running this cell twice
# applies the logarithm twice.
for part in data:
    part['price'] = np.log(part['price'])

In [10]:
# Run settings for AutoML.
N_THREADS = 8        # CPU cores given to AutoML (cpu_limit and reader n_jobs)
N_FOLDS = 5          # number of cross-validation folds (reader 'cv')
RANDOM_STATE = 42
TIMEOUT = 1200       # time budget for the whole fit, in seconds

# Report wrapper; its output path is ./automl_output/.
RD = ReportDeco(output_path='./automl_output/')

# Regression task scored with R2 (a 'mape' loss setting is left disabled).
automl = TabularAutoML(
    task=Task(name='reg', metric='r2'),
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    reader_params={
        'n_jobs': N_THREADS,
        'cv': N_FOLDS,
        'random_state': RANDOM_STATE,
    },
)

automl_rd = RD(automl)

# Fit on the training split through the report wrapper; the return value is
# stored as the out-of-fold predictions.
oof_pred = automl_rd.fit_predict(data[0], roles={'target': 'price'}, verbose=1)

# Predict for the held-out split with the fitted (unwrapped) AutoML object.
test_pred = automl.predict(data[1])

[13:16:23] Stdout logging level is INFO.
[13:16:23] Task: reg

[13:16:23] Start automl preset with listed constraints:
[13:16:23] - time: 1200.00 seconds
[13:16:23] - CPU: 8 cores
[13:16:23] - memory: 16 GB

[13:16:23] [1mTrain data shape: (64790, 12)[0m

[13:16:33] Layer [1m1[0m train process start. Time left 1189.48 secs
[13:16:37] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[13:16:45] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.9025277164899618[0m
[13:16:45] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed
[13:16:45] Time left 1177.53 secs

[13:16:51] [1mSelector_LightGBM[0m fitting and predicting completed
[13:16:55] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
[13:17:41] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.9378814329628131[0m
[13:17:41] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed
[13:17:41] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_1_Tun


`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(data["y_pred"] - data["y_true"], shade=True, color="m", ax=ax)


In [12]:
# Predict for the held-out split from features only. The target column is
# dropped first and the feature-only frame is what gets passed to the model
# (previously x_test was built but the frame including 'price' was used).
x_test = data[1].drop(columns=['price'])
test_pred = automl.predict(x_test)

In [20]:
# Evaluate on the held-out split. Target and predictions are both on the log
# scale, so they are mapped back with np.exp before scoring.
y_test = data[1]['price']
actual_price = np.exp(y_test)
predicted_price = np.exp(test_pred.data[:, 0])

mape_value = mape(actual_price, predicted_price)
r2_value = r2_score(actual_price, predicted_price)

print(f"Точность модели по метрике MAPE: {mape_value*100:0.2f}%")
print(f"Точность модели по метрике R2: {r2_value}")

Точность модели по метрике MAPE: 14.78%
Точность модели по метрике R2: 0.9433116639791317


In [16]:
# Display the predictions for the held-out split (still on the log scale of the target).
test_pred

array([[10.630323],
       [ 8.732985],
       [10.075878],
       ...,
       [10.562755],
       [10.807248],
       [ 9.769814]], dtype=float32)

In [21]:
# Map the held-out target back from the log scale for inspection.
np.exp(data[1]['price'])

5488     42399.0
42181     7200.0
20623    20000.0
71722     1999.0
47586    68990.0
          ...   
63549    19500.0
43300    11490.0
70419    35490.0
59736    44500.0
2181     18990.0
Name: price, Length: 7199, dtype: float64