In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
def display_data(dataframe, list_columns=None):
    '''
    Развернутая информация по датасету с фильтрацией по списку колонок,
    по умолчнию без фильтрации,
    названия признаков упорядочены по алфавиту
    '''
    if list_columns==None:
        index_ = sorted(dataframe.columns)
    else: 
        index_ = sorted([x for x in set(dataframe.dtypes.index).intersection(set(list_columns))])
    
    df = dataframe[index_]
    d = pd.concat([df.dtypes,df.count() + df.isna().sum(),\
               round((df.isna().sum()/(df.count() + df.isna().sum()))*100,2),df.nunique(),],axis=1)
    d.columns = ['Тип', 'Общ.кол', '% пропусков','Кол-во уник.значений']
    display(d)
    
    return

In [3]:
train_val_data = pd.read_csv('prod_data_all.csv')

In [4]:
train_val_data.loc[train_val_data['year'] < 2024, 'mileage_per_year'] = train_val_data.loc[train_val_data['year'] < 2024].apply(lambda x: int(x['mileage'] / (2024 - x['year'])), axis=1)#train_val_data['mileage'] / (2024 - train_val_data['year'])
train_val_data = train_val_data.drop(['fuel_type'], axis=1)

train_val_data.loc[train_val_data['mileage_per_year'].isnull(), 'mileage_per_year'] = 0.0

display_data(train_val_data)

Unnamed: 0,Тип,Общ.кол,% пропусков,Кол-во уник.значений
body_type,int64,71989,0.0,11
engine_capacity,float64,71989,0.0,58
engine_power,int64,71989,0.0,344
mileage,int64,71989,0.0,14639
mileage_per_year,float64,71989,0.61,13677
model,int64,71989,0.0,29
model_2,int64,71989,0.0,962
price,int64,71989,0.0,5119
region,int64,71989,0.0,447
transmission,int64,71989,0.0,4


In [6]:
data = train_test_split(train_val_data, test_size=0.1, shuffle=True, random_state=42)
data

[          price  region  mileage  engine_capacity  transmission  engine_power  \
 36251   2249900     134        0              1.5             2           113   
 8466     550000     182    15400              1.6             3           115   
 11562   3500000     143    49000              2.0             0           249   
 71171   3700000     181   165512              1.1             3            50   
 28399  19350000     148        1              3.5             1           273   
 ...         ...     ...      ...              ...           ...           ...   
 37194   2949900     299        0              1.5             2           147   
 6265    1189000     262   101569              1.5             1           106   
 54886    895000     264    30000              1.6             3            90   
 860     1129000     358   148996              4.2             1           350   
 15795   3959990     284        0              2.0             1           200   
 
        body_t

In [8]:
display_data(data[0])

Unnamed: 0,Тип,Общ.кол,% пропусков,Кол-во уник.значений
body_type,int64,64790,0.0,11
engine_capacity,float64,64790,0.0,58
engine_power,int64,64790,0.0,340
mileage,int64,64790,0.0,13467
mileage_per_year,float64,64790,0.0,12962
model,int64,64790,0.0,29
model_2,int64,64790,0.0,941
price,int64,64790,0.0,4849
region,int64,64790,0.0,432
transmission,int64,64790,0.0,4


In [37]:
data[0]['price']

8480      4850
13126     6990
58915     2100
9195     10600
32631     7500
         ...  
54343     1990
38158    12490
860      11290
15795    39599
56422     1000
Name: price, Length: 53176, dtype: int64

In [8]:
data[0]['price'] = data[0]['price'].apply(lambda x: int(x / 100))
data[1]['price'] = data[1]['price'].apply(lambda x: int(x / 100))

x_train = data[0].drop(['price'], axis=1)
y_train = data[0]['price']

x_test = data[1].drop(['price'], axis=1)
y_test = data[1]['price']

In [82]:
y_train

36251     22499
8466       5500
11562     35000
71171     37000
28399    193500
          ...  
37194     29499
6265      11890
54886      8950
860       11290
15795     39599
Name: price, Length: 64790, dtype: int64

In [9]:
model = CatBoostRegressor(iterations = 9000,
                          random_seed = 42,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                          learning_rate=0.1,
                          #depth=10
                         )

grid = {'learning_rate': [0.1, 0.2],
        'depth': [6],
        'l2_leaf_reg': [1, 3]}

model.fit(#grid,
         x_train, np.log(y_train),
         #cat_features=['body_type', 'fuel_type', 'model', 'model_2', 'region', 'transmission', 'wheel'],     #cat_features_ids,
         eval_set=(x_test, np.log(y_test)),
         verbose_eval=0,
         use_best_model=True,
         plot=True
         )

model.save_model('catboost_single_model3_baseline.model')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [10]:
def mape(y_true, y_pred):
    '''
    Метрика
    '''
    return np.mean(np.abs((y_pred-y_true)/y_true))

In [11]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,year,34.506881
1,engine_power,21.969129
2,model,9.879726
3,model_2,8.275234
4,engine_capacity,5.5832
5,mileage,4.965778
6,region,3.690987
7,mileage_per_year,3.415147
8,body_type,2.837118
9,transmission,2.642758


In [12]:
predict = np.exp(model.predict(x_test))
print(f"Точность модели по метрике MAPE: {(mape(y_test, predict))*100:0.2f}%")

Точность модели по метрике MAPE: 14.55%


In [13]:
from sklearn.metrics import r2_score

r2_score(predict, y_test)

0.9533849861095913

In [40]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=2000, max_depth=50, random_state=42, n_jobs=5, criterion='squared_error')
model.fit(x_train, y_train)

preds = model.predict(x_test)

print(f"Точность модели по метрике MAPE: {(mape(y_test, preds))*100:0.2f}%")
#print(r2_score(preds,y_test))

Точность модели по метрике MAPE: 16.60%


In [37]:
model

AttributeError: 'RandomForestRegressor' object has no attribute 'weight'

In [61]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(n_estimators=3000, random_state=42, n_jobs=5, learning_rate=0.15, max_depth=30)
model.fit(x_train, np.log(y_train), eval_set=(x_test, np.log(y_test)), )

preds = np.exp(model.predict(x_test))

print(f"Точность модели по метрике MAPE: {(mape(y_test, preds))*100:0.2f}%")
#print(r2_score(preds,y_test))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065029 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1335
[LightGBM] [Info] Number of data points in the train set: 64790, number of used features: 11
[LightGBM] [Info] Start training from score 9.404313
Точность модели по метрике MAPE: 14.98%


In [56]:
model.feature_importances_

array([17845, 16934, 10184,  3434, 20697,  6232,  2810, 13225, 20621,
       23245, 14773])

In [10]:
def mape(y_true, y_pred):
    '''
    Метрика
    '''
    return np.mean(np.abs((y_pred-y_true)/y_true))

In [18]:
from sklearn.metrics import r2_score

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

N_THREADS = 8 # 8 процессоров
N_FOLDS = 5 # 5 параллельных задач
RANDOM_STATE = 42 # Зафиксируем рандом
#TEST_SIZE = 0.2
TIMEOUT = 1200 # Время на обсчет

#df_train = pd.read_csv('../input/titanic/train.csv')
#df_test = pd.read_csv('../input/titanic/test.csv')

automl = TabularAutoML(
    task = Task(
        name = 'reg',
        loss = 'mape'),
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE}
        #metric = lambda y_true, y_pred: r2_score(y_true, y_pred))
)
oof_pred = automl.fit_predict(
    data[0],
    roles = {'target': 'price'}
)
test_pred = automl.predict(data[1])

#pd.DataFrame({
#    'PassengerId':df_test.PassengerId,
#    'price': (test_pred.data[:, 0] > 0.5)*1
#}).to_csv('submit.csv', index = False)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
