In [11]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
from sklearn.metrics import r2_score
import seaborn as sns
import matplotlib as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
def display_data(dataframe, list_columns=None):
    '''
    Развернутая информация по датасету с фильтрацией по списку колонок,
    по умолчнию без фильтрации,
    названия признаков упорядочены по алфавиту
    '''
    if list_columns==None:
        index_ = sorted(dataframe.columns)
    else: 
        index_ = sorted([x for x in set(dataframe.dtypes.index).intersection(set(list_columns))])
    
    df = dataframe[index_]
    d = pd.concat([df.dtypes,df.count() + df.isna().sum(),\
               round((df.isna().sum()/(df.count() + df.isna().sum()))*100,2),df.nunique(),],axis=1)
    d.columns = ['Тип', 'Общ.кол', '% пропусков','Кол-во уник.значений']
    display(d)
    
    return

In [3]:
train_val_data = pd.read_csv('../datasets/prod_data_all.csv')

In [4]:
#feature engineering: mileage per year
train_val_data.loc[train_val_data['year'] < 2024, 'mileage_per_year'] = train_val_data.loc[train_val_data['year'] < 2024].apply(lambda x: int(x['mileage'] / (2024 - x['year'])), axis=1)#train_val_data['mileage'] / (2024 - train_val_data['year'])
train_val_data = train_val_data.drop(['fuel_type'], axis=1)

train_val_data.loc[train_val_data['mileage_per_year'].isnull(), 'mileage_per_year'] = 0.0

display_data(train_val_data)

Unnamed: 0,Тип,Общ.кол,% пропусков,Кол-во уник.значений
body_type,int64,71989,0.0,11
engine_capacity,float64,71989,0.0,58
engine_power,int64,71989,0.0,344
mileage,int64,71989,0.0,14639
mileage_per_year,float64,71989,0.0,13677
model,int64,71989,0.0,29
model_2,int64,71989,0.0,962
price,int64,71989,0.0,5119
region,int64,71989,0.0,447
transmission,int64,71989,0.0,4


In [5]:
data = train_test_split(train_val_data, test_size=0.1, shuffle=True, random_state=42)
data

[          price  region  mileage  engine_capacity  transmission  engine_power  \
 36251   2249900     134        0              1.5             2           113   
 8466     550000     182    15400              1.6             3           115   
 11562   3500000     143    49000              2.0             0           249   
 71171   3700000     181   165512              1.1             3            50   
 28399  19350000     148        1              3.5             1           273   
 ...         ...     ...      ...              ...           ...           ...   
 37194   2949900     299        0              1.5             2           147   
 6265    1189000     262   101569              1.5             1           106   
 54886    895000     264    30000              1.6             3            90   
 860     1129000     358   148996              4.2             1           350   
 15795   3959990     284        0              2.0             1           200   
 
        body_t

In [6]:
display_data(data[0])

Unnamed: 0,Тип,Общ.кол,% пропусков,Кол-во уник.значений
body_type,int64,64790,0.0,11
engine_capacity,float64,64790,0.0,58
engine_power,int64,64790,0.0,340
mileage,int64,64790,0.0,13467
mileage_per_year,float64,64790,0.0,12962
model,int64,64790,0.0,29
model_2,int64,64790,0.0,941
price,int64,64790,0.0,4849
region,int64,64790,0.0,432
transmission,int64,64790,0.0,4


In [7]:
data[0]['price']

36251     2249900
8466       550000
11562     3500000
71171     3700000
28399    19350000
           ...   
37194     2949900
6265      1189000
54886      895000
860       1129000
15795     3959990
Name: price, Length: 64790, dtype: int64

In [8]:
data[0]['price'] = data[0]['price'].apply(lambda x: int(x / 100))
data[1]['price'] = data[1]['price'].apply(lambda x: int(x / 100))

x_train = data[0].drop(['price'], axis=1)
y_train = data[0]['price']

x_test = data[1].drop(['price'], axis=1)
y_test = data[1]['price']

In [9]:
y_train

36251     22499
8466       5500
11562     35000
71171     37000
28399    193500
          ...  
37194     29499
6265      11890
54886      8950
860       11290
15795     39599
Name: price, Length: 64790, dtype: int64

In [9]:
def mape(y_true, y_pred):
    '''
    Метрика
    '''
    return np.mean(np.abs((y_pred-y_true)/y_true))

CATBOOST

In [15]:
model = CatBoostRegressor(iterations = 6000,
                          random_seed = 42,
                          eval_metric='MAPE',
                          custom_metric=['R2', 'MAE'],
                          silent=True,
                          learning_rate=0.1,
                          #depth=10
                         )

grid = {'learning_rate': [0.1, 0.2],
        'depth': [6],
        'l2_leaf_reg': [1, 3]}

model.fit(#grid,
         x_train, np.log(y_train),
         #cat_features=['body_type', 'fuel_type', 'model', 'model_2', 'region', 'transmission', 'wheel'],     #cat_features_ids,
         eval_set=(x_test, np.log(y_test)),
         verbose_eval=0,
         use_best_model=True,
         plot=True
         )

model.save_model('catboost_single_model3_baseline.model')

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [16]:
model.get_feature_importance(prettified=True)

Unnamed: 0,Feature Id,Importances
0,year,34.914921
1,engine_power,22.133737
2,model,9.938206
3,model_2,8.200309
4,engine_capacity,5.59945
5,mileage,4.775454
6,region,3.495322
7,mileage_per_year,3.220013
8,body_type,2.819148
9,transmission,2.654979


In [17]:
predict = np.exp(model.predict(x_test))
print(f"Точность модели по метрике MAPE: {(mape(y_test, predict))*100:0.2f}%")

Точность модели по метрике MAPE: 14.58%


In [18]:

print(f"Точность модели по метрике R2: {r2_score(predict, y_test)}")

Точность модели по метрике R2: 0.9523142067719793


RANDOM FOREST

In [19]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=2000, max_depth=50, random_state=42, n_jobs=5, criterion='squared_error')
model.fit(x_train, np.log(y_train))

preds = np.exp(model.predict(x_test))

print(f"Точность модели по метрике MAPE: {(mape(y_test, preds)) * 100:0.2f}%")
print(r2_score(preds, y_test))

Точность модели по метрике MAPE: 15.01%
0.9348858089150659


LIGHTGBM

In [13]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(n_estimators=2000, random_state=42, n_jobs=5, learning_rate=0.1, max_depth=30)
model.fit(x_train, np.log(y_train), eval_set=(x_test, np.log(y_test)), )

preds = np.exp(model.predict(x_test))

print(f"Точность модели по метрике MAPE: {(mape(y_test, preds))*100:0.2f}%")
print(f"Точность модели по метрике R2: {r2_score(preds, y_test)}")
#print(r2_score(preds,y_test))

[1]	valid_0's l2: 0.746815
[2]	valid_0's l2: 0.627139
[3]	valid_0's l2: 0.5299
[4]	valid_0's l2: 0.450739
[5]	valid_0's l2: 0.386049
[6]	valid_0's l2: 0.332846
[7]	valid_0's l2: 0.289447
[8]	valid_0's l2: 0.253591
[9]	valid_0's l2: 0.224075
[10]	valid_0's l2: 0.199907
[11]	valid_0's l2: 0.179807
[12]	valid_0's l2: 0.163106
[13]	valid_0's l2: 0.149539
[14]	valid_0's l2: 0.137997
[15]	valid_0's l2: 0.128719
[16]	valid_0's l2: 0.120715
[17]	valid_0's l2: 0.113856
[18]	valid_0's l2: 0.108018
[19]	valid_0's l2: 0.103156
[20]	valid_0's l2: 0.0991736
[21]	valid_0's l2: 0.0954553
[22]	valid_0's l2: 0.0921385
[23]	valid_0's l2: 0.0894868
[24]	valid_0's l2: 0.0870889
[25]	valid_0's l2: 0.085096
[26]	valid_0's l2: 0.0828253
[27]	valid_0's l2: 0.0810882
[28]	valid_0's l2: 0.0796201
[29]	valid_0's l2: 0.0782022
[30]	valid_0's l2: 0.0769016
[31]	valid_0's l2: 0.0758167
[32]	valid_0's l2: 0.0746146
[33]	valid_0's l2: 0.0736624
[34]	valid_0's l2: 0.0727853
[35]	valid_0's l2: 0.0721357
[36]	valid_0's l

In [14]:
model.feature_importances_

array([8785, 9428, 3194, 1177, 7209, 2244,  818, 3700, 6792, 7776, 8877])