# Определение стоимости автомобилей

Сервис по продаже автомобилей с пробегом «Не бит, не крашен» разрабатывает приложение для привлечения новых клиентов. В нём можно быстро узнать рыночную стоимость своего автомобиля. В вашем распоряжении исторические данные: технические характеристики, комплектации и цены автомобилей. Вам нужно построить модель для определения стоимости. 

Заказчику важны:

- качество предсказания;
- скорость предсказания;
- время обучения.

## Подготовка данных

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, GridSearchCV

In [2]:
# Location of the raw car listings file in the project environment.
DATASET_PATH = '/datasets/autos.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,DateCrawled,Price,VehicleType,RegistrationYear,Gearbox,Power,Model,Kilometer,RegistrationMonth,FuelType,Brand,Repaired,DateCreated,NumberOfPictures,PostalCode,LastSeen
0,2016-03-24 11:52:17,480,,1993,manual,0,golf,150000,0,petrol,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,18300,coupe,2011,manual,190,,125000,5,gasoline,audi,yes,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,9800,suv,2004,auto,163,grand,125000,8,gasoline,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,1500,small,2001,manual,75,golf,150000,6,petrol,volkswagen,no,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,3600,small,2008,manual,69,fabia,90000,7,gasoline,skoda,no,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [4]:
# Count the missing values in every column of the raw data.
data.isna().sum()

DateCrawled              0
Price                    0
VehicleType          37490
RegistrationYear         0
Gearbox              19833
Power                    0
Model                19705
Kilometer                0
RegistrationMonth        0
FuelType             32895
Brand                    0
Repaired             71154
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64

In [5]:
# (rows, columns) of the raw dataset, kept as the baseline for later filtering.
data.shape

(354369, 16)

In [6]:
# Encode the transmission type as a binary flag (manual -> 0, auto -> 1).
# Rows holding any other value, including missing ones, are left untouched.
for label, code in (('manual', 0), ('auto', 1)):
    data.loc[data['Gearbox'] == label, 'Gearbox'] = code

In [7]:
# Encode the repair flag (no -> 0, yes -> 1); missing values stay as they are.
not_repaired = data['Repaired'] == 'no'
was_repaired = data['Repaired'] == 'yes'
data.loc[not_repaired, 'Repaired'] = 0
data.loc[was_repaired, 'Repaired'] = 1

In [8]:
# Treat a missing category as the explicit value 'other' instead of dropping the row.
for column in ('VehicleType', 'Model', 'FuelType'):
    data[column] = data[column].fillna('other')

In [9]:
# Remove the rows where the gearbox type is unknown; all other columns are ignored here.
data = data.dropna(subset=['Gearbox'])

In [10]:
# Re-check the missing values after the imputation and the row removal.
data.isna().sum()

DateCrawled              0
Price                    0
VehicleType              0
RegistrationYear         0
Gearbox                  0
Power                    0
Model                    0
Kilometer                0
RegistrationMonth        0
FuelType                 0
Brand                    0
Repaired             58366
DateCreated              0
NumberOfPictures         0
PostalCode               0
LastSeen                 0
dtype: int64

In [11]:
# Dataset size after dropping the rows without a gearbox value.
data.shape

(334536, 16)

In [12]:
# List the distinct fuel types that have to be encoded.
data['FuelType'].unique()

array(['petrol', 'gasoline', 'other', 'lpg', 'hybrid', 'cng', 'electric'],
      dtype=object)

In [13]:
# Share of rows (in percent) lost to the Gearbox filter:
# 334536 rows remain out of the original 354369.
100 - 334536 * 100 / 354369

5.596708515699731

Удалила пропущенные данные в столбце Gearbox, так как их всего 5.6%. Со столбцом Repaired я так поступить не могу, слишком много данных удаляется.

In [14]:
# Label-encode the fuel type: the integer code is the position in this tuple.
fuel_types = ('petrol', 'gasoline', 'other', 'lpg', 'hybrid', 'cng', 'electric')
for code, fuel in enumerate(fuel_types):
    data.loc[data['FuelType'] == fuel, 'FuelType'] = code

In [15]:
# Cast the two encoded columns to an integer dtype in a single call.
data = data.astype({'Gearbox': 'int', 'FuelType': 'int'})

In [16]:
# Summary statistics of the numeric columns, used to spot implausible values.
data.describe()

Unnamed: 0,Price,RegistrationYear,Gearbox,Power,Kilometer,RegistrationMonth,FuelType,NumberOfPictures,PostalCode
count,334536.0,334536.0,334536.0,334536.0,334536.0,334536.0,334536.0,334536.0,334536.0
mean,4547.697291,2003.379122,0.19814,114.457544,128777.590454,5.857955,0.487329,0.0,50637.655367
std,4551.13138,32.059005,0.398599,190.479544,36894.507459,3.655664,0.7314,0.0,25802.916894
min,0.0,1000.0,0.0,0.0,5000.0,0.0,0.0,0.0,1067.0
25%,1150.0,1999.0,0.0,75.0,125000.0,3.0,0.0,0.0,30177.0
50%,2850.0,2003.0,0.0,105.0,150000.0,6.0,0.0,0.0,49497.0
75%,6500.0,2008.0,0.0,143.0,150000.0,9.0,1.0,0.0,71254.0
max,20000.0,9000.0,1.0,20000.0,150000.0,12.0,6.0,0.0,99998.0


In [17]:
# Discard the listings whose registration year is later than 2023.
# NOTE(review): implausibly early years are not filtered (describe() reports a minimum of 1000).
data = data[~(data['RegistrationYear'] > 2023)]

In [18]:
# Discard the listings whose engine power exceeds 5000.
data = data[~(data['Power'] > 5000)]

Год регистрации не может быть больше 2023 и мощность должна быть <= 5000.

Строк, где цена = 0, слишком много, чтобы их удалить, поэтому я оставила их.

In [19]:
# Pairwise correlation of the numeric columns.
# The numeric columns are selected explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on strings.
data.select_dtypes(include='number').corr()

Unnamed: 0,Price,RegistrationYear,Gearbox,Power,Kilometer,RegistrationMonth,FuelType,NumberOfPictures,PostalCode
Price,1.0,0.351746,0.250608,0.390335,-0.366668,0.088735,0.096707,,0.074617
RegistrationYear,0.351746,1.0,0.026402,0.06523,-0.186819,0.034011,0.164566,,0.020696
Gearbox,0.250608,0.026402,1.0,0.293106,0.014151,0.036263,0.098259,,-0.033619
Power,0.390335,0.06523,0.293106,1.0,0.056603,0.072144,0.070067,,0.055901
Kilometer,-0.366668,-0.186819,0.014151,0.056603,1.0,-0.013608,0.11928,,-0.010765
RegistrationMonth,0.088735,0.034011,0.036263,0.072144,-0.013608,1.0,-0.019424,,0.01078
FuelType,0.096707,0.164566,0.098259,0.070067,0.11928,-0.019424,1.0,,-0.018301
NumberOfPictures,,,,,,,,,
PostalCode,0.074617,0.020696,-0.033619,0.055901,-0.010765,0.01078,-0.018301,,1.0


In [20]:
# Columns left out of the feature matrix (the target, Price, is one of them).
excluded_columns = [
    'DateCrawled', 'RegistrationMonth', 'DateCreated', 'Price',
    'NumberOfPictures', 'PostalCode', 'LastSeen',
    'Model', 'VehicleType', 'Brand', 'Repaired',
]
features = data.drop(columns=excluded_columns)
target = data['Price']

Выводы:

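Сильной линейной корреляции между признаками нет. С ценой умеренно связаны мощность (Power, 0.39), пробег (Kilometer, −0.37) и год регистрации (RegistrationYear, 0.35).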

## Обучение моделей

In [21]:
# Hold out 25% of the rows as the test set; the fixed seed keeps the split reproducible.
split = train_test_split(features, target, test_size=0.25, random_state=12345)
features_train, features_test, target_train, target_test = split

In [22]:
def rmse(target, predict):
    """Return the root mean squared error between true and predicted values.

    target: the ground-truth values.
    predict: the predicted values, aligned with ``target``.
    """
    mse = mean_squared_error(target, predict)
    return mse ** 0.5

In [34]:
# Baseline: linear regression evaluated with 5-fold cross-validated RMSE.
# The scorer is built with greater_is_better=False, so scores come back
# negated; the sign is flipped again when the best score is displayed.
scorer = make_scorer(rmse, greater_is_better=False)
model = LinearRegression()
grid = GridSearchCV(estimator=model, param_grid={}, scoring=scorer, cv=5)
grid.fit(features_train, target_train)
-grid.best_score_

3562.908141456513

In [38]:
# Fit/score timings and per-fold scores of the linear regression run.
grid.cv_results_

{'mean_fit_time': array([0.12922978]),
 'std_fit_time': array([0.06183053]),
 'mean_score_time': array([0.01938109]),
 'std_score_time': array([0.03087159]),
 'params': [{}],
 'split0_test_score': array([-3589.652156]),
 'split1_test_score': array([-3531.10307743]),
 'split2_test_score': array([-3556.60292863]),
 'split3_test_score': array([-3606.62880393]),
 'split4_test_score': array([-3530.55374129]),
 'mean_test_score': array([-3562.90814146]),
 'std_test_score': array([30.74040877]),
 'rank_test_score': array([1], dtype=int32)}

In [39]:
%%time
model = DecisionTreeRegressor()
grid = GridSearchCV(model, param_grid={'max_depth': range(1, 20)}, scoring=scorer, cv=5)
grid.fit(features_train, target_train)
grid.best_score_ * (-1)
# 17.3 s

CPU times: user 22.6 s, sys: 25.8 ms, total: 22.7 s
Wall time: 22.7 s


2208.35990959034

Лучшая модель дерева регрессии имеет результат (RMSE на кросс-валидации): 2208.36

In [40]:
# Fit/score timings and per-fold scores for every depth tried by the tree search.
grid.cv_results_

{'mean_fit_time': array([0.03718996, 0.06726832, 0.09131351, 0.1249712 , 0.14763784,
        0.16385121, 0.18523612, 0.21240544, 0.22977781, 0.23916302,
        0.25421667, 0.28639932, 0.2936831 , 0.286836  , 0.32017751,
        0.35579052, 0.34186134, 0.32370362, 0.32554326]),
 'std_fit_time': array([0.00501991, 0.00622823, 0.00327654, 0.01474509, 0.0085809 ,
        0.00907378, 0.00548451, 0.01551734, 0.01803348, 0.01627313,
        0.02088969, 0.01681691, 0.01694281, 0.01909549, 0.02153679,
        0.03233381, 0.02023629, 0.02682089, 0.02465439]),
 'mean_score_time': array([0.00404773, 0.0038208 , 0.00411162, 0.00463023, 0.00491724,
        0.00593691, 0.00569139, 0.00734453, 0.0073247 , 0.00882215,
        0.00919113, 0.00911942, 0.01142793, 0.01409464, 0.01361971,
        0.01404366, 0.01496854, 0.01480136, 0.0157371 ]),
 'std_score_time': array([1.27050118e-03, 1.38780225e-04, 7.93663328e-05, 5.87273418e-04,
        5.13398648e-04, 8.79502443e-04, 6.16229734e-04, 1.83967332e-03,


In [41]:
%%time
model = RandomForestRegressor()
grid = GridSearchCV(model, param_grid={'max_depth': range(1, 20), 'n_estimators': range(10, 31, 10)}, scoring=scorer, cv=5)
grid.fit(features_train, target_train)
grid.best_score_ * (-1)

# Wall time: 11min 47s

CPU times: user 14min 54s, sys: 3.45 s, total: 14min 57s
Wall time: 14min 58s


2109.981819742049

Лучшая модель леса регрессии имеет результат (RMSE на кросс-валидации): 2109.98

In [42]:
# Fit/score timings and per-fold scores for every forest configuration tried.
grid.cv_results_

{'mean_fit_time': array([0.29805031, 0.55899377, 0.7775322 , 0.46118107, 0.8778235 ,
        1.33637662, 0.702846  , 1.35640101, 1.85881162, 0.81604352,
        1.52237334, 2.3085001 , 0.92382212, 1.87213626, 2.94466987,
        1.08596559, 2.3156311 , 3.41001258, 1.38552651, 2.72869029,
        3.90361137, 1.37741075, 2.71362686, 4.30421414, 1.51939173,
        3.02533512, 4.55540962, 1.73432207, 3.28522072, 4.99341254,
        1.76796384, 3.50775166, 5.29170356, 1.86494293, 3.67754102,
        5.29337192, 1.84656096, 3.74961581, 5.57170644, 1.85667658,
        3.89242177, 6.28685832, 2.20321012, 4.44688816, 6.67049942,
        2.12833166, 4.10861549, 6.2551693 , 2.19400206, 4.43688974,
        6.89171219, 2.24671168, 4.50953932, 6.62711868, 2.26434584,
        4.50625467, 6.50531726]),
 'std_fit_time': array([0.01200489, 0.03532564, 0.03514864, 0.01300522, 0.01903046,
        0.0264543 , 0.05518911, 0.05953746, 0.05887352, 0.0469221 ,
        0.0303466 , 0.06684702, 0.01010245, 0.071

## Анализ моделей

In [52]:
# Mean fit and score times per model, entered by hand from the cv_results_
# outputs (for the tree the value is the average over all depths tried;
# presumably the forest value was averaged the same way).
timing_rows = [
    ('LinearRegression', 0.12922978, 0.01938109),
    ('DecisionTreeRegressor', 0.22563293, 0.00913951),
    ('RandomForestRegressor', 3.00973794, 0.11674635),
]
pd.DataFrame(timing_rows, columns=['model', 'mean_fit_time', 'mean_score_time'])

Unnamed: 0,model,mean_fit_time,mean_score_time
0,LinearRegression,0.12923,0.019381
1,DecisionTreeRegressor,0.225633,0.00914
2,RandomForestRegressor,3.009738,0.116746


## Тестирование лучшей модели

In [48]:
# Refit the forest with the best hyper-parameters found by the grid search
# and report its RMSE on the held-out test set.
best = grid.best_params_
model = RandomForestRegressor(
    max_depth=best['max_depth'],
    n_estimators=best['n_estimators'],
    random_state=12345,
)
model.fit(features_train, target_train)
predictions = model.predict(features_test)
rmse(target_test, predictions)

2102.754573687746

In [49]:
%%time

train_dataset = lgb.Dataset(features_train, target_train)
test_dataset = lgb.Dataset(features_test, target_test)

booster = lgb.train({"objective": "regression"},
                    train_set=train_dataset, valid_sets=(test_dataset,),
                    num_boost_round=80)
# 17 min

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 361
[LightGBM] [Info] Number of data points in the train set: 250824, number of used features: 5
[LightGBM] [Info] Start training from score 4543.647354
[1]	valid_0's l2: 1.79923e+07
[2]	valid_0's l2: 1.57381e+07
[3]	valid_0's l2: 1.39044e+07
[4]	valid_0's l2: 1.23826e+07
[5]	valid_0's l2: 1.11629e+07
[6]	valid_0's l2: 1.01354e+07
[7]	valid_0's l2: 9.30042e+06
[8]	valid_0's l2: 8.61248e+06
[9]	valid_0's l2: 8.0356e+06
[10]	valid_0's l2: 7.56435e+06
[11]	valid_0's l2: 7.17456e+06
[12]	valid_0's l2: 6.8338e+06
[13]	valid_0's l2: 6.54998e+06
[14]	valid_0's l2: 6.31497e+06
[15]	valid_0's l2: 6.11167e+06
[16]	valid_0's l2: 5.94226e+06
[17]	valid_0's l2: 5.79883e+06
[18]	valid_0's l2: 5.67799e+06
[19]	valid_0's l2: 5.56872e+06
[20]	valid_0's l2: 5.47555e+06
[21]	valid_0's l2: 5.39808e+06
[22]	valid_0's l2: 5.33478e+06
[23]	valid_0's l2: 5.27763e+06
[24]	valid_0's l2: 5.22886e+06
[25]	valid_0's l2: 5.18018e

In [50]:
# Evaluate the booster on the test set.
# The score gets its own name: assigning it to `rmse` would overwrite the
# rmse() helper and break `scorer` as soon as an earlier cell is re-run.
predictions = booster.predict(features_test)
lgb_rmse = rmse(target_test, predictions)
lgb_rmse

2155.267624887491

***Выводы:***

Модель леса дает наилучшую метрику: 2102.75.

LightGBM даёт RMSE 2155.27 — это хуже, чем у случайного леса, к тому же времени для него требуется намного больше.

Поэтому модель леса лучше всего справляется с данной задачей.