<a href="https://colab.research.google.com/github/napchick/project/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Задачей является по описанию машины предсказать ее стоимость, датасет взят отсюда: https://www.kaggle.com/datasets/deepcontractor/car-price-prediction-challenge

In [None]:
import pandas as pd
import numpy as np

In [35]:
data = pd.read_csv('car_price_prediction.csv', delimiter=',')

In [36]:
data

Unnamed: 0,ID,Price,Levy,Manufacturer,Model,Prod. year,Category,Leather interior,Fuel type,Engine volume,Mileage,Cylinders,Gear box type,Drive wheels,Doors,Wheel,Color,Airbags
0,45654403,13328,1399,LEXUS,RX 450,2010,Jeep,Yes,Hybrid,3.5,186005 km,6.0,Automatic,4x4,04-May,Left wheel,Silver,12
1,44731507,16621,1018,CHEVROLET,Equinox,2011,Jeep,No,Petrol,3,192000 km,6.0,Tiptronic,4x4,04-May,Left wheel,Black,8
2,45774419,8467,-,HONDA,FIT,2006,Hatchback,No,Petrol,1.3,200000 km,4.0,Variator,Front,04-May,Right-hand drive,Black,2
3,45769185,3607,862,FORD,Escape,2011,Jeep,Yes,Hybrid,2.5,168966 km,4.0,Automatic,4x4,04-May,Left wheel,White,0
4,45809263,11726,446,HONDA,FIT,2014,Hatchback,Yes,Petrol,1.3,91901 km,4.0,Automatic,Front,04-May,Left wheel,Silver,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19232,45798355,8467,-,MERCEDES-BENZ,CLK 200,1999,Coupe,Yes,CNG,2.0 Turbo,300000 km,4.0,Manual,Rear,02-Mar,Left wheel,Silver,5
19233,45778856,15681,831,HYUNDAI,Sonata,2011,Sedan,Yes,Petrol,2.4,161600 km,4.0,Tiptronic,Front,04-May,Left wheel,Red,8
19234,45804997,26108,836,HYUNDAI,Tucson,2010,Jeep,Yes,Diesel,2,116365 km,4.0,Automatic,Front,04-May,Left wheel,Grey,4
19235,45793526,5331,1288,CHEVROLET,Captiva,2007,Jeep,Yes,Diesel,2,51258 km,4.0,Automatic,Front,04-May,Left wheel,Black,4


## 1. EDA

### 1.1 Изучение данных

In [37]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19237 entries, 0 to 19236
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                19237 non-null  int64  
 1   Price             19237 non-null  int64  
 2   Levy              19237 non-null  object 
 3   Manufacturer      19237 non-null  object 
 4   Model             19237 non-null  object 
 5   Prod. year        19237 non-null  int64  
 6   Category          19237 non-null  object 
 7   Leather interior  19237 non-null  object 
 8   Fuel type         19237 non-null  object 
 9   Engine volume     19237 non-null  object 
 10  Mileage           19237 non-null  object 
 11  Cylinders         19237 non-null  float64
 12  Gear box type     19237 non-null  object 
 13  Drive wheels      19237 non-null  object 
 14  Doors             19237 non-null  object 
 15  Wheel             19237 non-null  object 
 16  Color             19237 non-null  object

#### Как мы видим в датасете нет пропусков, имеются бесполезные столбцы, а также большинство признаков имеют тип данных object, их нужно привести к числовому типу

#### Признак ID не вносит никакой полезной информации для нас, поэтому его стоит удалить

In [38]:
data.drop(columns=['ID'], inplace=True)

### 1.2 Преобразование столбцов к числовому типу

#### 1) Сначала преобразуем столбцы, где основную информацию несут цифры(engine volume, mileage doors), путем удаления лишних слов

In [39]:
def vol(value):
    return float(value[:3])
data['Engine volume'] = data['Engine volume'].apply(vol)

In [40]:
def mil(value):
    return float(value[:-3])
data['Mileage'] = data['Mileage'].apply(mil)

In [41]:
def do(value):
    return int(value[1])
data['Doors'] = data['Doors'].apply(do)

In [42]:
def lev(value):
    if value == '-':
        value = 0
    return int(value)
data['Levy'] = data['Levy'].apply(lev)

In [43]:
#data['Price'] += data['Levy']

In [44]:
#data.drop(columns=['Levy'], inplace=True)

#### 2) Далее поработаем с столбцами, где данные придется кодировать

In [45]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
data['Leather interior'] = le.fit_transform(data['Leather interior'])

#### 3) Теперь же мы будем работать с one hot encoder

 #### Однако у таких столбцов, как "Manufacturer" и "Model" есть большое число уникальных значений, то в связке с one hot кодированием будет большое число столбцов, что не очень хорошо для хорошей работы моделей, поэтому возьмем только самые популярные элементы

In [46]:
#возьмем те марки, у которых есть хотя бы 30 машин
manufact_list = data['Manufacturer'].value_counts()[:29].index
manufact_list

Index(['HYUNDAI', 'TOYOTA', 'MERCEDES-BENZ', 'FORD', 'CHEVROLET', 'BMW',
       'LEXUS', 'HONDA', 'NISSAN', 'VOLKSWAGEN', 'SSANGYONG', 'KIA', 'OPEL',
       'MITSUBISHI', 'SUBARU', 'AUDI', 'MAZDA', 'JEEP', 'DAEWOO', 'DODGE',
       'FIAT', 'SUZUKI', 'PORSCHE', 'LAND ROVER', 'VAZ', 'MINI', 'JAGUAR',
       'RENAULT', 'INFINITI'],
      dtype='object', name='Manufacturer')

In [47]:
# возьмем модели, которые есть хотя бы в 50 экземплярах
model_list = data['Model'].value_counts()[:64].index
model_list

Unnamed: 0_level_0,count
Model,Unnamed: 1_level_1
Prius,1083
Sonata,1079
Camry,938
Elantra,922
E 350,542
...,...
GLE 350,55
Accent,54
Outlander,54
C-MAX,53


In [48]:
def man(value):
    if value in manufact_list:
        return value
    else:
        return '-'
data['Manufacturer'] = data['Manufacturer'].apply(man)

In [49]:
def mod(value):
    if value in model_list:
        return value
    else:
        return '-'
data['Model'] = data['Model'].apply(mod)

In [50]:
# выкинем строки с не популярными model или manufacturer
data = data[(data['Model'] != '-') & (data['Manufacturer'] != '-')]

#### Для большего удобства разделим нашу таблицу на 2: с числовыми признаками, а также с категориальными

In [51]:
mask = (data.dtypes == object).values
cat_features = data[data.columns[mask]]
real_features = data[data.columns[~mask]]

In [77]:
cat_features_cod = pd.get_dummies(cat_features, drop_first=True, dtype=int)

In [78]:
data_go = pd.concat([cat_features_cod, real_features], axis=1)

In [79]:
data_go

Unnamed: 0,Manufacturer_BMW,Manufacturer_CHEVROLET,Manufacturer_DAEWOO,Manufacturer_FORD,Manufacturer_HONDA,Manufacturer_HYUNDAI,Manufacturer_KIA,Manufacturer_LEXUS,Manufacturer_MERCEDES-BENZ,Manufacturer_MITSUBISHI,...,Color_Yellow,Price,Levy,Prod. year,Leather interior,Engine volume,Mileage,Cylinders,Doors,Airbags
0,0,0,0,0,0,0,0,1,0,0,...,0,13328,1399,2010,1,3.5,186005.0,6.0,4,12
2,0,0,0,0,1,0,0,0,0,0,...,0,8467,0,2006,0,1.3,200000.0,4.0,4,2
3,0,0,0,1,0,0,0,0,0,0,...,0,3607,862,2011,1,2.5,168966.0,4.0,4,0
4,0,0,0,0,1,0,0,0,0,0,...,0,11726,446,2014,1,1.3,91901.0,4.0,4,4
5,0,0,0,0,0,1,0,0,0,0,...,0,39493,891,2016,1,2.0,160931.0,4.0,4,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19231,0,0,0,0,0,0,0,0,1,0,...,0,5802,1055,2013,1,3.5,107800.0,6.0,4,12
19233,0,0,0,0,0,1,0,0,0,0,...,0,15681,831,2011,1,2.4,161600.0,4.0,4,8
19234,0,0,0,0,0,1,0,0,0,0,...,0,26108,836,2010,1,2.0,116365.0,4.0,4,4
19235,0,1,0,0,0,0,0,0,0,0,...,0,5331,1288,2007,1,2.0,51258.0,4.0,4,4


### 1.3 Нормирование данных

#### Для начала разделим наш датасет на X и y, где y это сумма price и levy(налог)

In [80]:
y = data_go['Price'] + data_go['Levy']
X = data_go.drop(columns=['Price', 'Levy'])

In [81]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X = pd.DataFrame(data=sc.fit_transform(X), columns = X.columns)

In [82]:
X

Unnamed: 0,Manufacturer_BMW,Manufacturer_CHEVROLET,Manufacturer_DAEWOO,Manufacturer_FORD,Manufacturer_HONDA,Manufacturer_HYUNDAI,Manufacturer_KIA,Manufacturer_LEXUS,Manufacturer_MERCEDES-BENZ,Manufacturer_MITSUBISHI,...,Color_Sky blue,Color_White,Color_Yellow,Prod. year,Leather interior,Engine volume,Mileage,Cylinders,Doors,Airbags
0,-0.193908,-0.254583,-0.079782,-0.250918,-0.253588,-0.600186,-0.132829,4.070297,-0.303136,-0.063517,...,-0.073392,-0.576233,-0.075414,-0.422606,0.548606,1.613653,-0.027342,1.497696,0.098715,1.296527
1,-0.193908,-0.254583,-0.079782,-0.250918,3.943411,-0.600186,-0.132829,-0.245682,-0.303136,-0.063517,...,-0.073392,-0.576233,-0.075414,-1.319259,-1.822803,-1.166200,-0.027046,-0.425531,0.098715,-1.007964
2,-0.193908,-0.254583,-0.079782,3.985369,-0.253588,-0.600186,-0.132829,-0.245682,-0.303136,-0.063517,...,-0.073392,1.735408,-0.075414,-0.198443,0.548606,0.350084,-0.027702,-0.425531,0.098715,-1.468862
3,-0.193908,-0.254583,-0.079782,-0.250918,3.943411,-0.600186,-0.132829,-0.245682,-0.303136,-0.063517,...,-0.073392,-0.576233,-0.075414,0.474047,0.548606,-1.166200,-0.029330,-0.425531,0.098715,-0.547066
4,-0.193908,-0.254583,-0.079782,-0.250918,-0.253588,1.666151,-0.132829,-0.245682,-0.303136,-0.063517,...,-0.073392,1.735408,-0.075414,0.922374,0.548606,-0.281701,-0.027871,-0.425531,0.098715,-0.547066
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13434,-0.193908,-0.254583,-0.079782,-0.250918,-0.253588,-0.600186,-0.132829,-0.245682,3.298849,-0.063517,...,-0.073392,-0.576233,-0.075414,0.249884,0.548606,1.613653,-0.028994,1.497696,0.098715,1.296527
13435,-0.193908,-0.254583,-0.079782,-0.250918,-0.253588,1.666151,-0.132829,-0.245682,-0.303136,-0.063517,...,-0.073392,-0.576233,-0.075414,-0.198443,0.548606,0.223727,-0.027857,-0.425531,0.098715,0.374731
13436,-0.193908,-0.254583,-0.079782,-0.250918,-0.253588,1.666151,-0.132829,-0.245682,-0.303136,-0.063517,...,-0.073392,-0.576233,-0.075414,-0.422606,0.548606,-0.281701,-0.028813,-0.425531,0.098715,-0.547066
13437,-0.193908,3.927989,-0.079782,-0.250918,-0.253588,-0.600186,-0.132829,-0.245682,-0.303136,-0.063517,...,-0.073392,-0.576233,-0.075414,-1.095096,0.548606,-0.281701,-0.030188,-0.425531,0.098715,-0.547066


## 2. Построение моделей

### 2.1 Baseline

#### В качестве baseline возьмем самую обычную линейную регрессию

In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 13)

In [85]:
lr = LinearRegression().fit(X_train, y_train)
print(f"MSE train: {mean_squared_error(y_train, lr.predict(X_train))}")
print(f"MSE test: {mean_squared_error(y_test, lr.predict(X_test))}")

MSE train: 131087964.0868834
MSE test: 1.2282271325132342e+30


### 2.2 Linear regression

#### Но в этот раз поработает с параметрами

In [86]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

import warnings

warnings.filterwarnings('ignore')

In [87]:
pipe = Pipeline(steps=[
    ('variance', VarianceThreshold(0.01)),
    ('selection', SelectFromModel(Ridge(5.0))),
    ('regressor', Ridge(5.0))
])

pipe.fit(X_train, y_train)

In [88]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'variance__threshold': [0.0005],
    'selection__estimator__alpha': [0.653, 0.655, 0.658],
    'regressor__alpha': [2.278, 2.28, 2.283]
}
grid_search = GridSearchCV(pipe, param_grid, cv=5)

grid_search.fit(X_train, y_train)

In [89]:
param_best = grid_search.best_estimator_

In [90]:
param_best.named_steps

{'variance': VarianceThreshold(threshold=0.0005),
 'selection': SelectFromModel(estimator=Ridge(alpha=0.653)),
 'regressor': Ridge(alpha=2.283)}

In [91]:
print(f"MSE train: {mean_squared_error(y_train, param_best.predict(X_train))}")
print(f"MSE test: {mean_squared_error(y_test, param_best.predict(X_test))}")

MSE train: 136722344.637947
MSE test: 132890852.62866607


#### Как мы видим, ошибка на тесте упала, но это еще не предел

### 2.3 Random Forest

In [92]:
from sklearn.ensemble import RandomForestRegressor

In [93]:
rf = RandomForestRegressor().fit(X_train, y_train)

print(f"MSE train: {mean_squared_error(y_train, rf.predict(X_train))}")
print(f"MSE test: {mean_squared_error(y_test, rf.predict(X_test))}")

MSE train: 10819055.140585242
MSE test: 41848897.96834167


In [129]:
param_grid = {
    'n_estimators': [700, 800, 900],
    'max_depth': [45, 50, 55],
    'min_samples_split': [2],
    'max_features': ['sqrt']
}
grid_search = GridSearchCV(rf, param_grid, cv=5)

grid_search.fit(X_train, y_train)

In [130]:
param_best = grid_search.best_estimator_

In [131]:
grid_search.best_params_

{'max_depth': 50,
 'max_features': 'sqrt',
 'min_samples_split': 2,
 'n_estimators': 1000}

In [132]:
print(f"MSE train: {mean_squared_error(y_train, param_best.predict(X_train))}")
print(f"MSE test: {mean_squared_error(y_test, param_best.predict(X_test))}")

MSE train: 11310114.657842156
MSE test: 44164127.51634092


### 2.4 Gradient boosting

In [148]:
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

In [149]:
model_xgb = XGBRegressor(learning_rate = 0.1,
                         n_estimators = 150,
                         booster='gbtree',
                         importance_type='gain',
                         n_jobs=-1,
                         objective='reg:linear',
                         seed=27)

In [150]:
model_xgb.fit(X_train, y_train)

print(f"train loss: {mean_squared_error(y_train, model_xgb.predict(X_train))}")
print(f"test loss: {mean_squared_error(y_test, model_xgb.predict(X_test))}")

train loss: 34407248.868463516
test loss: 46968833.40669185


In [151]:
param_test1 = {
    'max_depth':[2, 5, 7, 10],
    'min_child_weight':[1e-12, 1e-11, 1e-10]
}

gsearch1 = GridSearchCV(estimator=model_xgb,
                        param_grid=param_test1,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5)

gsearch1.fit(X_train, y_train)

gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 10, 'min_child_weight': 1e-12}, -46968126.65340796)

In [152]:
model_xgb.max_depth = 10
model_xgb.min_child_weight = 1e-12

In [160]:
param_test2 = {
    'gamma' : [4, 4.1, 4.2]
}

gsearch2 = GridSearchCV(estimator=model_xgb,
                        param_grid=param_test2,
                        scoring='neg_mean_squared_error',
                        n_jobs=-1,
                        cv=5)

gsearch2.fit(X_train, y_train)

gsearch2.best_params_, gsearch2.best_score_

({'gamma': 4.1}, -46968126.59259272)

In [161]:
model_xgb.gamma = 4.1

In [164]:
param_test3 = {
    'subsample' : [0.76, 0.8, 0.84],
    'colsample_bytree' : [0.86, 0.9, 0.94]
}

gsearch3 = GridSearchCV(estimator=model_xgb,
                        param_grid = param_test3,
                        scoring='neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 5)

gsearch3.fit(X_train, y_train)

gsearch3.best_params_, gsearch3.best_score_

({'colsample_bytree': 0.9, 'subsample': 0.8}, -46628962.34735502)

In [166]:
model_xgb.subsample = 0.8
model_xgb.colsample_bytree = 0.9

In [170]:
param_test4 = {
    'reg_alpha' : [0.02, 0.03, 0.04]
}

gsearch4 = GridSearchCV(estimator=model_xgb,
                        param_grid = param_test4,
                        scoring='neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 5)

gsearch4.fit(X_train, y_train)

gsearch4.best_params_, gsearch4.best_score_

({'reg_alpha': 0.03}, -46628954.01268518)

In [171]:
model_xgb.reg_alpha = 0.03

In [177]:
param_test5 = {
    'learning_rate' : [0.02, 0.03, 0.04],
    'n_estimators' : [1000, 1100, 1200]
}

gsearch5 = GridSearchCV(estimator=model_xgb,
                        param_grid = param_test5,
                        scoring='neg_mean_squared_error',
                        n_jobs = -1,
                        cv = 5)

gsearch5.fit(X_train, y_train)

gsearch5.best_params_, gsearch5.best_score_

({'learning_rate': 0.03, 'n_estimators': 1100}, -45935418.69979252)