In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/used-car-dataset-ford-and-mercedes/cclass.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/unclean cclass.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/focus.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/audi.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/toyota.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/skoda.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/ford.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/vauxhall.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/bmw.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/vw.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/hyundi.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/unclean focus.csv
/kaggle/input/used-car-dataset-ford-and-mercedes/merc.csv


In [5]:
# get the VW data set
df = pd.read_csv('/kaggle/input/used-car-dataset-ford-and-mercedes/vw.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,T-Roc,2019,25000,Automatic,13904,Diesel,145,49.6,2.0
1,T-Roc,2019,26883,Automatic,4562,Diesel,145,49.6,2.0
2,T-Roc,2019,20000,Manual,7414,Diesel,145,50.4,2.0
3,T-Roc,2019,33492,Automatic,4825,Petrol,145,32.5,2.0
4,T-Roc,2019,22900,Semi-Auto,6500,Petrol,150,39.8,1.5


In [6]:
y = df.pop('price')
y

0        25000
1        26883
2        20000
3        33492
4        22900
         ...  
15152     5990
15153     1799
15154     1590
15155     1250
15156     2295
Name: price, Length: 15157, dtype: int64

In [7]:
low_card_cols = [col for col in df.columns if df[col].dtype == 'object' and df[col].nunique() < 15]
low_card_cols

['transmission', 'fuelType']

In [8]:
df.nunique()

model              27
year               21
transmission        3
mileage         10760
fuelType            4
tax                32
mpg                86
engineSize         12
dtype: int64

In [9]:
df.isna().sum()

model           0
year            0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

In [10]:
df.eq(0).sum()

model             0
year              0
transmission      0
mileage           0
fuelType          0
tax             590
mpg               0
engineSize       15
dtype: int64

In [11]:
numerical_cols = [col for col in df.columns if df[col].dtype in ('int64', 'float64')]
numerical_cols

['year', 'mileage', 'tax', 'mpg', 'engineSize']

In [12]:
df.describe()

Unnamed: 0,year,mileage,tax,mpg,engineSize
count,15157.0,15157.0,15157.0,15157.0,15157.0
mean,2017.255789,22092.785644,112.744277,53.753355,1.600693
std,2.053059,21148.941635,63.482617,13.642182,0.461695
min,2000.0,1.0,0.0,0.3,0.0
25%,2016.0,5962.0,30.0,46.3,1.2
50%,2017.0,16393.0,145.0,53.3,1.6
75%,2019.0,31824.0,145.0,60.1,2.0
max,2020.0,212000.0,580.0,188.3,3.2


In [13]:
df['engineSize'] = df['engineSize'].replace({0: np.nan})
df['engineSize'].isna().sum()

15

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, LabelEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, cross_val_score

median_imputer = SimpleImputer(strategy="median")
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
ordinal_encoder = OrdinalEncoder()
preprocessor = ColumnTransformer(transformers = [
    ('imputer', median_imputer, ['engineSize']),
    ('OH encoder', oh_encoder, low_card_cols),
    ('label encoder', ordinal_encoder, ['model'])
])

In [15]:
def score_model(X, y, model):
#     print(preprocessor)
    model_pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    scores = -1 * cross_val_score(model_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
    return scores.mean()

In [16]:
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=0)

In [32]:
from sklearn.model_selection import RandomizedSearchCV
def random_search_cv(X, y, param_dist, model):
    model_pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    random_cv = RandomizedSearchCV(estimator=model_pipeline, param_distributions=param_dist,
                                    scoring='neg_mean_absolute_error', cv=3, n_jobs=-1, n_iter=10, verbose=True)
    random_cv.fit(X, y)
    best_score = random_cv.best_score_
    best_params = random_cv.best_params_
    return (best_score, best_params)

In [33]:
from sklearn.model_selection import GridSearchCV
def grid_search_cv(X, y, param_grid, model):
    model_pipeline = Pipeline(steps = [
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    grid_cv = GridSearchCV(estimator=model_pipeline, param_grid=param_grid,
                                    scoring='neg_mean_absolute_error', cv=3, n_jobs=-1)
    grid_cv.fit(X, y)
    best_score = random_cv.best_score_
    best_params = random_cv.best_params_
    return (best_score, best_params)

In [34]:
from xgboost import XGBRegressor
# get a baseline score of the model
score_model(X_train, y_train, XGBRegressor(random_state=0))

2858.1427588947727

- Note: In a pipeline, label encoder doesn't work well; use ordinal encoder

In [27]:
# apply grid search 
param_grid = {
    'model__learning_rate': [0.02, 0.05],
    'model__n_estimators': [500, 700],
    'model__max_depth': [3, 7],
    'model__min_child_weight': [3, 7],
    'model__subsample': [0.65, 0.85],
    'model__colsample_bytree': [0.65, 0.85]
}

grid_score, grid_params = grid_search_cv(X_train, y_train, param_grid, XGBRegressor(random_state=0))

KeyboardInterrupt: 

In [None]:
print(f'Score using grid search Cv: {grid_score}')
print(f'Optimal params: {grid_params}')

In [None]:
# apply random search
param_dist = {
    'model__learning_rate': [0.01, 0.05],
    'model__n_estimators': [600, 1000],

}
# 'model__max_depth': np.arange(3, 8, 2)
# 'model__gamma': np.arange(0.5, 2, 0.5),
# 'model__min_child_weight': np.arange(1, 10, 4)
# 'model__subsample': np.arange(0.5, 1, 0.25),
# 'model__colsample_bytree': np.arange(0.5, 1, 0.25)

random_score, random_params = random_search_cv(X_train, y_train, param_dist, XGBRegressor())

In [45]:
print(random_score)
print(random_params)

-2869.4131674882974
{'model__n_estimators': 1000, 'model__max_depth': 7, 'model__learning_rate': 0.01}


In [47]:
# use optuna
import optuna

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1), 
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000),
        'min_child_weight': trial.suggest_int("min_child_weight", 1, 10),
        'colsample_bytree': trial.suggest_float("colsample_bytree", 0.5, 1.0),
        'subsample': trial.suggest_float("subsample", 0.5, 1.0),  
        'gamma': trial.suggest_float('gamma', 0.5, 2.5)
    }
    xgb = XGBRegressor(**params)
    return score_model(X_train, y_train, xgb)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)
optuna_best_params = study.best_params

[32m[I 2021-03-14 04:15:45,897][0m A new study created in memory with name: no-name-e31cc1de-f09e-4aab-a0ee-90ed2a24ea02[0m
[32m[I 2021-03-14 04:16:08,167][0m Trial 0 finished with value: 2866.6029736932187 and parameters: {'max_depth': 5, 'learning_rate': 0.07391696035463802, 'n_estimators': 2462, 'min_child_weight': 7, 'colsample_bytree': 0.5209120853726217, 'subsample': 0.9333460916476669, 'gamma': 2.241748909778903}. Best is trial 0 with value: 2866.6029736932187.[0m
[32m[I 2021-03-14 04:17:04,240][0m Trial 1 finished with value: 2858.49418629314 and parameters: {'max_depth': 8, 'learning_rate': 0.032967432160213, 'n_estimators': 2897, 'min_child_weight': 9, 'colsample_bytree': 0.8718031996272682, 'subsample': 0.8993987250854418, 'gamma': 2.185265726834846}. Best is trial 1 with value: 2858.49418629314.[0m
[32m[I 2021-03-14 04:17:25,166][0m Trial 2 finished with value: 2871.538642225757 and parameters: {'max_depth': 4, 'learning_rate': 0.056255532229018954, 'n_estimators

In [48]:
print(optuna_best_params)

{'max_depth': 10, 'learning_rate': 0.045108242018806054, 'n_estimators': 2933, 'min_child_weight': 5, 'colsample_bytree': 0.8600213599092646, 'subsample': 0.6623920072281189, 'gamma': 1.5373246981205115}


In [None]:
{'max_depth': 10, 'learning_rate': 0.041684248967400896, 'n_estimators': 977, 'min_child_weight': 5, 'colsample_bytree': 0.9383346162722763, 'subsample': 0.5772852233436826}