In [20]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np # linear algebra
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn import preprocessing

In [21]:
# Load the training data; later cells read the feature columns and the `price` target from it.
train = pd.read_csv(filepath_or_buffer='../predict_diamonds/diamonds_train.csv')

In [22]:
# Load the rows to be scored; later cells read their feature columns and the `id` column.
predict = pd.read_csv(filepath_or_buffer='../predict_diamonds/diamonds_test.csv')

In [23]:
# Column roles shared by the downstream cells.
TARGET = 'price'
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATURES = ['cut', 'color', 'clarity']

# Cast the categorical columns to pandas' `category` dtype, in place, on both
# frames. Each frame is cast on its own, so category levels are inferred per frame.
for frame in (train, predict):
    for column in CAT_FEATURES:
        frame[column] = frame[column].astype('category')

In [24]:
# Full model input: numeric columns first, then categorical columns.
FEATS = [*NUM_FEATURES, *CAT_FEATURES]

In [303]:
# Numeric branch: fill missing values with the column median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

In [304]:
# Categorical branch: replace missing values with the literal 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

In [305]:
# Route each column group through its own branch: 'num' handles NUM_FEATURES
# and 'cat' handles CAT_FEATURES.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATURES),
        ('cat', categorical_transformer, CAT_FEATURES),
    ]
)

In [306]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [315]:
# End-to-end estimator: the shared preprocessing step followed by a LightGBM
# regressor. The step names ('preprocessor', 'regressor') are the prefixes used
# when addressing nested parameters as `<step>__<param>`.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'regressor',
            LGBMRegressor(
                boosting_type='gbdt',
                max_depth=16,
                objective='regression',
                max_bin=200,
                feature_fraction_seed=7,
                min_data_in_leaf=2,
                n_estimators=512,
            ),
        ),
    ]
)

In [316]:
# Fit the whole pipeline on the full training set; the trailing `;` hides the estimator repr.
model.fit(X=train[FEATS], y=train[TARGET]);

In [317]:
from sklearn.model_selection import cross_val_score

In [318]:
# 5-fold cross-validation of the pipeline on all available cores.
# The scorer returns negated RMSE, so every entry of `scores` is <= 0.
scores = cross_val_score(
    estimator=model,
    X=train[FEATS],
    y=train[TARGET],
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [321]:
import numpy as np
# Flip the sign of the negated scores and average them to display the mean RMSE across folds.
(-scores).mean()

535.9946274825448

In [322]:
from sklearn.model_selection import RandomizedSearchCV

In [323]:
# Hyper-parameter search space, keyed by `<step>__<param>` paths into `model`.
# It holds 2 * 7 * 4 = 56 combinations, of which the search samples `n_iter`.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [8, 16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

# Randomized (not exhaustive) search, despite the `grid_search` name, which is
# kept because later cells refer to it. Candidates are ranked by negated RMSE
# under 5-fold cross-validation.
grid_search = RandomizedSearchCV(model, 
                                 param_grid, 
                                 cv=5, 
                                 verbose=10, 
                                 scoring='neg_root_mean_squared_error', 
                                 n_jobs=-1,
                                 n_iter=32,
                                 # Fixed seed: without it a different subset of 32
                                 # candidates is drawn on every run, so the reported
                                 # best parameters and score are not reproducible.
                                 random_state=42)

# Runs every sampled candidate on every fold (32 * 5 = 160 fits).
grid_search.fit(train[FEATS], train[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    2.3s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   22.9s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   25.2s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   33.6s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:   44.3s
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:   

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessor',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('num',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                                     

In [324]:
# Display the sampled parameter combination that achieved the best mean
# cross-validated score in the search.
grid_search.best_params_

{'preprocessor__num__imputer__strategy': 'median',
 'regressor__max_depth': 16,
 'regressor__n_estimators': 256}

In [325]:
# Display the best candidate's mean cross-validated score. It is negative
# because the search was scored with 'neg_root_mean_squared_error'.
grid_search.best_score_

-534.7503001667853

In [326]:
# Predict prices for the rows to be scored using the search object.
# NOTE(review): this relies on the search having refit its best candidate
# (scikit-learn's default `refit=True`, which the search cell does not override).
y_pred = grid_search.predict(predict[FEATS])

In [327]:
# Predict with the `model` pipeline that was fitted directly on the training
# data, rather than with the search object.
# NOTE(review): `predictions` is not read by any later cell visible here; the
# submission is built from `y_pred`. Confirm whether this cell is still needed.
predictions = model.predict(predict[FEATS])

In [328]:
# Two-column submission frame: the original `id` values alongside the predicted `price`.
submission_df = predict[['id']].assign(price=y_pred)

In [329]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(path_or_buf='../predict_diamonds/diamonds_lgb1.csv', index=False)