In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import numpy as np # linear algebra
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from sklearn import preprocessing
from sklearn.preprocessing import RobustScaler

In [None]:
# Load the labelled training data used to fit and cross-validate the model.
train = pd.read_csv(filepath_or_buffer='../predict_diamonds/diamonds_train.csv')

In [None]:
# Load the rows that will be scored for the submission file.
predict = pd.read_csv(filepath_or_buffer='../predict_diamonds/diamonds_test.csv')

In [None]:
# Column the model learns to predict.
TARGET = 'price'

# Input columns, grouped by which preprocessing branch handles them.
NUM_FEATURES = ['carat', 'depth', 'table', 'x', 'y', 'z']
CAT_FEATURES = ['cut', 'color', 'clarity']

# Store every categorical column with pandas' `category` dtype in both frames.
for frame in (train, predict):
    for column in CAT_FEATURES:
        frame[column] = frame[column].astype('category')

In [None]:
# Full list of model inputs: numeric columns first, then categorical ones.
FEATS = [*NUM_FEATURES, *CAT_FEATURES]

In [None]:
# Numeric branch: fill missing values with the column mean, then apply
# RobustScaler. The step names are referenced by the hyper-parameter search
# ('preprocessor__num__imputer__strategy'), so they must stay as they are.
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
    ]
)

In [None]:
# Categorical branch: replace missing entries with the column's most frequent
# value, then map each category to an integer code.
#
# `fill_value='missing'` was removed: SimpleImputer only reads `fill_value`
# when strategy='constant', so with 'most_frequent' it had no effect and
# wrongly suggested that gaps are replaced by the literal string 'missing'.
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder()),
    ]
)

In [None]:
# Send each column group through its own sub-pipeline. The branch names
# 'num' and 'cat' are part of the parameter paths used by the search.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, NUM_FEATURES),
        ('cat', categorical_transformer, CAT_FEATURES),
    ]
)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# End-to-end estimator: preprocessing followed by a LightGBM regressor.
#
# `min_child_samples=2` replaces the former `min_data_in_leaf=2`. Both names
# address the same LightGBM setting, but the scikit-learn wrapper always
# forwards its own `min_child_samples` (default 20) as well, so passing the
# other spelling produced an alias-conflict warning and left `get_params()`
# reporting 20 while 2 was actually in effect. Using the wrapper's own
# argument keeps the effective value at 2 without the ambiguity.
model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LGBMRegressor(
            boosting_type='gbdt',
            objective='regression',
            max_depth=8,
            n_estimators=256,
            max_bin=200,
            feature_fraction_seed=7,
            min_child_samples=2,
        )),
    ]
)

In [None]:
# Fit preprocessing and regressor together on the whole training frame.
# The trailing semicolon suppresses the fitted pipeline's repr.
model.fit(X=train[FEATS], y=train[TARGET]);

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the full pipeline. The scorer returns the
# negated RMSE, so values closer to zero are better.
scores = cross_val_score(
    estimator=model,
    X=train[FEATS],
    y=train[TARGET],
    cv=5,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

In [None]:
import numpy as np

# Flip the sign back and average over the folds to get the mean RMSE.
np.negative(scores).mean()

531.5623528395769

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Search space: 2 * 7 * 4 = 56 combinations, addressed through the pipeline's
# step names ('preprocessor' -> 'num' -> 'imputer', and 'regressor').
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'regressor__n_estimators': [8, 16, 32, 64, 128, 256, 512],
    'regressor__max_depth': [2, 4, 8, 16],
}

# Randomized search evaluates only `n_iter` of the 56 combinations.
# `random_state` is fixed so that the same 32 candidates are drawn on every
# run; without it, a Restart & Run All could pick a different best
# configuration and therefore write a different submission.
grid_search = RandomizedSearchCV(model,
                                 param_grid,
                                 cv=5,
                                 verbose=10,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=32,
                                 random_state=7)

grid_search.fit(train[FEATS], train[TARGET])

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    8.8s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done  21 tasks      | elapsed:   13.7s
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done  57 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done  68 tasks      | elapsed:   36.6s
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:   44.2s
[Parallel(n_jobs=-1)]: Done  94 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 109 tasks      | elapsed:   57.1s
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 141 tasks      | elapsed:  1

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('preprocessor',
                                              ColumnTransformer(n_jobs=None,
                                                                remainder='drop',
                                                                sparse_threshold=0.3,
                                                                transformer_weights=None,
                                                                transformers=[('num',
                                                                               Pipeline(memory=None,
                                                                                        steps=[('imputer',
                                                                                                SimpleImputer(add_indicator=False,
                                                                                     

In [None]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
grid_search.best_params_

{'preprocessor__num__imputer__strategy': 'mean',
 'regressor__max_depth': 8,
 'regressor__n_estimators': 256}

In [None]:
# Mean cross-validated score of the best candidate. It is negative because
# the search scores with 'neg_root_mean_squared_error'.
grid_search.best_score_

-531.5623528395769

In [None]:
# Score the test rows with the best pipeline found by the search.
y_pred = grid_search.predict(X=predict[FEATS])

In [None]:
# Predictions from the hand-configured pipeline fitted earlier.
# NOTE(review): `predictions` is not used when the submission frame is built
# (that uses `y_pred`) — confirm whether this cell is still needed.
predictions = model.predict(X=predict[FEATS])

In [None]:
# Pair each test-row id with its predicted price.
submission_df = pd.DataFrame(
    data={
        'id': predict['id'],
        'price': y_pred,
    }
)

In [None]:
# Write the submission file without the DataFrame index column.
submission_df.to_csv(
    path_or_buf='../predict_diamonds/diamonds_lgb5.csv',
    index=False,
)