In [39]:
# Imports
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [40]:
# Load data
train = pd.read_csv('../data/raw/diamonds_train.csv')
test = pd.read_csv('../data/raw/diamonds_test.csv')
full = pd.read_csv('../data/raw/diamonds.csv')
single = pd.read_csv('../data/raw/single_test.csv')

In [41]:
# Identify columns
NUM_FEATS = ['x', 'y', 'z', 'depth', 'table', 'carat']
CAT_FEATS = ['cut', 'color', 'clarity']
ALL_FEATS = NUM_FEATS + CAT_FEATS
TARGET = 'price'

In [42]:
# transformer. Previous tasks we apply to our columns to make the model more accurate
transformer = ColumnTransformer(transformers=[("scaler", RobustScaler(), NUM_FEATS), 
                                              ("encoder", OneHotEncoder(), CAT_FEATS)])

In [43]:
# Split the data to train the model and test it later
X_train, X_test = train_test_split(full, test_size=0.000001)
print(X_train.shape)
print(X_test.shape)

(53939, 11)
(1, 11)


In [44]:
# Set the pipeline, what to do before the training and the model to train
pipe = Pipeline(steps=[("transformer", transformer),
                       ("model", RandomForestRegressor(n_estimators=512, max_depth=16))], verbose=10)
# pipe = Pipeline(steps=[("transformer", transformer),
#                        ("pca", PCA(0.95)),
#                        ("model", GradientBoostingRegressor())], verbose=10)

In [45]:
# Training the model using pipeline
pipe.fit(X_train[ALL_FEATS], X_train[TARGET])

[Pipeline] ....... (step 1 of 2) Processing transformer, total=   0.3s
[Pipeline] ............. (step 2 of 2) Processing model, total= 2.7min


Pipeline(memory=None,
         steps=[('transformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scaler',
                                                  RobustScaler(copy=True,
                                                               quantile_range=(25.0,
                                                                               75.0),
                                                               with_centering=True,
                                                               with_scaling=True),
                                                  ['x', 'y', 'z', 'depth',
                                                   'table', 'carat']),
                                                 ('encoder',
                                                  OneHotEncoder(categories='auto',


In [46]:
# Predicting our y_test using X_test previously split from the main dataframe
y_test = pipe.predict(X_test[ALL_FEATS])
y_train = pipe.predict(X_train[ALL_FEATS])
# Calculate rmse for both, test and train previously split
rmse_test = mean_squared_error(y_pred=y_test, y_true=X_test[TARGET], squared=False)
rmse_train = mean_squared_error(y_pred=y_train, y_true=X_train[TARGET], squared=False)
r2 = r2_score(y_pred=y_test, y_true=X_test[TARGET])
print(f"test error: {rmse_test}") # 786.32708700068 -> best 521.233209489903
print(f"train error: {rmse_train}") # 708.453678764242 -> best 301.637420203425
print(r2)

test error: 16.23772637488537
train error: 311.8462666595142
nan




In [47]:
# Cross validation, checking results
scores = cross_val_score(pipe, train[ALL_FEATS], train[TARGET], 
                         scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1, verbose=10)
np.mean(-scores) # 733.2874856161409 ->  best 557.9761591423182

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.0min remaining:  3.0min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  2.0min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.1min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.1min finished


557.8155155759545

In [48]:
# # Model Optimization, Grid Search
# param = {"model__n_estimators" : [16, 32, 64, 128, 256, 512], 'model__max_depth': [2, 4, 8, 16]}
# tuned_pipe = GridSearchCV(pipe, param, cv=10, verbose=10, scoring='neg_root_mean_squared_error', n_jobs=-1)
# tuned_pipe.fit(X_train[ALL_FEATS], X_train[TARGET])
# tuned_pipe.best_params_ {'model__max_depth': 16, 'model__n_estimators': 512}
# tuned_pipe.best_score_ -556.4260213207206

In [49]:
# Predicting new data
# y_new_data = tuned_pipe.predict(test[ALL_FEATS]) # GridSearch
y_new_data = pipe.predict(test[ALL_FEATS])
# y_new_data = pipe.predict(single[ALL_FEATS]) # 2976.18647031
y_new_data

array([2925.93459031, 5586.35317625, 9404.72807476, ..., 3000.87129602,
       2104.43953844,  817.78758568])

In [50]:
sub = pd.DataFrame({"id": test.index, "price": y_new_data})
sub.describe()

Unnamed: 0,id,price
count,13485.0,13485.0
mean,6742.0,3948.029452
std,3892.928525,3954.073616
min,0.0,364.672886
25%,3371.0,942.128646
50%,6742.0,2438.755571
75%,10113.0,5296.411988
max,13484.0,18397.171484


In [None]:
# sub["price"].clip(lower=300, upper=20000, inplace=True)

In [51]:
sub.to_csv("../data/submission/sub_final.csv", index=False)