In [1]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv("output/train_clean.csv")
predict = pd.read_csv("output/predict_clean.csv")

##### Train Test Split

In [3]:
X = train.drop(columns='price')
y = train.price

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(32364, 10)
(32364,)
(8091, 10)
(8091,)


In [4]:
print(f"-------HistGradientBoostingRegressor-------")
model = HistGradientBoostingRegressor()
model = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model,X,y, cv=10, n_jobs=-1)
print(f"HistGradientBoosting Accuracy {np.mean(scores)}")

-------HistGradientBoostingRegressor-------
RMSE 550.05
HistGradientBoosting Accuracy 0.9799864752450489


##### GridSearchCV

In [5]:
params={"loss": ['least_squares', 'least_absolute_deviation', 'poisson'],
        "max_depth":[10, 25, 50, 75, 100],
        "min_samples_leaf": [10, 20, 25, 30, 35, 40, 45, 50]}
hgbr = HistGradientBoostingRegressor()
grid = GridSearchCV(hgbr,verbose=1,n_jobs=-1,param_grid=params,cv=5)

In [6]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  8.3min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 11.9min finished


GridSearchCV(cv=5, estimator=HistGradientBoostingRegressor(), n_jobs=-1,
             param_grid={'loss': ['least_squares', 'least_absolute_deviation',
                                  'poisson'],
                         'max_depth': [10, 25, 50, 75, 100],
                         'min_samples_leaf': [10, 20, 25, 30, 35, 40, 45, 50]},
             verbose=1)

In [7]:
grid.best_params_

{'loss': 'poisson', 'max_depth': 100, 'min_samples_leaf': 10}

In [15]:
print(f"-------HistGradientBoostingRegressor-------")
model_grid = HistGradientBoostingRegressor(max_depth=25, min_samples_leaf=25) 
model_grid = model.fit(X_train,y_train)
y_pred = model_grid.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model_grid,X,y, cv=10, n_jobs=-1)
print(f"HistGradientBoosting Accuracy {np.mean(scores)}")

#subm_2.csv

-------HistGradientBoostingRegressor-------
RMSE 565.03
HistGradientBoosting Accuracy 0.9801965590229873


In [8]:
print(f"-------HistGradientBoostingRegressor-------")
model_grid = HistGradientBoostingRegressor(loss='poisson', max_depth=100, min_samples_leaf=10) 
model_grid = model.fit(X_train,y_train)
y_pred = model_grid.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model_grid,X,y, cv=10, n_jobs=-1)
print(f"HistGradientBoosting Accuracy {np.mean(scores)}")

-------HistGradientBoostingRegressor-------
RMSE 549.75
HistGradientBoosting Accuracy 0.9802148936541084


In [19]:
y_train_t = train.price
X_train_t = train.drop(columns='price')
X_test_t = predict

model3 = HistGradientBoostingRegressor(max_depth= 30, min_samples_leaf=2) 
model3 = model3.fit(X_train_t, y_train_t)
y_pred = model3.predict(X_test_t)

predict['price'] = y_pred
sub = predict[['id','price']]
sub.to_csv('subm_.csv', index=False, header=True)

##### GridSearchCV 2

In [9]:
params={"loss": ['least_squares', 'least_absolute_deviation', 'poisson'],
        "max_depth":[50, 75, 100, 125, 150, 200],
        "min_samples_leaf": [2, 5, 10, 20, 25, 30]}
hgbr = HistGradientBoostingRegressor()
grid = GridSearchCV(hgbr,verbose=1,n_jobs=-1,param_grid=params,cv=5)

In [10]:
grid.fit(X_train,y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   44.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 11.6min finished


GridSearchCV(cv=5, estimator=HistGradientBoostingRegressor(), n_jobs=-1,
             param_grid={'loss': ['least_squares', 'least_absolute_deviation',
                                  'poisson'],
                         'max_depth': [50, 75, 100, 125, 150, 200],
                         'min_samples_leaf': [2, 5, 10, 20, 25, 30]},
             verbose=1)

In [11]:
grid.best_params_

{'loss': 'least_squares', 'max_depth': 150, 'min_samples_leaf': 2}

In [12]:
print(f"-------HistGradientBoostingRegressor-------")
model_grid = HistGradientBoostingRegressor(loss='least_squares', max_depth=150, min_samples_leaf=2) 
model_grid = model.fit(X_train,y_train)
y_pred = model_grid.predict(X_test)
print("RMSE", round(np.sqrt(mean_squared_error(y_test,y_pred)),2))
scores = cross_val_score(model_grid,X,y, cv=10, n_jobs=-1)
print(f"HistGradientBoosting Accuracy {np.mean(scores)}")

-------HistGradientBoostingRegressor-------
RMSE 544.39
HistGradientBoosting Accuracy 0.9802441208379464
