In [0]:
import pandas as pd
import requests
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
import math
from sklearn.metrics import mean_squared_error
import datetime
from sklearn.model_selection import GridSearchCV

In [0]:
train = pd.read_csv('diamond_train_categoric.csv', index_col=0)
test = pd.read_csv('diamond_test_categoric.csv', index_col=0)

In [44]:
train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,price
0,1.21,5,3,4,63.0,57.0,6.73,6.7,4.23,6134
1,0.28,3,7,6,64.0,56.0,4.14,4.17,2.66,532
2,0.42,4,5,5,61.2,58.0,4.86,4.82,2.96,1103
3,0.26,5,3,8,61.1,57.0,4.16,4.12,2.53,600
4,1.1,2,4,3,63.4,57.0,6.52,6.55,4.14,4997


**Discarded models**

- StackingRegressor(estimators=estimators,final_estimator=RandomForestRegressor(n_estimators=10,random_state=42(RMSE=1007.458)
- LinearRegression() (RMSE=1224.332)
- Linear_model.SGDRegressor(max_iter=1000, tol=1e-3) (RMSE=63728481.219)
- DecisionTreeRegressor(random_state=0) (RMSE=495.5739)
- KNeighborsRegressor() (RMSE=601.5956)
- Linear_model.Lasso(alpha=0.1) (RMSE=1178.4844)
- AdaBoostRegressor(random_state=0, n_estimators=100) (RMSE=1293.2789)

In [45]:
X = train.drop(columns=['price'])
y = train['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(32276, 9) (8069, 9) (32276,) (8069,)


In [0]:
models = {
    "forest200": RandomForestRegressor(n_estimators=200),
    "forest400": RandomForestRegressor(n_estimators=400),
    "boosting_750": GradientBoostingRegressor(n_estimators=750),
    "boosting_800": GradientBoostingRegressor(n_estimators=800),
    "boosting_900": GradientBoostingRegressor(n_estimators=900),
    "boosting_950": GradientBoostingRegressor(n_estimators=950)
}

In [61]:
for name,m  in models.items():
  print(f"Starting training at {datetime.datetime.now()}")
  print(f"Training {name}...")
  m.fit(X_train, y_train)
  print(f"Train complete at {datetime.datetime.now()}")

Starting training at 2020-05-11 16:46:10.736427
Training forest200...
Train complete at 2020-05-11 16:46:38.780494
Starting training at 2020-05-11 16:46:38.780671
Training forest400...
Train complete at 2020-05-11 16:47:38.362335
Starting training at 2020-05-11 16:47:38.362694
Training boosting_750...
Train complete at 2020-05-11 16:48:06.614421
Starting training at 2020-05-11 16:48:06.614571
Training boosting_800...
Train complete at 2020-05-11 16:48:36.825893
Starting training at 2020-05-11 16:48:36.826041
Training boosting_900...
Train complete at 2020-05-11 16:49:10.656799
Starting training at 2020-05-11 16:49:10.657014
Training boosting_950...
Train complete at 2020-05-11 16:49:47.133178


In [62]:
printMetric = lambda label,value:print(f"\t {label}: {round(value,4)}")

for name,m in models.items():
  y_pred = m.predict(X_test)
  print(f"Evaluating model {name}")
  printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

Evaluating model forest200
	 RMSE: 552.6246
Evaluating model forest400
	 RMSE: 553.3494
Evaluating model boosting_750
	 RMSE: 544.5595
Evaluating model boosting_800
	 RMSE: 543.0186
Evaluating model boosting_900
	 RMSE: 541.9241
Evaluating model boosting_950
	 RMSE: 541.4763


In [0]:
model = GradientBoostingRegressor(n_estimators=950).fit(X, y)

In [0]:
y_pred_final = model.predict(test)

In [0]:
pred = pd.DataFrame(y_pred_final).reset_index()
pred.to_csv('submission14.csv', index=False, header=['id', 'price'])

In [53]:
pred.head()

Unnamed: 0,index,0
0,0,488.719823
1,1,1992.024454
2,2,9579.221841
3,3,527.136587
4,4,9456.498686


GridSearchCV



In [54]:
params = {'learning_rate':[0.05], 
           'n_estimators':[800],
           'min_samples_split':[50],
           'min_samples_leaf': [50],
           'subsample': [1],
           'max_depth': [6, 7]}

tuning_gradient = GridSearchCV(estimator=GradientBoostingRegressor(),
param_grid=params, scoring='neg_mean_squared_error',n_jobs=5, cv=5, verbose=2)

tuning_gradient.fit(X_train,y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:  3.0min remaining:  1.3min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:  3.1min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...
                            

In [0]:
y_pred_grid = tuning_gradient.predict(X_test)

In [56]:
printMetric("RMSE",math.sqrt(mean_squared_error(y_test, y_pred)))

	 RMSE: 541.8331


In [0]:
tuning_gradient.best_params_

{'learning_rate': 0.05,
 'max_depth': 7,
 'min_samples_leaf': 50,
 'min_samples_split': 50,
 'n_estimators': 800,
 'subsample': 1}

In [0]:
grid_model = tuning_gradient
grid_model.fit(X, y)

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   7 out of  10 | elapsed:  3.2min remaining:  1.4min
[Parallel(n_jobs=5)]: Done  10 out of  10 | elapsed:  3.3min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                                 criterion='friedman_mse',
                                                 init=None, learning_rate=0.1,
                                                 loss='ls', max_depth=3,
                                                 max_features=None,
                                                 max_leaf_nodes=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100,
                                                 n_iter_n...
                            

In [0]:
grid_predict = grid_model.predict(test)

In [0]:
prediction = pd.DataFrame(grid_predict).reset_index()

In [0]:
prediction.head()

Unnamed: 0,index,0
0,0,391.792435
1,1,1446.895125
2,2,9423.214712
3,3,519.84116
4,4,9623.590716


In [0]:
prediction.to_csv('submission15.csv', index=False, header=['id', 'price'])