In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split 

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV

In [4]:
concrete_data = pd.read_csv('Concrete_Data_Yeh.csv')
concrete_data.head()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [5]:
X = concrete_data.drop('csMPa', axis =1)
Y = concrete_data['csMPa']

In [7]:
X = X.drop(['flyash', 'coarseaggregate','fineaggregate'], axis=1)

In [8]:
x_train ,x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [10]:
baseline = GradientBoostingRegressor(max_depth=3, n_estimators=50)
baseline.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [12]:
y_pred = baseline.predict(x_test)
r2_score(y_test, y_pred)

0.8676408063940424

In [13]:
important_features = pd.Series(baseline.feature_importances_, index=X.columns).\
                    sort_values(ascending=False)

important_features


age                 0.382224
cement              0.347722
water               0.097515
slag                0.097084
superplasticizer    0.075455
dtype: float64

In [15]:
gbr = GradientBoostingRegressor(max_depth=3)

In [16]:
parameters ={'n_estimators':[1,5,10,50,100,200,300,400,500]}
 =GridSearchCV(estimator=gbr, param_grid=parameters, cv=3)

In [17]:
gridsearch_reg.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_sampl...=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_estimators': [1, 5, 10, 50, 100, 200, 300, 400, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
gridsearch_reg.best_params_

{'n_estimators': 500}

In [19]:
gbr_best = GradientBoostingRegressor(max_depth=3, n_estimators = gridsearch_reg.best_params_['n_estimators'])
gbr_best.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [20]:
y_pred = gbr_best.predict(x_test)
r2_score(y_test, y_pred)

0.9321245639615986

In [22]:
gbr_best = GradientBoostingRegressor(max_depth=3, warm_start=True)

In [30]:
min_val_error = float("inf")
error_increasing = 0

for n_estimators in range (1,1000):
    gbr.n_estimators = n_estimators
    gbr.fit(x_train, y_train)
    
    y_pred = gbr.predict(x_test)
    val_error = mean_squared_error(y_test, y_pred)
    
    print('No. of estimators:', gbr.n_estimators)
    print('Validation error', val_error)
                 
    if val_error<min_val_error:
                 min_val_error = val_error 
                 
    else:
        error_increasing+=1
        if error_increasing ==10:
            break

    

No. of estimators: 1
Validation error 245.51245081171714
No. of estimators: 2
Validation error 220.36326337530352
No. of estimators: 3
Validation error 199.78988304013953
No. of estimators: 4
Validation error 178.11169887772397
No. of estimators: 5
Validation error 160.04085885670062
No. of estimators: 6
Validation error 146.5732501113348
No. of estimators: 7
Validation error 134.6639559299548
No. of estimators: 8
Validation error 124.63317886163496
No. of estimators: 9
Validation error 116.29815254782477
No. of estimators: 10
Validation error 109.71325122362087
No. of estimators: 11
Validation error 103.99307520328775
No. of estimators: 12
Validation error 97.29943406750822
No. of estimators: 13
Validation error 92.10458394625957
No. of estimators: 14
Validation error 87.00784953236105
No. of estimators: 15
Validation error 82.3259316564592
No. of estimators: 16
Validation error 78.7572646093002
No. of estimators: 17
Validation error 75.01719827156019
No. of estimators: 18
Validation 

In [31]:
n_estimators

118

In [32]:
x_train ,x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [33]:
gbr_best = GradientBoostingRegressor(max_depth=3, n_estimators = n_estimators)

In [34]:
gbr_best.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=118, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

In [36]:
y_pred =gbr_best.predict(x_test)
r2_score(y_test, y_pred)

0.888077294967838