In [47]:
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import pandas as pd
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score
import time

#StackingEstimator = function call
#BaseEstimator = All estimators should specify all the parameters that can be set at the class level in their __init__
#TransformerMixin = 
class StackingEstimator(BaseEstimator, TransformerMixin):
    
    #Since __init__ is used, StackingEstimator = The data is expected to be stored in a 2D data structure
        #, where the first index is over features and the second is over samples. i.e.>> len(data[key]) == n_samples
    def __init__(self, estimator):
        self.estimator = estimator
        
    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
    
        #add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        #add class prediction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed

In [48]:
time_start = time.time()

train = pd.read_csv (r"C:\Users\Osula\Documents\Projects\Mercedes-Benz\train.csv")
test = pd.read_csv (r"C:\Users\Osula\Documents\Projects\Mercedes-Benz\test.csv")
print(train.head(), train.shape, test.shape)

   ID       y  X0 X1  X2 X3 X4 X5 X6 X8  ...   X375  X376  X377  X378  X379  \
0   0  130.81   k  v  at  a  d  u  j  o  ...      0     0     1     0     0   
1   6   88.53   k  t  av  e  d  y  l  o  ...      1     0     0     0     0   
2   7   76.26  az  w   n  c  d  x  j  x  ...      0     0     0     0     0   
3   9   80.62  az  t   n  f  d  x  l  e  ...      0     0     0     0     0   
4  13   78.02  az  v   n  f  d  h  d  n  ...      0     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     1     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 378 columns] (4209, 378) (4209, 377)


In [49]:
# find all categorical features
cf = train.select_dtypes(include=['object']).columns
print(cf)

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype='object')


In [50]:
#Apply LabelEncoder to categorical features
for c in train.columns:
    if train[c].dtype == 'object':
        lbl = LabelEncoder() 
        lbl.fit(list(train[c].values) + list(test[c].values))
        train[c] = lbl.transform(list(train[c].values))
        test[c] = lbl.transform(list(test[c].values))
        
print('train: ', train.head(), train.shape)
print('test: ', test.head(), test.shape)

train:     ID       y  X0  X1  X2  X3  X4  X5  X6  X8  ...   X375  X376  X377  X378  \
0   0  130.81  37  23  20   0   3  27   9  14  ...      0     0     1     0   
1   6   88.53  37  21  22   4   3  31  11  14  ...      1     0     0     0   
2   7   76.26  24  24  38   2   3  30   9  23  ...      0     0     0     0   
3   9   80.62  24  21  38   5   3  30  11   4  ...      0     0     0     0   
4  13   78.02  24  23  38   5   3  14   3  13  ...      0     0     0     0   

   X379  X380  X382  X383  X384  X385  
0     0     0     0     0     0     0  
1     0     0     0     0     0     0  
2     0     0     1     0     0     0  
3     0     0     0     0     0     0  
4     0     0     0     0     0     0  

[5 rows x 378 columns] (4209, 378)
test:     ID  X0  X1  X2  X3  X4  X5  X6  X8  X10  ...   X375  X376  X377  X378  \
0   1  24  23  38   5   3  26   0  22    0  ...      0     0     0     1   
1   2  46   3   9   0   3   9   6  24    0  ...      0     0     1     0   
2   3 

In [51]:
for levels in train[cf].columns:
    print ("---- %s ----" % levels)
    print (train[levels].value_counts())

---- X0 ----
52    360
10    349
51    324
23    313
46    306
50    300
41    269
32    227
40    195
49    182
36    181
24    175
9     151
45    106
15    103
34     75
30     73
11     67
48     36
6      35
39     34
8      34
31     32
26     27
18     25
0      21
22     19
12     18
35     18
16     18
47     17
21     16
38     16
4      14
37     11
25     11
19     11
17     10
44     10
28      6
14      4
29      3
43      2
1       2
33      1
2       1
3       1
Name: X0, dtype: int64
---- X1 ----
1     833
20    598
3     592
13    590
23    408
19    251
10    203
0     143
4     121
16     82
24     52
26     46
22     37
6      33
14     32
21     31
9      29
25     23
7      23
11     22
15     19
12     17
17      9
8       6
5       3
2       3
18      3
Name: X1, dtype: int64
---- X2 ----
19    1659
5      496
9      415
37     367
11     265
42     153
38     137
43      94
30      87
29      81
17      63
25      54
0       47
44      29
35      25
33      25

In [52]:
## drop columns with only one value
k = train.loc[:,(train.apply(pd.Series.nunique) == 1)].columns.tolist()
print(k)

['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']


In [53]:
train_drop = train.drop(k, axis = 1)
test_drop = test.drop(k, axis = 1)

In [54]:
#save columns list before adding the decomposition components
#list = Converts a tuple/sequence into list.
#set = creating a list that ignores all duplicates
usable_columns = list(set(train.columns) - set(['y']))
print(usable_columns)

['X290', 'X330', 'X187', 'X323', 'X209', 'X256', 'X125', 'X224', 'X26', 'X152', 'X139', 'X37', 'X44', 'X147', 'X11', 'X231', 'X301', 'X237', 'X165', 'X366', 'X196', 'X340', 'X200', 'X279', 'X385', 'X122', 'X167', 'X228', 'X274', 'X371', 'X91', 'X159', 'X251', 'X281', 'X84', 'X318', 'X184', 'X14', 'X273', 'X183', 'X331', 'X189', 'X40', 'X150', 'X246', 'X336', 'X16', 'X4', 'X17', 'X46', 'X223', 'X247', 'X284', 'X363', 'X73', 'X195', 'X204', 'X162', 'X173', 'X21', 'X257', 'X82', 'X357', 'X212', 'X156', 'X169', 'X270', 'X378', 'X232', 'X74', 'X78', 'X203', 'X110', 'X131', 'X182', 'X27', 'X218', 'X297', 'X311', 'X133', 'X118', 'X24', 'X320', 'ID', 'X52', 'X382', 'X42', 'X98', 'X230', 'X123', 'X288', 'X197', 'X32', 'X277', 'X179', 'X59', 'X135', 'X53', 'X265', 'X263', 'X130', 'X113', 'X63', 'X250', 'X18', 'X164', 'X208', 'X56', 'X99', 'X226', 'X337', 'X344', 'X329', 'X291', 'X34', 'X317', 'X264', 'X254', 'X142', 'X35', 'X127', 'X117', 'X275', 'X328', 'X260', 'X45', 'X170', 'X276', 'X77', 'X1

## Principal component analysis (PCA)
**Linear dimensionality reduction** using **Singular Value Decomposition** of the data to project it to a lower dimensional space.

It uses the **LAPACK** implementation of the **_full SVD_** or a **_randomized truncated SVD_** by the method of Halko et al. 2009, depending on the *shape of the input data* and the *number of components* to extract.

It can also use the **scipy.sparse.linalg ARPACK** implementation of the **_truncated SVD_**.

Notice that this class **_does not_** support **sparse input**. See **_TruncatedSVD_** for an alternative with sparse data.

In [55]:
## Adding in PCA, FA, etc.
from sklearn.decomposition import PCA
n_comp = 10 #Number of components to keep, if not set all components are kept: n_components == min(n_samples, n_features)
                #if n_components == ‘mle’ and svd_solver == ‘full’
                #, Minka’s MLE is used to guess the dimension if 0 < n_components < 1 and svd_solver == ‘full’
                #, select the number of components such that the amount of variance that needs to be explained 
                #is greater than the percentage specified by n_components n_components cannot be equal to n_features 
                #for svd_solver == ‘arpack’.
                
r_state = 2017 #random_state = Pseudo Random Number generator seed control. If None, use the numpy.random singleton. 
                    #Used by svd_solver == ‘arpack’ or ‘randomized’.
    
pca = PCA(n_components=n_comp, random_state = r_state)
pca2_results_train = pca.fit_transform(train_drop.drop(["y"], axis=1))
pca2_results_test = pca.transform(test_drop)
print('train: ', pca2_results_train, pca2_results_train.shape)
print('test: ', pca2_results_test, pca2_results_test.shape)

train:  [[ -4.20591806e+03   3.74113607e-03  -3.95586587e-02 ...,   4.10594534e+00
    1.64261012e+00  -4.98593291e-01]
 [ -4.19990961e+03  -5.32835499e-02   1.78049019e+00 ...,  -4.77141762e-01
    9.30734173e-01  -6.26763229e-01]
 [ -4.19891168e+03   1.64742769e+01   1.38064828e+01 ...,   1.17196099e+00
    1.70121051e+00  -4.36638237e-01]
 ..., 
 [  4.20599752e+03   3.08713176e+01   1.60994784e+01 ...,   3.14349853e+00
   -3.35521928e-01   1.04878608e-01]
 [  4.20899773e+03   2.54358916e+01   2.87739952e+00 ...,  -3.00067866e+00
    1.61686969e+00   2.07532238e+00]
 [  4.21099776e+03  -1.90654871e+01  -1.08794911e+01 ...,   9.30625134e-01
    6.80522968e-01   6.37402084e-01]] (4209, 10)
test:  [[ -4.20492117e+03   1.64075037e+01   1.36275938e+01 ...,  -1.62546766e+00
    2.41023593e+00   1.39799383e+00]
 [ -4.20396516e+03  -1.54761828e+01  -9.29573483e+00 ...,   4.02584553e+00
    1.99656288e+00  -4.20565776e-01]
 [ -4.20298015e+03   1.27465266e+01  -4.12623303e+00 ...,  -9.94777470

In [61]:
for i in range(0, n_comp):
    print(pca2_results_train[:,i])

[-4205.91805677 -4199.90960823 -4198.91167793 ...,  4205.99752281
  4208.99773408  4210.99776075]
[  3.74113607e-03  -5.32835499e-02   1.64742769e+01 ...,   3.08713176e+01
   2.54358916e+01  -1.90654871e+01]
[ -0.03955866   1.78049019  13.80648279 ...,  16.09947845   2.87739952
 -10.87949109]
[ 13.23697213  11.42255123  11.67891572 ...,   3.88662363   1.88931611
  11.50243592]
[ -4.33478539  -5.08774191 -15.07352322 ...,   9.27298592  -6.05284658
  -5.08192779]
[-21.25489826 -25.18849186 -23.0570205  ...,  22.51485373  24.59409031
  25.05360854]
[-2.75236339 -4.5074752  -2.23281044 ...,  0.81964321 -4.29653933
  0.57595582]
[ 4.10594534 -0.47714176  1.17196099 ...,  3.14349853 -3.00067866
  0.93062513]
[ 1.64261012  0.93073417  1.70121051 ..., -0.33552193  1.61686969
  0.68052297]
[-0.49859329 -0.62676323 -0.43663824 ...,  0.10487861  2.07532238
  0.63740208]


In [34]:
from sklearn.decomposition import FactorAnalysis

FA = FactorAnalysis(n_components=n_comp, random_state = r_state)
FA_results_train = FA.fit_transform(train_drop.drop(["y"], axis=1))
FA_results_test = FA.transform(test_drop)
print('train: ', FA_results_train)
print('test: ', FA_results_test)

train:  [[ 1.01752745  0.32106122  0.24191141 ..., -0.26241573  8.68341608
   1.86284317]
 [ 0.54053354  1.18222149  0.01315693 ..., -0.12814816 -0.22289462
   1.71872336]
 [ 0.932777    2.19708049  0.73814656 ...,  0.03552488  0.1989201
   1.75451444]
 ..., 
 [ 0.54053271  1.18222281  0.01315536 ..., -0.12814629 -0.22285169
  -1.73564192]
 [-0.77868697  0.58024745 -1.1395795  ...,  0.04310846 -0.01572709
  -1.66080455]
 [-0.77868697  0.58024745 -1.1395795  ...,  0.04310846 -0.01572708
  -1.66162654]]
test:  [[ 0.932777    2.19708049  0.73814656 ...,  0.03552488  0.19892007
   1.75698017]
 [-1.63626944 -0.02871052  1.34605866 ...,  0.15177823  0.0779865
   1.72778262]
 [ 1.69574441  0.32042195  0.67617552 ..., -0.14550339  0.23878725
   1.69383851]
 ..., 
 [-0.01571956 -1.29641109 -1.20155054 ..., -0.1379198   0.02414007
  -1.72230396]
 [ 1.30350012 -0.69443573 -0.04881568 ..., -0.30917455 -0.18298451
  -1.79878417]
 [-0.77868697  0.58024745 -1.1395795  ...,  0.04310846 -0.01572709
  -

### Truncated Singular Value Decomposition (SVD)
Since our data input is spare, we try the Truncated SVD as a good alternative.

The **TruncatedSVD** transformer performs *linear dimensionality reduction* by means of **truncated singular value decomposition (SVD)**. Contrary to *PCA*, this estimator **_does not_** center the data before computing the singular value decomposition. This means it **_can work_** with **scipy.sparse matrices** efficiently.

In particular, truncated SVD **_works on_** term *count/tf-idf matrices* as returned by the vectorizers in **sklearn.feature_extraction.text**. In that context, it is known as **latent semantic analysis (LSA)**.

This estimator **_supports_** *two algorithms*: a fast **randomized SVD solver**, and a *“naive” algorithm* that uses **ARPACK** as an eigensolver on (X * X.T) or (X.T * X), whichever is more efficient.

In [35]:
from sklearn.decomposition import FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state = r_state)
tsvd_results_train = tsvd.fit_transform(train_drop.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test_drop)

### Independent Component Analysis (ICA)
**FastICA**: a fast algorithm for Independent Component Analysis.

**Independent component analysis** separates a *multivariate signal* into *additive subcomponents* that are **_maximally independent_**. It is implemented in scikit-learn using the **Fast ICA** algorithm. Typically, ICA is **_not used_** for *reducing dimensionality* but *__for separating superimposed signals__*. 

Since the ICA model __*does not*__ include a noise term, for the model to be correct, whitening **_must_** be applied. This can be done internally using the **whiten argument** or manually using one of the **PCA variants**.

It is classically used to *separate mixed signals* (a problem known as *blind source separation*) and can also be used as yet another *non linear decomposition* that finds components with **_some sparsity_**.

In [36]:
# ICA
ica = FastICA(n_components=n_comp, random_state = r_state)
ica2_results_train = ica.fit_transform(train_drop.drop(["y"], axis=1))
ica2_results_test = ica.transform(test_drop)

In [59]:
print(ica2_results_train, ica2_results_train.shape)

[[ 0.00385651 -0.01110805 -0.05618819 ...,  0.00307634 -0.00470934
  -0.01422798]
 [ 0.00909563 -0.00259149 -0.06391768 ...,  0.00133614 -0.00620145
  -0.02523413]
 [ 0.00784402  0.00400473 -0.06131861 ..., -0.0139092  -0.02396659
  -0.01549358]
 ..., 
 [-0.00869924  0.0180974   0.05986589 ..., -0.00461554  0.0182967
   0.00522132]
 [-0.0086041  -0.00609768  0.0599258  ...,  0.01094057 -0.01884846
  -0.01673516]
 [-0.00746153 -0.02025349  0.06081357 ...,  0.01074364 -0.02179523
   0.0071692 ]] (4209, 10)


In [60]:
for i in range(0, n_comp):
    print(ica2_results_train[:,i])

[ 0.00385651  0.00909563  0.00784402 ..., -0.00869924 -0.0086041
 -0.00746153]
[-0.01110805 -0.00259149  0.00400473 ...,  0.0180974  -0.00609768
 -0.02025349]
[-0.05618819 -0.06391768 -0.06131861 ...,  0.05986589  0.0599258
  0.06081357]
[ 0.00768804  0.00711497 -0.00637006 ..., -0.02063642 -0.02786758
  0.01874777]
[ 0.03317202 -0.00065743  0.01162993 ...,  0.02376639 -0.02780403
  0.00559164]
[-0.01560285 -0.00717329 -0.02654893 ..., -0.01562375 -0.02403196
 -0.00111385]
[ 0.02022935  0.02037331  0.02278592 ...,  0.01579916  0.01044916
  0.01238623]
[ 0.00307634  0.00133614 -0.0139092  ..., -0.00461554  0.01094057
  0.01074364]
[-0.00470934 -0.00620145 -0.02396659 ...,  0.0182967  -0.01884846
 -0.02179523]
[-0.01422798 -0.02523413 -0.01549358 ...,  0.00522132 -0.01673516
  0.0071692 ]


### Gaussian Random Projection (GRP)
*Reduce dimensionality* through Gaussian random projection: the components of the *random matrix* are drawn from 
**N(0, 1 / n_components)**.

The **sklearn.random_projection.GaussianRandomProjection** reduces the dimensionality by projecting the original input space on a *randomly generated matrix* where components are drawn from the following **distribution N(0, 1\n_components)**.

In [37]:
# GRP
#esp = strictly positive float, optional (default=0.1). Parameter to control the quality of the embedding according to the 
    #Johnson-Lindenstrauss lemma when n_components is set to ‘auto’. 
    #Smaller values lead to better embedding and higher number of dimensions (n_components) in the target projection space.
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state = r_state)
grp_results_train = grp.fit_transform(train_drop.drop(["y"], axis=1))
grp_results_test = grp.transform(test_drop)

### Sparse Random Projection (SRP)
*Reduce dimensionality* through sparse random projection

**Sparse random matrix**s an alternative to *dense random projection matrix* that guarantees similar embedding quality while being much more __*memory efficient*__ and allowing **_faster computation_** of the projected data.

If we note **s = 1 / density** the components of the *random matrix* are drawn from:
- **-sqrt(s) / sqrt(n_components)** with probability **1 / 2s**
- 0 with probability **1 - 1 / s**
- **+sqrt(s) / sqrt(n_components)** with probability **1 / 2s**

In [38]:
# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state = r_state)
srp_results_train = srp.fit_transform(train_drop.drop(["y"], axis=1))
srp_results_test = srp.transform(test_drop)

In [62]:
# Append decomposition components to datasets
for i in range(0, n_comp):
    train_drop['pca_' + str(i)] = pca2_results_train[:,i]
    test_drop['pca_' + str(i)] = pca2_results_test[:, i]
    
    #train_drop['fa' + str(i)] = FA_results_train[:,i-1]
    #test_drop['fa' + str(i)] = FA_results_test[:, i-1]

    train_drop['ica_' + str(i)] = ica2_results_train[:,i]
    test_drop['ica_' + str(i)] = ica2_results_test[:, i]

    #train_drop['tsvd_' + str(i)] = tsvd_results_train[:,i-1]
    #test_drop['tsvd_' + str(i)] = tsvd_results_test[:, i-1]

    #train_drop['grp_' + str(i)] = grp_results_train[:,i-1]
    #test_drop['grp_' + str(i)] = grp_results_test[:, i-1]

    #train_drop['srp_' + str(i)] = srp_results_train[:,i-1]
    #test_drop['srp_' + str(i)] = srp_results_test[:, i-1]


In [63]:
train_drop.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,pca_5,ica_5,pca_6,ica_6,pca_7,ica_7,pca_8,ica_8,pca_9,ica_9
0,0,130.81,37,23,20,0,3,27,9,14,...,-21.254898,-0.015603,-2.752363,0.020229,4.105945,0.003076,1.64261,-0.004709,-0.498593,-0.014228
1,6,88.53,37,21,22,4,3,31,11,14,...,-25.188492,-0.007173,-4.507475,0.020373,-0.477142,0.001336,0.930734,-0.006201,-0.626763,-0.025234
2,7,76.26,24,24,38,2,3,30,9,23,...,-23.057021,-0.026549,-2.23281,0.022786,1.171961,-0.013909,1.701211,-0.023967,-0.436638,-0.015494
3,9,80.62,24,21,38,5,3,30,11,4,...,-25.484677,-0.027332,-4.362057,0.019089,-1.8883,-0.004061,2.218975,0.016281,0.212048,-0.025615
4,13,78.02,24,23,38,5,3,14,3,13,...,-8.54808,-0.032017,3.716605,0.020845,-1.74161,-0.014339,2.17468,-0.001974,1.2821,0.017934


In [58]:
## My response variables
y_train = train["y"]
y_train.head()

0    130.81
1     88.53
2     76.26
3     80.62
4     78.02
Name: y, dtype: float64

In [59]:
## My baseline prediction: an average of the y-values
y_mean = np.mean(y_train)
y_median = np.median(y_train)
print('mean: ', y_mean)
print('median: ', y_median)

mean:  100.66931812782121
median:  99.15


In [68]:
id_test = test['ID'].values
print('id_test: ', id_test)

#finaltrainset and finaltestset are data to be used only the stacked model (does not contain PCA, SVD... arrays) 
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values
print('train: ', finaltrainset, finaltrainset.shape)

id_test:  [   1    2    3 ..., 8413 8414 8416]
train:  [[0 0 1 ..., 0 0 0]
 [0 0 1 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 1 ..., 1 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]] (4209, 377)


In [61]:
'''Train the xgb model then predict the test data'''

# Prepare dict of params for XGBoost to run with
xgb_params = {
    ## Number of Trees
    'n_trees': 395, 
    ## Learning Rate; default = 0.3
    'eta': 0.0065,
    ## Depth of Trees
    'max_depth': 3,
    ## Bagging 50% of the training set
    'subsample': 0.50,
    #'colsample_bytree': 0.75,
    'min_child_weight': 34,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    # Base Prediction = mean(target)
    'base_score': y_mean,
    'silent': 1
}
# NOTE: Make sure that the class is labeled 'class' in the data file

In [62]:
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

In [63]:
cv_result = xgb.cv(xgb_params, 
                   dtrain, 
                   num_boost_round=1500,
                   #nfold=10,
                   early_stopping_rounds=25,
                   verbose_eval=50, 
                   show_stdv=False,
                   seed=2017
                  )

[0]	train-rmse:12.6296	test-rmse:12.6244
[50]	train-rmse:10.7971	test-rmse:10.7987
[100]	train-rmse:9.69451	test-rmse:9.70491
[150]	train-rmse:9.05374	test-rmse:9.07756
[200]	train-rmse:8.6935	test-rmse:8.73086
[250]	train-rmse:8.49311	test-rmse:8.54569
[300]	train-rmse:8.37528	test-rmse:8.44598
[350]	train-rmse:8.30265	test-rmse:8.39152
[400]	train-rmse:8.25229	test-rmse:8.36468
[450]	train-rmse:8.21382	test-rmse:8.35007
[500]	train-rmse:8.18218	test-rmse:8.34125
[550]	train-rmse:8.15497	test-rmse:8.33481
[600]	train-rmse:8.12776	test-rmse:8.33175


In [64]:
num_boost_rounds = len(cv_result)
num_boost_rounds

594

In [65]:
## Train model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)
y_pred = model.predict(dtest)

In [66]:
'''Train the stacked models then predict the test data'''

stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(
        learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55,
        min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV()
)

In [67]:
stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)

  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))




In [69]:
'''R2 Score on the entire Train data when averaging'''

print('R2 Score on Training Set:')
print(r2_score(y_train,stacked_pipeline.predict(finaltrainset)*0.2855 + model.predict(dtrain)*0.7145))
print('Time Elapsed: {}'.format(time.time() - time_start))

R2 Score on Training Set:
0.583184414444
Time Elapsed: 26.8678035736084


In [70]:
'''Average the preditionon test data  of both models then save it on a csv file'''

sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred*0.75 + results*0.25
sub.to_csv('xgboost_Stacked2_Fudged.csv', index=False)