## Calling lib files

In [1]:
%matplotlib inline
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor
digits = load_boston()



In [2]:
X = pd.DataFrame(digits.data, columns=digits.feature_names)
print(X.head())

      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31   0.0  0.538  6.575  65.2  4.0900  1.0  296.0   
1  0.02731   0.0   7.07   0.0  0.469  6.421  78.9  4.9671  2.0  242.0   
2  0.02729   0.0   7.07   0.0  0.469  7.185  61.1  4.9671  2.0  242.0   
3  0.03237   0.0   2.18   0.0  0.458  6.998  45.8  6.0622  3.0  222.0   
4  0.06905   0.0   2.18   0.0  0.458  7.147  54.2  6.0622  3.0  222.0   

   PTRATIO       B  LSTAT  
0     15.3  396.90   4.98  
1     17.8  396.90   9.14  
2     17.8  392.83   4.03  
3     18.7  394.63   2.94  
4     18.7  396.90   5.33  


In [3]:
y = pd.DataFrame(digits.target, columns=['Price'])
print(y.head())

   Price
0   24.0
1   21.6
2   34.7
3   33.4
4   36.2


## Checking for missing values

In [4]:
# No missing values !
print(X.apply(lambda x: sum(x.isnull())))
print('~'*25)
print(y.apply(lambda x: sum(x.isnull())))

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
dtype: int64
~~~~~~~~~~~~~~~~~~~~~~~~~
Price    0
dtype: int64


## Splitting data for training

In [5]:
X_t = X.iloc[-20:]
y_t = y.iloc[-20:]
y_t = np.ravel(y_t)
# this will be used for retest
print(X_t.shape, y_t.shape)

((20, 13), (20,))


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2)
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((379, 13), (127, 13), (379,), (127,))


## Fitting ensemble models

In [7]:
model = [AdaBoostRegressor(), GradientBoostingRegressor(), BaggingRegressor(), BaggingRegressor(), ExtraTreesRegressor(), RandomForestRegressor()]
for md in model:
    clf = md
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)*100
    print('%r \n\n model accuracy = %2f' % (md, accuracy) +' %\n'+'~'*100)    

AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None) 

 model accuracy = 83.881140 %
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False) 

 model accuracy = 89.445829 %
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0, warm_start=False) 

 model accuracy 

In [35]:
clf = GradientBoostingRegressor(loss='ls', learning_rate=0.15, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=3,max_depth=4)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)*100
print('%r \n\n model accuracy = %2f' % (md, accuracy) +' %\n'+'~'*100)    

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False) 

 model accuracy = 90.161160 %
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


In [9]:
prediction = clf.predict(X_t)

In [10]:
X_t['Prediction'] = prediction
X_t['Actuals'] = y_t

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [14]:
Diff = ((X_t.Prediction.values-X_t.Actuals.values)/X_t.Actuals.values) 
X_t['Diff'] = Diff

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


## Column 'Diff' shows the difference between the prediction and actual values

In [15]:
X_t

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Prediction,Actuals,Diff
486,5.69175,0.0,18.1,0.0,0.583,6.114,79.8,3.5459,24.0,666.0,20.2,392.68,14.98,18.666631,19.1,-0.022689
487,4.83567,0.0,18.1,0.0,0.583,5.905,53.2,3.1523,24.0,666.0,20.2,388.22,11.45,20.55861,20.6,-0.002009
488,0.15086,0.0,27.74,0.0,0.609,5.454,92.7,1.8209,4.0,711.0,20.1,395.09,18.06,13.706472,15.2,-0.098258
489,0.18337,0.0,27.74,0.0,0.609,5.414,98.3,1.7554,4.0,711.0,20.1,344.05,23.97,9.204003,7.0,0.314858
490,0.20746,0.0,27.74,0.0,0.609,5.093,98.0,1.8226,4.0,711.0,20.1,318.43,29.68,9.21226,8.1,0.137316
491,0.10574,0.0,27.74,0.0,0.609,5.983,98.8,1.8681,4.0,711.0,20.1,390.11,18.07,14.228967,13.6,0.046248
492,0.11132,0.0,27.74,0.0,0.609,5.983,83.5,2.1099,4.0,711.0,20.1,396.9,13.35,19.359647,20.1,-0.036834
493,0.17331,0.0,9.69,0.0,0.585,5.707,54.0,2.3817,6.0,391.0,19.2,396.9,12.01,20.772822,21.8,-0.047118
494,0.27957,0.0,9.69,0.0,0.585,5.926,42.6,2.3817,6.0,391.0,19.2,396.9,13.59,22.077904,24.5,-0.098861
495,0.17899,0.0,9.69,0.0,0.585,5.67,28.8,2.7986,6.0,391.0,19.2,393.29,17.6,21.116468,23.1,-0.085867
