In [180]:
%matplotlib inline
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer,StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline,FeatureUnion
from pandas.tools.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

In [181]:
data=pd.read_csv("handson-ml/datasets/housing/housing.csv")

In [182]:
data_copy=data.copy()
target=data_copy.pop("median_house_value")
X_train,X_test,y_train,y_test=train_test_split(data_copy,target,test_size=0.2,random_state=42)
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [18]:
print(X_train.shape)

(16512, 10)


In [19]:
print(X_test.shape)

(4128, 10)


In [None]:
data['rooms_household']=data['total_rooms']/data['households']
data['bedrooms_rooms']=data['total_rooms']/data['total_bedrooms']
matrix=data.corr()
print(matrix['median_house_value'].sort_values(ascending=False))

scatter_matrix(data[['median_house_value','median_income','bedrooms_rooms','rooms_household','housing_median_age','latitude','longitude']],figsize=(12,8))

In [183]:
class DataFrame_Selector(BaseEstimator,TransformerMixin):
    
    def __init__(self,columns): 
        self.columns=columns
        
    def fit(self,X,y=None):
        return self # nothing to do 
    
    def transform(self,X,y=None):
        return X[self.columns].values

In [184]:
class Label_Hot_Encoder(BaseEstimator,TransformerMixin):
    
    def __init__(self,columns):
        self.columns=columns
        
        
    def fit(self,X,y=None):
        return self
    
    def transform(self,X,y=None):
        
        data=pd.DataFrame(X,columns=self.columns)
        
        for col in self.columns:
            one_hot=pd.DataFrame(data.pop(col))
            one_hot["value"]=1
            one_hot=one_hot.pivot(index=None,columns=col,values="value")
            one_hot.columns.name=None
            one_hot=one_hot.fillna(0)
            data=data.merge(one_hot,left_index=True,right_index=True)
        return data.values

In [185]:
class AttributeAdder(BaseEstimator,TransformerMixin):
    def __init__(self,columns):
        self.columns=columns
    
    def fit(self, X,y=None):
        return self
    def transform(self, X,y=None):
        rooms_household=X[:,self.columns.index('total_rooms')]/X[:,self.columns.index('households')]
        bedrooms_rooms=X[:,self.columns.index('total_rooms')]/X[:,self.columns.index('total_bedrooms')]
        pop_hhold=X[:,self.columns.index('population')]/X[:,self.columns.index('households')]
        
        
        return np.c_[X,rooms_household,bedrooms_rooms,pop_hhold]
        

In [186]:
num_data=X_train.drop(["ocean_proximity","latitude","longitude"],axis=1)
num_columns=list(num_data)
cat_columns=['ocean_proximity']
num_pipeline=Pipeline([("selector",DataFrame_Selector(num_columns)),
                        ("imputer",Imputer(strategy="median")),
                        ("feat_adder",AttributeAdder(num_columns)),
                        ("scaler",StandardScaler())])

cat_pipeline=Pipeline([("selector",DataFrame_Selector(cat_columns)),
                        ("encoder",Label_Hot_Encoder(cat_columns))])

full_pipeline=FeatureUnion(transformer_list=[("num_pipeline",num_pipeline),
                                            ("cat_pipeline",cat_pipeline)])






In [187]:
X_train_clean=full_pipeline.fit_transform(X_train)
X_test_clean=full_pipeline.transform(X_test)

In [53]:
print(X_train_clean[:,9:15])

[[ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 [ 0.  0.  0.  1.]
 ..., 
 [ 0.  0.  0.  0.]
 [ 0.  0.  0.  0.]
 [ 0.  0.  1.  0.]]


In [142]:
lin_model=LinearRegression()
lin_scores=np.sqrt(-1*cross_val_score(lin_model,X_train_clean,np.c_[y_train],cv=10,scoring='neg_mean_squared_error'))
print("Mean cross val score: "+str(np.mean(lin_scores)))
print("Std deviation scores: "+str(np.std(lin_scores)))

Mean cross val score: 69363.4373762
Std deviation scores: 2368.32051251


In [188]:
forest_model=RandomForestRegressor(n_estimators=30,max_features=8)
forest_scores=np.sqrt(-1*cross_val_score(forest_model,X_train_clean,np.c_[y_train].ravel(),cv=10,scoring='neg_mean_squared_error'))
print("Mean cross val score: "+str(np.mean(forest_scores)))
print("Std deviation scores: "+str(np.std(forest_scores)))

Mean cross val score: 58874.4379231
Std deviation scores: 1499.83542798


In [145]:
neigh_model=KNeighborsRegressor(n_neighbors=5)
neigh_scores=np.sqrt(-1*cross_val_score(neigh_model,X_train_clean,np.c_[y_train].ravel(),cv=10,scoring='neg_mean_squared_error'))
print("Mean cross val score: "+str(np.mean(neigh_scores)))
print("Std deviation scores: "+str(np.std(neigh_scores)))

Mean cross val score: 65548.7835912
Std deviation scores: 1567.53507729


In [149]:
param_grid={'n_estimators':[5,10,20,30],'max_features':[2,4,6,8]}
grid_search=GridSearchCV(forest_model,param_grid,cv=5,scoring='neg_mean_squared_error')
grid_search.fit(X_train_clean,np.c_[y_train].ravel())

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 20, 30], 'max_features': [2, 4, 6, 8]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=0)

In [193]:
#Ensembling

forest_model=RandomForestRegressor(n_estimators=30,max_features=6)
forest_model.fit(X_train_clean,np.c_[y_train].ravel())


neigh_model=KNeighborsRegressor(n_neighbors=5)
neigh_model.fit(X_train_clean,np.c_[y_train].ravel())

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [194]:
out_forest=forest_model.predict(X_test_clean)
print(np.sqrt(mean_squared_error(np.c_[y_test].ravel(),out_forest)))

59875.1848802


In [191]:
out_neigh=neigh_model.predict(X_test_clean)
print(np.sqrt(mean_squared_error(np.c_[y_test].ravel(),out_neigh)))

67215.5608281


In [192]:
output_1=forest_model.predict(X_test_clean)
output_2=neigh_model.predict(X_test_clean)
output=(output_1+output_2)/2

print(np.sqrt(mean_squared_error(np.c_[y_test].ravel(),output)))

60726.6768259
