In [1]:
from functions import *
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, Imputer, OneHotEncoder

## Retrieve and load the data

In [2]:
getHousingData( url = HOUSING_URL, path = HOUSING_PATH )

In [3]:
housing = loadData()
housing['ocean_proximity'], categories = housing['ocean_proximity'].factorize()

housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0


## Make a training/test split

In [4]:
#Divide by 1.5 to limit the number of income categories
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)

#Label those above 5 as 5
housing["income_cat"].where( housing["income_cat"] < 5, 5.0, inplace = True )

split = StratifiedShuffleSplit( n_splits = 1, test_size = 0.2, random_state = 42 )
for trainIndex, testIndex in split.split( housing, housing["income_cat"] ):
    trainSet = housing.loc[trainIndex]
    testSet = housing.loc[testIndex]

In [5]:
for set_ in ( trainSet, testSet):
    set_.drop("income_cat", axis=1, inplace=True)
    
trainLabels = trainSet["median_house_value"].copy()
testLabels = testSet["median_house_value"].copy()

## Define pipeline

In [6]:
housing_num = testSet.drop('ocean_proximity', axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

numPipeline = Pipeline([ ('selector', DataFrameSelector(num_attribs)),
                         ('imputer', Imputer(strategy="median")),
                         ('attribsAdder', CombinedAttributesAdder()),
                         ('stdScaler', StandardScaler()), ])

catPipeline = Pipeline( [ ('selector', DataFrameSelector(cat_attribs)),
                          ('catEncoder', OneHotEncoder(sparse = False)), ])

fullPipeline = FeatureUnion( transformer_list = [ ("numPipeline", numPipeline),
                                                  ("catPipeline", catPipeline), ] )

In [7]:
prepTrain = fullPipeline.fit_transform(trainSet)

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal

svr = SVR()

params = { "kernel" : [ "linear", "rbf" ],
            "C" : reciprocal(20, 200000),
            "gamma" : expon(scale=1.0)
         }

rndSearch = RandomizedSearchCV( svr, param_distributions = params,
                                n_iter = 50, cv = 5, scoring='neg_mean_squared_error',
                                verbose = 2, n_jobs = 1 ) #random_state=42))

rndSearch.fit(prepTrain, trainLabels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] C=23.3118894003, gamma=0.639967336193, kernel=linear ............
[CV]  C=23.3118894003, gamma=0.639967336193, kernel=linear, total= 8.4min
[CV] C=23.3118894003, gamma=0.639967336193, kernel=linear ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.5min remaining:    0.0s


[CV]  C=23.3118894003, gamma=0.639967336193, kernel=linear, total= 8.5min
[CV] C=23.3118894003, gamma=0.639967336193, kernel=linear ............
[CV]  C=23.3118894003, gamma=0.639967336193, kernel=linear, total= 9.1min
[CV] C=23.3118894003, gamma=0.639967336193, kernel=linear ............
[CV]  C=23.3118894003, gamma=0.639967336193, kernel=linear, total= 8.2min
[CV] C=23.3118894003, gamma=0.639967336193, kernel=linear ............
[CV]  C=23.3118894003, gamma=0.639967336193, kernel=linear, total= 6.4min
[CV] C=57.7588881314, gamma=0.163357422105, kernel=linear ............
[CV]  C=57.7588881314, gamma=0.163357422105, kernel=linear, total=10.9min
[CV] C=57.7588881314, gamma=0.163357422105, kernel=linear ............
[CV]  C=57.7588881314, gamma=0.163357422105, kernel=linear, total=11.3min
[CV] C=57.7588881314, gamma=0.163357422105, kernel=linear ............
[CV]  C=57.7588881314, gamma=0.163357422105, kernel=linear, total=11.7min
[CV] C=57.7588881314, gamma=0.163357422105, kernel=linea

In [None]:
nmse = rndSearch.best_score_
rmse = np.sqrt(-nmse)
rmse

In [None]:
bestParams = rndSearch.best_params_
importances = rndSearch.best_estimator_.feature_importances_