In [2]:
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 10 10:50:58 2017

@author: Administrator
"""

import pandas as pd
import os
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR


def load_housing_data(housing_path="E:"):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

rowdata = load_housing_data()
train_set, test_set = train_test_split(rowdata, test_size=0.2, random_state=42)
housing = train_set.drop("median_house_value", axis=1)
housing_labels = train_set["median_house_value"].copy()




# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
#imputer = Imputer(strategy="median")
housing_num = housing.drop("ocean_proximity", axis=1)
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', LabelBinarizer()),
    ])
    
    

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    
housing_prepared = full_pipeline.fit_transform(housing)
param_grid = [
        {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},
        {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],
         'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},
    ]
svm_reg = SVR()
grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2, n_jobs=4)
grid_search.fit(housing_prepared, housing_labels)
score = grid_search.best_score_
rmse = np.sqrt(-score)
print(rmse)

print(grid_search.best_params_)
#X = imputer.fit_transform(housing_num)
#housing_tr = pd.DataFrame(X, columns=housing_num.columns,
#                          index = list(housing.index.values))
#housing_tr = pd.DataFrame(X, columns=housing_num.columns)
#housing_cat = housing["ocean_proximity"]

#encoder = LabelBinarizer()
#housing_cat_1hot = encoder.fit_transform(housing_cat)
#print(housing_cat_1hot)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  4.1min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 27.2min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 44.9min finished


NameError: name 'np' is not defined

In [3]:
score

-4875198139.7772045

In [4]:
import numpy as np

In [5]:
rmse = np.sqrt(-score)

In [6]:
rmse

69822.6191128434

In [7]:
grid_search.best_params_

{'C': 30000.0, 'kernel': 'linear'}

In [8]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, reciprocal


param_distribs = {
        'kernel': ['linear', 'rbf'],
        'C': reciprocal(20, 200000),
        'gamma': expon(scale=1.0),
    }

rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,
                                n_iter=50, cv=5, scoring='neg_mean_squared_error',
                                verbose=2, n_jobs=4, random_state=42)
rnd_search.fit(housing_prepared, housing_labels)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  6.0min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 31.6min
[Parallel(n_jobs=4)]: Done 250 out of 250 | elapsed: 54.6min finished


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=4,
          param_distributions={'kernel': ['linear', 'rbf'], 'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000006842438>, 'gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x00000000068423C8>},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score=True, scoring='neg_mean_squared_error',
          verbose=2)

In [9]:
negative_mse = rnd_search.best_score_
rmse = np.sqrt(-negative_mse)
rmse

54181.375126266546

In [10]:
rnd_search.best_params_

{'C': 157055.10989448498, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

def indices_of_top_k(arr, k):
    return np.sort(np.argpartition(np.array(arr), -k)[-k:])

class TopFeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_importances, k):
        self.feature_importances = feature_importances
        self.k = k
    def fit(self, X, y=None):
        self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)
        return self
    def transform(self, X):
        return X[:, self.feature_indices_]

In [14]:
k=5

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
forest_reg = RandomForestRegressor(random_state=42)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

NameError: name 'display_scores' is not defined

In [17]:
forest_rmse_scores

array([ 50272.17082323,  52216.74747168,  49552.61701475,  52581.63770809,
        52344.9596699 ,  48082.75161799,  48105.9404091 ,  53153.42831405,
        51987.15552132,  52324.43235668])

In [18]:
forest_rmse_scores.mean()
forest_rmse_scores.std()

1806.3422322703643

In [19]:
forest_rmse_scores.mean()

51062.18409067957

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                                scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)

In [22]:
forest_rmse_scores

array([ 50272.17082323,  52216.74747168,  49552.61701475,  52581.63770809,
        52344.9596699 ,  48082.75161799,  48105.9404091 ,  53153.42831405,
        51987.15552132,  52324.43235668])

In [23]:
forest_rmse_scores.mean()

51062.18409067957

In [24]:
forest_rmse_scores.std()

1806.3422322703643

In [25]:
feature_importances = forest_reg.feature_importances_

In [26]:
feature_importances

array([ 0.10580485,  0.10167456,  0.05234232,  0.02356162,  0.02361795,
        0.03196095,  0.01795136,  0.49153733,  0.0042198 ,  0.14050444,
        0.00051082,  0.000805  ,  0.005509  ])

In [27]:
top_k_feature_indices = indices_of_top_k(feature_importances, k)
top_k_feature_indices

array([0, 1, 2, 7, 9], dtype=int64)

In [29]:
cat_one_hot_attribs = list(cat_encoder.classes_)
attributes = num_attribs + cat_one_hot_attribs
attributes

NameError: name 'cat_encoder' is not defined

In [30]:
encoder = LabelBinarizer()
housing_cat_1hot = encoder.fit_transform(housing_cat)

NameError: name 'housing_cat' is not defined

In [31]:
housing_cat = housing["ocean_proximity"]

In [32]:
housing_cat_1hot = encoder.fit_transform(housing_cat)

In [34]:
cat_one_hot_attribs = list(encoder.classes_)
attributes = num_attribs + cat_one_hot_attribs
attributes

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 '<1H OCEAN',
 'INLAND',
 'ISLAND',
 'NEAR BAY',
 'NEAR OCEAN']

In [35]:
np.array(attributes)[top_k_feature_indices]

array(['longitude', 'latitude', 'housing_median_age', 'median_income',
       'INLAND'], 
      dtype='<U18')

In [36]:
sorted(zip(feature_importances, attributes), reverse=True)[:k]

[(0.49153733341839667, 'median_income'),
 (0.14050443900812062, 'INLAND'),
 (0.10580485061233087, 'longitude'),
 (0.10167455897627582, 'latitude'),
 (0.0523423206649304, 'housing_median_age')]

In [37]:
preparation_and_feature_selection_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k))
])

housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)

In [38]:
housing_prepared_top_k_features[0:3]

array([[ 1.27258656, -1.3728112 ,  0.34849025, -0.326196  ,  0.        ],
       [ 0.70916212, -0.87669601,  1.61811813, -0.03584338,  0.        ],
       [-0.44760309, -0.46014647, -1.95271028,  0.14470145,  0.        ]])

In [40]:
rnd_search.best_params_

{'C': 157055.10989448498, 'gamma': 0.26497040005002437, 'kernel': 'rbf'}

In [41]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(rnd_search.best_params_))
])

In [43]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(C=157055.10989448498, gamma=0.26497040005002437, kernel='rbf'))
])

In [45]:
feature_importances

array([ 0.10580485,  0.10167456,  0.05234232,  0.02356162,  0.02361795,
        0.03196095,  0.01795136,  0.49153733,  0.0042198 ,  0.14050444,
        0.00051082,  0.000805  ,  0.005509  ])

In [46]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,NEAR OCEAN
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,INLAND


In [48]:
full_pipeline

FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(steps=[('selector', DataFrameSelector(attribute_names=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'])), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='me...an_proximity'])), ('cat_encoder', LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False))]))],
       transformer_weights=None)

In [49]:
topf = TopFeatureSelector(feature_importances, k)

In [51]:
topf.fit(housing_prepared)

TopFeatureSelector(feature_importances=array([ 0.1058 ,  0.10167,  0.05234,  0.02356,  0.02362,  0.03196,
        0.01795,  0.49154,  0.00422,  0.1405 ,  0.00051,  0.0008 ,  0.00551]),
          k=5)

In [52]:
topf.fit_transform(housing_prepared)

array([[ 1.27258656, -1.3728112 ,  0.34849025, -0.326196  ,  0.        ],
       [ 0.70916212, -0.87669601,  1.61811813, -0.03584338,  0.        ],
       [-0.44760309, -0.46014647, -1.95271028,  0.14470145,  0.        ],
       ..., 
       [ 0.59946887, -0.75500738,  0.58654547, -0.49697313,  0.        ],
       [-1.18553953,  0.90651045, -1.07984112,  0.96545045,  0.        ],
       [-1.41489815,  0.99543676,  1.85617335, -0.68544764,  0.        ]])

In [54]:
svrtest = SVR(C=157055.10989448498, gamma=0.26497040005002437, kernel='rbf')

In [55]:
aaa = topf.fit_transform(housing_prepared)
svrtest.fit(aaa,housing_labels)

SVR(C=157055.10989448498, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma=0.26497040005002437, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [70]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(C=157055.10989448498, gamma=0.26497040005002437, kernel='rbf'))
])

In [73]:
prepare_select_and_predict_pipeline

Pipeline(steps=[('preparation', FeatureUnion(n_jobs=1,
       transformer_list=[('num_pipeline', Pipeline(steps=[('selector', DataFrameSelector(attribute_names=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income'])), ('imputer', Imputer(... gamma=0.26497040005002437, kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False))])

In [75]:
some_data = housing.iloc[:4]
some_labels = housing_labels.iloc[:4]

In [83]:
class addLabels(BaseEstimator, TransformerMixin):

  def __init__(self, labels):
    self.labels = labels

  def fit(self, X, y=None):
    return self

  def transform(self, X):
    return X, self.labels.reshape(-1,0)

In [84]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('addlabels', addLabels(housing_labels)),
    ('svm_reg', SVR(C=157055.10989448498, gamma=0.26497040005002437, kernel='rbf'))
])

In [87]:
cattest_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', CategoricalEncoder(encoding="onehot-dense")),
    ])

In [88]:
full_pipeline_test = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cattest_pipeline", cat_pipeline),
    ])

In [89]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline_test),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
    ('svm_reg', SVR(C=157055.10989448498, gamma=0.26497040005002437, kernel='rbf'))
])

In [93]:
prepare_select_and_predict_pipeline = Pipeline([
    ('preparation', full_pipeline),
    ('feature_selection', TopFeatureSelector(feature_importances, k)),
])
prepare_select_and_predict_pipeline.fit(housing)
some_data = housing.iloc[:4]
some_data = prepare_select_and_predict_pipeline.transform(some_data)
some_labels = housing_labels.iloc[:4]

In [94]:
print("Predictions:\t", svrtest.predict(some_data))
print("Labels:\t\t", list(some_labels))

Predictions:	 [ 153376.96410053  256317.09907971  224043.75680673  112961.03103976]
Labels:		 [103000.0, 382100.0, 172600.0, 93400.0]
