In [53]:
import pandas as pd
import numpy as np
import multiprocessing as mp
cores_to_leave = 2
search = True # Boolean for if we run a random search
run = False # Boolean for model determining runs

In [54]:
housing = pd.read_csv('./housing.csv')

# Preprocessing

In [55]:
# Split into test and train sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(housing.copy(), test_size=0.2) # Works on dataframes
y_train = train['median_house_value'].values # Train targets
y_test = test['median_house_value'].values # Test targets
train.drop(['median_house_value'], axis=1, inplace=True) # Train: Leave only features
test.drop(['median_house_value'], axis=1, inplace=True) # Test: Leave only features
train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
6167,-117.93,34.09,35.0,782.0,153.0,499.0,163.0,4.2062,<1H OCEAN
8421,-118.35,33.93,26.0,3156.0,857.0,2394.0,787.0,3.01,<1H OCEAN
10790,-117.91,33.63,32.0,1122.0,233.0,557.0,223.0,3.5388,<1H OCEAN
1114,-121.59,39.78,18.0,945.0,205.0,385.0,207.0,2.1838,INLAND
8786,-118.34,33.78,25.0,11016.0,1626.0,4168.0,1584.0,8.1782,NEAR OCEAN


In [56]:
# Make an object for selecting specific variables of a DataFrame
# Will be useful for building pipelines later as we can just input a dataframe
from sklearn.base import BaseEstimator, TransformerMixin

# Class MUST have a fit and transform function as this is what functions sklearn will look for in the pipeline
# Thus we must watch the syntax
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributes): # attributes represents a list of columns you want to select
        self.attributes = attributes 
    # fit: This would be any transformations you want to perform (in this case we are just selecting columns)
    # Note that you must include the y=None to match sklearn syntax
    def fit(self, X, y=None): 
        return self
    # transform: return selected columns
    def transform(self, X): # X is a dataframe input
        return X[self.attributes].values # Get numpy array of selected attributes

In [57]:
# First step, we want to impute the missing categorical values
# NOTE: must use train data ONLY to impute, don't want to touch test data
from sklearn.impute import SimpleImputer
numeric_cols = train.select_dtypes([np.number]).columns.tolist() # Get a list of numeric columns
selector_num = DataFrameSelector(numeric_cols) # Object for selecting numeric columns
train_impute = selector_num.transform(train) # Transform the data
imputer = SimpleImputer(missing_values=np.nan, strategy='median') # Make the imputer - will use on test data later
imputer.fit(train_impute) # Fit the data
X_impute = imputer.transform(train_impute) # Transform the data

In [58]:
# One hot encode the categorical data
from sklearn.preprocessing import OneHotEncoder
categorical_cols = train.select_dtypes(['object']).columns.tolist() # Categorical columns
selector_cat = DataFrameSelector(categorical_cols) # Initialize selector
train_onehot = selector_cat.transform(train) # Transform the data
onehot = OneHotEncoder() # Generate OneHotEncoder object
onehot.fit(train_onehot) # Fit
X_onehot = onehot.transform(train_onehot) # Transform the data

In [59]:
# Standard scaler on numerical data
# Note: Don't include targets here!
# Per: https://stackoverflow.com/questions/26584971/how-to-not-standarize-target-data-in-scikit-learn-regression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
x_scaler = StandardScaler() # initialize object
x_scaler.fit(X_impute) # Fit imputed data
X_scaled = x_scaler.transform(X_impute)

# Pipeline Structure
* I think all of the above can be done in a pipeline structure
* Basically performing all these steps in a defined sequence
* Good to document it out once though just for practice

In [60]:
# Create preprocessing pipelines
from sklearn.pipeline import Pipeline, FeatureUnion

# Numerical feature pipe
numerical_pipe = Pipeline([
    ('num_select', DataFrameSelector(numeric_cols)),
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Categorical feature pipe
categorical_pipe = Pipeline([
    ('cat_select', DataFrameSelector(categorical_cols)),
    ('onehot', OneHotEncoder())
])

# Combine into pre-processing pipeline single object
prep_pipe = FeatureUnion([
    ('numerical', numerical_pipe),
    ('categorical', categorical_pipe)
])

# NOTE: Can access elements of a feature union for random/grid search per https://www.kaggle.com/edolatabadi/feature-union-with-grid-search
# Can then run everything through these at once!

# Model Selection
* Select sample models
* Create a scoring object 
* Evaluate and see which models perform best

In [61]:
# Models to test
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
models = {'RandomForest':RandomForestRegressor(),
         'AdaBoost':AdaBoostRegressor(),
         'GradBoost':GradientBoostingRegressor()}

In [62]:
# Fit pipeline on training data
prep_pipe.fit(train, y_train)
X_train = prep_pipe.transform(train)

In [63]:
# Iterate over models
from sklearn.model_selection import cross_val_score
run = False # Boolean for this cell

# Function for printing out scores
def print_scores(score_array):
    print('Errors: ', score_array)
    print('Mean Error: ', score_array.mean())
    print('Std: ', score_array.std())
    
if run:
    for k, model in models.items():
        print('Model: ' + k)
        m_scores = cross_val_score(model, X_train, y_train, # Have to flatten transformed y_data
                                  scoring='neg_mean_squared_error', cv=10)
        m_scores = np.sqrt(-m_scores) # Because using negative mean squared error
        print_scores(m_scores)

# Randomized Search
* RandomForest was the best performing
* Let's use a randomized search to find best parameters

In [64]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Need to make a model pipeline
full_pipe = Pipeline([
    ('prep', prep_pipe),
    ('reg', RandomForestRegressor())
])
full_pipe

Pipeline(steps=[('prep',
                 FeatureUnion(transformer_list=[('numerical',
                                                 Pipeline(steps=[('num_select',
                                                                  DataFrameSelector(attributes=['longitude',
                                                                                                'latitude',
                                                                                                'housing_median_age',
                                                                                                'total_rooms',
                                                                                                'total_bedrooms',
                                                                                                'population',
                                                                                                'households',
                                                     

In [None]:
# Set param grid
param_grid = {
    'prep__numerical__scaler':[StandardScaler(), MinMaxScaler()],
    'reg__bootstrap':[True, False],
    'reg__n_estimators':[10, 50, 100, 200, 300],
    'reg__max_features':[2, 4, 6, 8, 10]
}

if search:
    # Run Search
    searcher = GridSearchCV(full_pipe, param_grid, cv=20, n_jobs=4)
    search_res = searcher.fit(train, y_train)

In [None]:
# What are our best parameters?
print('Best Params: ', search_res.best_params_)
all_results = pd.DataFrame.from_dict(search_res.cv_results_) # Dataframe of all results
all_results.to_csv('./grid_search_results')
print(all_results.head())

# Set best model params
# Note: could also use search_res.best_estimator_ to get the best performing model
full_pipe = full_pipe.set_params(**search_res.best_params_) # Set pipeline with best parameters

In [None]:
# Now, let's do another cross val with out best model
scores = cross_val_score(full_pipe, train, y_train, # Have to flatten transformed y_data
                          scoring='neg_mean_squared_error', cv=10)
scores = np.sqrt(-scores)
print_scores(scores)