In [98]:
import numpy as np
import pandas as pd
import scipy.stats as st
from sklearn import preprocessing, ensemble, model_selection

In [99]:
# cast certain columns to float64 since sklearn treats int as categorical
def cast_to_float(df, col_names):
    for col in col_names:
        df[col] = df[col].astype(float, copy=False)

In [100]:
def encode_onehot(df):
    categorical_col_names = df.dtypes[df.dtypes == 'object'].index.values
    return pd.get_dummies(df, columns=categorical_col_names)

In [101]:
# Load data
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

In [102]:
# prepare training set
df_X_train = df_train.drop('SalePrice', axis=1)
series_y_train = df_train['SalePrice'].astype(float)

# cast type of some int64 variables to float to avoid treatment as categorical
# need_float_col_names = ['LotArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
#                        'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 
#                        'ScreenPorch', 'PoolArea', 'MiscVal']
# cast_to_float(df_X_train, need_float_col_names)

# one hot encoding of categorical features
df_X_train = encode_onehot(df_X_train)
df_test = encode_onehot(df_test)

# creating matrices for sklearn
X_train = df_X_train.values.astype(float)
y_train = series_y_train.values.astype(float)
X_test = df_test.values

In [103]:
# prepare tuning params for grid search
nums_trees = [x for x in range(200, 2600, 100)]
loss_criteria = ['mse', 'mae']
feature_fractions = [0.1, 0.2, 0.3]
min_node_sizes = [x for x in range(5, 30, 5)]
tuning_params = {'n_estimators': nums_trees,
                 'criterion': loss_criteria,
                 'max_features': feature_fractions,
                 'min_samples_leaf': min_node_sizes}

In [104]:
# make a grid
clf = model_selection.GridSearchCV(ensemble.RandomForestRegressor(oob_score=True), tuning_params, cv=10)
clf.fit(X_train, y_train)
print("Best parameters set found on training set:")
print()
print(clf.best_params_)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').