In [1]:
# Dependencies

# Data Processing and Exploration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical Exploration Library
import scipy.stats as stats
from minepy import MINE

# Feature Engineering
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import preprocessing

# Modeling
import xgboost as xgb
from sklearn import linear_model
from sklearn import model_selection
from sklearn import grid_search
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor

# Self-Implemented
import preprocessing as PRE
import modeling as MOD



In [2]:
# Input file names; only the training file is loaded in this cell.
file1, file2 = 'train.csv', 'test.csv'
df = PRE.get_data(file1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df_missing_num['MasVnrArea']= df_missing_num['MasVnrArea'].fillna(0)


In [3]:
# First split: (train_i, train_l) feed the hyperparameter grid search and the
# fitting of the base models; the other two return values (a, b) are joined
# back together column-wise so that they can be split a second time.
# NOTE(review): presumably `frac` is the held-out fraction and a/b are the
# held-out inputs/labels -- confirm against PRE.split_train_test.
train_i, train_l, a, b = PRE.split_train_test(df, frac=0.33)
ab = pd.concat([a, b], axis=1)

# Second split: (cv_i, cv_l) are used to train the stacking meta-model on the
# base models' predictions; (test_i, test_l) are used for the final scoring.
cv_i, cv_l, test_i, test_l = PRE.split_train_test(ab, frac=0.5)

In [4]:
# Hyperparameter search spaces: one (estimator, grid) pair per base model.

# Random Forest
rf_grid = RandomForestRegressor()
params_rf = {
    'n_estimators': (16, 18, 20, 22, 24),
    'min_samples_split': (2, 3, 4, 5, 6),
}

# Lasso Regression
# NOTE(review): `normalize` is no longer accepted by Lasso/Ridge in recent
# scikit-learn releases; this grid assumes the older release that still ships
# `sklearn.grid_search` -- confirm before upgrading.
lasso_grid = linear_model.Lasso()
params_lasso = {
    'alpha': (0.00005, 0.0001, 0.0005, 0.05),
    'fit_intercept': (True, False),
    'normalize': (True, False),
}

# Ridge Regression (searches exactly the same space as Lasso)
ridge_grid = linear_model.Ridge()
params_ridge = dict(params_lasso)

# Extreme Gradient Boosting
xgboost_grid = xgb.XGBRegressor()
params_xgboost = {
    'max_depth': (2, 3, 4),
    'n_estimators': (100, 125),
    'min_child_weight': (2, 3, 4),
    'learning_rate': (0.1, 0.2),
}

In [5]:
# The grid search below is disabled by wrapping it in a string literal; the
# next cell assigns hard-coded hyperparameter dicts to the same four names
# (rf, lasso, ridge, xgboost).
# NOTE(review): as the cell's only expression, the string is echoed as output.
'''
# Get best hyperparameters for each model
rf = MOD.get_params(rf_grid, params_rf, train_i, train_l)
lasso = MOD.get_params(lasso_grid, params_lasso, train_i, train_l)
ridge = MOD.get_params(ridge_grid, params_ridge, train_i, train_l)
xgboost =  MOD.get_params(xgboost_grid, params_xgboost, train_i, train_l)
'''

'\n# Get best hyperparameters for each model\nrf = MOD.get_params(rf_grid, params_rf, train_i, train_l)\nlasso = MOD.get_params(lasso_grid, params_lasso, train_i, train_l)\nridge = MOD.get_params(ridge_grid, params_ridge, train_i, train_l)\nxgboost =  MOD.get_params(xgboost_grid, params_xgboost, train_i, train_l)\n'

In [6]:
# Hard-coded hyperparameters for each base model.
# NOTE(review): presumably the result of an earlier run of the (now disabled)
# grid search -- confirm before relying on them.
rf = {'min_samples_split': 5, 'n_estimators': 22}
lasso = {'alpha': 5e-05, 'fit_intercept': True, 'normalize': True}
ridge = {'alpha': 0.0005, 'fit_intercept': False, 'normalize': True}
xgboost = {
    'learning_rate': 0.1,
    'max_depth': 4,
    'min_child_weight': 2,
    'n_estimators': 125,
}

# Fresh, unfitted estimators configured with those hyperparameters.
# NOTE(review): no random_state is passed, so the random forest is not
# reproducible from run to run.
rf_base = RandomForestRegressor(**rf)
lasso_base = linear_model.Lasso(**lasso)
ridge_base = linear_model.Ridge(**ridge)
xgboost_base = xgb.XGBRegressor(**xgboost)

# Fixed order used whenever the base models are handled as a group
base_models = [rf_base, lasso_base, ridge_base, xgboost_base]


In [7]:
# Fit each base model on the first training split
rf_base.fit(train_i, train_l)
lasso_base.fit(train_i, train_l)
ridge_base.fit(train_i, train_l)
xgboost_base.fit(train_i, train_l)

# Predict on the cv split (not the split the models were just fitted on);
# each model contributes one single-column frame.
cv1, cv2, cv3, cv4 = (
    pd.DataFrame(estimator.predict(cv_i))
    for estimator in (rf_base, lasso_base, ridge_base, xgboost_base)
)

# Side by side, the four prediction columns form the meta-model's training input
train_stack = pd.concat([cv1, cv2, cv3, cv4], axis=1)

In [8]:
# Stacking meta-model: a Ridge regressor fitted on the base models' cv-split
# predictions (train_stack) against the cv labels.
# NOTE(review): it reuses the `ridge` hyperparameters set for the base Ridge
# model rather than ones tuned on the stacked predictions -- confirm intended.
meta_model = linear_model.Ridge(**ridge)
meta_model.fit(train_stack, cv_l)

Ridge(alpha=0.0005, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001)

In [9]:
# Base-model predictions on the test split, one single-column frame per model
test1, test2, test3, test4 = (
    pd.DataFrame(estimator.predict(test_i))
    for estimator in (rf_base, lasso_base, ridge_base, xgboost_base)
)

# Same column layout as train_stack, so the fitted meta-model can consume it
test_stack = pd.concat([test1, test2, test3, test4], axis=1)

In [10]:
# Final test-split score of every base model, in base_models order
# (Random Forest, Lasso, Ridge, XGBoost)
[MOD.score_final(model, test_i, test_l) for model in base_models]

[0.16116750674362815,
 0.14527243398219661,
 0.1517558757389291,
 0.13251927673851879]

In [11]:
# Final test-split score of the stacked meta-model (Ridge), which takes the
# base models' test predictions (test_stack) as its input
MOD.score_final(meta_model, test_stack, test_l)

0.13335923505065383

In [15]:
	# Splits the columns that contain missing values out of the raw frame and
	# partitions them into continuous (num) and categorical (cat) groups.
	# NOTE: this rebinds `df`, replacing the frame loaded with PRE.get_data
	# earlier in the notebook.
	df = PRE.get_df('train.csv')

	# A frame without the target column is treated as the test set
	train = 'SalePrice' in df.columns
	test = not train

	# Drop features missing more than 10% of rows
	drop = df.isnull().sum() / df.shape[0]
	drop_list = list(drop[drop > 0.1].index)
	df = df.drop(drop_list, axis=1)

	# Separate remaining features with missing rows from df.
	# .copy() makes df_missing an independent frame instead of a slice of df,
	# so assigning to its columns later cannot raise SettingWithCopyWarning.
	keep = df.isnull().sum() / df.shape[0]
	keep_list = keep[keep > 0].index
	df_missing = df[keep_list].copy()
	df = df.drop(keep_list, axis=1)

	# Label continuous (num) columns; the list differs between train and test
	if train:
		num = ['GarageYrBlt', 'MasVnrArea']
	else:
		num = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'TotalBsmtSF', 'MasVnrArea']

	# Label categorical (cat) columns: every missing-value column not in num
	cat = [col for col in df_missing.columns if col not in num]

	# Separate cont and cat, again as independent copies so that filling
	# missing values in either frame is a plain assignment rather than a
	# write to a slice of df_missing.
	df_missing_num = df_missing[num].copy()
	df_missing_cat = df_missing[cat].copy()

In [16]:
# Display the missing-value columns that were classified as categorical
cat

['MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond']