# RF including Feature Engineering

In [148]:
import pandas as pd
import numpy as np
from helper import * 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

In [149]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 200)

# Import data and clean with wrapper

In [150]:
# importing data
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0,low_memory = False)
# data processing
train, test = helper.data_processing_wrapper(housing,
                                               num_to_cat_list = ['MSSubClass','MoSold'],
                                             remove_PID = False
                                        )

# feature engineering wrapper
train, test = feature_engineering_wrapper(train, test)

# importing school feature
schools = pd.read_csv('schoolFeatures.csv',index_col = 0)
school_keep = [
    'PID',
    'closestSchool'
]
schools = schools[school_keep]

# merge school feature onto original data set.
train = train.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')
test = test.merge(schools, how = 'left', left_on = 'PID', right_on = 'PID')

train = train.dropna(subset=['closestSchool'])
train = train.reset_index(drop=True)

test = test.dropna(subset=['closestSchool'])
test = test.reset_index(drop=True)

In [151]:
#housing started out with 81 columns (2580 rows)
#feature engineering - ended up with 129 columns (including PID and SalePrice)-- (added 48 new columns) (2477 rows between train and test)

# Split into predictors and target

In [152]:
train_raw = train.copy()
test_raw = test.copy()

train_X = train_raw.drop(['SalePrice','PID'],axis='columns')
train_y = np.log(train_raw['SalePrice'])
test_X = test_raw.drop(['SalePrice','PID'],axis='columns')
test_y = np.log(test_raw['SalePrice'])

In [153]:
train_y_log = train_y.rename('LogSalePrice')
test_y_log = test_y.rename('LogSalePrice')

In [154]:
cat_feats = train_X.select_dtypes(['object']).columns.to_list()
num_feats = train_X.select_dtypes(['int','float']).columns.to_list()

In [155]:
len(cat_feats) # 32 categorical features (43 originally in housing dataset)
len(num_feats) #95 numeric features (38 originally)

95

# Dummify 

In [156]:
#Preprocessing / Dummification
preprocessor = ColumnTransformer(transformers=[
    ('tf1',OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_feats)],remainder='passthrough')

train_X_transformed = preprocessor.fit_transform(train_X)

In [157]:
#Get one-hot encoded column names 
columns_transformed = preprocessor.named_transformers_['tf1'].get_feature_names(input_features= cat_feats)
new_columns = list(columns_transformed)+num_feats

#Place one-hot encoded train X into dataframe 
train_X_transformed = pd.DataFrame(train_X_transformed,columns=new_columns)

#Repeat for test X 
test_X_transformed = preprocessor.transform(test_X)
test_X_transformed = pd.DataFrame(test_X_transformed,columns=new_columns)

# Filter down Features Based off of LASSO 

In [158]:
#Features selected by LASSO model (non-zero coefficients)
coef_df = pd.read_csv('lasso_coef.csv',index_col=0) #Hayden shared this on Slack

In [159]:
selected_features = list(coef_df['features'])

In [160]:
print('Number of LASSO selected features: '+ str(len(selected_features)))

Number of LASSO selected features: 140


In [161]:
print('Number of total features after preprocessing: ' + str(len(train_X_transformed.columns.to_list())))

Number of total features after preprocessing: 323


In [162]:
#Filter dataframes down to the select_features 
train_X= train_X_transformed[selected_features]
test_X = test_X_transformed[selected_features]

# Random Forest Modeling 

In [163]:
#Modeling 

In [164]:
#1. No Tuning

In [165]:
rf = RandomForestRegressor(random_state=0)

In [166]:
rf.fit(train_X, train_y_log)

RandomForestRegressor(random_state=0)

In [167]:
print('Cross Val score: ', cross_val_score(rf, train_X, train_y_log, cv=3))
print('Cross Val score mean: ', cross_val_score(rf, train_X, train_y_log, cv=3).mean())
print('Train score: ',rf.score(train_X,train_y_log))
print('Test score: ',rf.score(test_X,test_y_log))

Cross Val score:  [0.92803767 0.92461461 0.9188618 ]
Cross Val score mean:  0.9238380290736682
Train score:  0.98945671606562
Test score:  0.8984127748003494


In [None]:
#2. With Tuning 

In [142]:
# Number of trees in random forest
n_estimators = [100,200,400,600,1000]
# Number of features to consider at every split
max_features = list(range(10,140,20)) + ['auto','sqrt']
# Maximum number of levels in tree
max_depth = list(range(10,111,10)) + ['None']
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 10]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [143]:
random_grid

{'n_estimators': [100, 200, 400, 600, 1000],
 'max_features': [10, 30, 50, 70, 90, 110, 130, 'auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 'None'],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 10],
 'bootstrap': [True, False]}

In [None]:
rf = RandomForestRegressor()