In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.style as style
style.use('fivethirtyeight')
import helper

In [2]:
# Five hex colours used as the default plotting palette for the notebook.
colors = [
    "#FF0B04",
    "#F1BE48",
    "#B9975B",
    "#8B5B29",
    "#524727",
]
# Build the seaborn palette object first, then install it as the default.
custom_palette = sns.color_palette(colors)
sns.set_palette(custom_palette)

In [3]:
# Load the raw housing table; the first CSV column is used as the index.
housing = pd.read_csv('Ames_Housing_Price_Data.csv', index_col=0, low_memory=False)

# Project helper that turns the raw table into a (train, test) pair.
# NOTE(review): `num_to_cat_list` presumably lists numeric codes to treat as
# categoricals, and `remove_PID=False` presumably keeps PID for the merge
# below -- confirm both in `helper`.
train, test = helper.data_processing_wrapper(
    housing,
    num_to_cat_list=['MSSubClass', 'MoSold'],
    remove_PID=False,
)

# Project helper for feature engineering; it takes and returns both splits.
train, test = helper.feature_engineering_wrapper(train, test)

# School lookup table, reduced to the join key and the one feature we keep.
schools = pd.read_csv('schoolFeatures.csv', index_col=0)
school_keep = ['PID', 'closestSchool']
schools = schools[school_keep]

# Left-join the school feature onto each split by PID, drop the rows that did
# not get a school, and renumber the index from zero.
train = (
    train.merge(schools, how='left', on='PID')
    .dropna(subset=['closestSchool'])
    .reset_index(drop=True)
)
test = (
    test.merge(schools, how='left', on='PID')
    .dropna(subset=['closestSchool'])
    .reset_index(drop=True)
)

In [4]:
# Snapshot both splits so the feature/target frames below are built from
# copies rather than from `train` / `test` directly.
train_raw, test_raw = train.copy(), test.copy()

# Columns excluded from the feature matrix: the target and the PID key.
non_feature_cols = ['SalePrice', 'PID']

X_train = train_raw.drop(columns=non_feature_cols)
X_test = test_raw.drop(columns=non_feature_cols)

# The model is fit on the natural log of SalePrice, so every score reported
# further down is measured in log-price space.
y_train = np.log(train_raw['SalePrice'])
y_test = np.log(test_raw['SalePrice'])

In [5]:
# Load the saved coefficient table (first CSV column becomes the index).
# Its 'features' column is what the notebook uses to choose model inputs.
# NOTE(review): presumably exported by a separate Lasso-fitting step -- confirm
# where 'lasso_coef.csv' comes from and that it matches this train split.
coef_df = pd.read_csv('lasso_coef.csv',index_col=0)

In [6]:
# Feature names to keep, taken from the 'features' column of the coefficient table.
selected_features = coef_df['features'].tolist()

In [7]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64 columns as numeric.
cat_feats = X_train.select_dtypes(include=['object']).columns.tolist()
num_feats = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [8]:
# One-hot encode the categorical columns and pass every other column through
# unchanged. Categories unseen during fit are encoded as all-zeros
# (handle_unknown='ignore'), so transforming the test split cannot fail on them.
#
# `sparse_threshold=0` makes the ColumnTransformer always return a dense array.
# It replaces `OneHotEncoder(sparse=False)`: that keyword was renamed to
# `sparse_output` in scikit-learn 1.2 and later removed, so passing it breaks
# on current releases, while `sparse_threshold=0` works on old and new ones.
preprocessor = ColumnTransformer(
    transformers=[('tf1', OneHotEncoder(handle_unknown='ignore'), cat_feats)],
    remainder='passthrough',
    sparse_threshold=0,
)

X_train_transformed = preprocessor.fit_transform(X_train)

# Names of the generated one-hot columns. `get_feature_names` was removed in
# scikit-learn 1.2; use its replacement when available and fall back otherwise.
encoder = preprocessor.named_transformers_['tf1']
if hasattr(encoder, 'get_feature_names_out'):
    columns_transformed = encoder.get_feature_names_out(cat_feats)
else:
    columns_transformed = encoder.get_feature_names(input_features=cat_feats)

# The passthrough block follows the encoded block and contains *every* column
# not listed in `cat_feats`, in original column order. Derive its labels from
# X_train directly instead of from the int64/float64-only `num_feats` list, so
# a column of another dtype (bool, int32, ...) cannot cause a label mismatch.
passthrough_cols = [col for col in X_train.columns if col not in cat_feats]
new_columns = list(columns_transformed) + passthrough_cols

X_train_transformed = pd.DataFrame(X_train_transformed, columns=new_columns)

# Apply the already-fitted transformer to the test split (no refitting).
X_test_transformed = pd.DataFrame(preprocessor.transform(X_test), columns=new_columns)

In [9]:
# Keep only the pre-selected features from the encoded frames.
# NOTE: this rebinds X_train / X_test to the encoded, reduced frames, so the
# dtype-partition and encoding cells above will not behave the same if re-run
# after this cell -- re-run the feature/target split cell first.
X_train = X_train_transformed.loc[:, selected_features]
X_test = X_test_transformed.loc[:, selected_features]

# XGBoost

In [10]:
# Exhaustive hyper-parameter search for the XGBoost regressor.
# 3 x 2 x 3 = 18 parameter combinations, each scored with R^2.
param_grid = {
    'n_estimators': [1, 10, 100],
    'max_depth': [20, 50],
    'eta': [0.1, 0.3, 0.5],
}

boost = xgb.XGBRegressor()

# 4-fold CV repeated 5 times with a fixed seed: 20 fits per combination.
cv = RepeatedKFold(n_splits=4, n_repeats=5, random_state=42)

# NOTE(review): n_jobs=-1 parallelises the search across all cores; XGBoost may
# also use multiple threads per fit -- check for CPU oversubscription if slow.
grid = GridSearchCV(
    estimator=boost,
    param_grid=param_grid,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Mean cross-validated score of the best combination, then the combination itself.
print(grid.best_score_, grid.best_params_, sep='\n')

0.9316554588904126
{'eta': 0.1, 'max_depth': 50, 'n_estimators': 100}


In [11]:
# R^2 (the `scoring` metric chosen for the search) of the refit best model on
# the training features, measured against log(SalePrice). Compare it with the
# cross-validated score printed by the search cell and the held-out score in
# the following cell to judge how much the model overfits.
grid.score(X_train, y_train)

0.9999678413571278

In [12]:
# R^2 of the refit best model on the held-out test features, measured against
# log(SalePrice). This is the generalisation estimate; the training-set score
# above is not.
grid.score(X_test, y_test)

0.9143127321077821