In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import SimpleImputer

In [2]:
from xgboost import XGBRegressor

In [3]:
# Locations of the training and test CSV files, relative to the working directory.
train_data_path = 'datasets/train.csv'
test_data_path = 'datasets/test.csv'
# Read both files into DataFrames (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, test_data_path)
)

In [4]:
# List every column label of the raw training frame.
print(train_data.columns)

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [5]:
# Keep an untouched copy of the test frame; its Id column is needed again
# when the submission file is assembled.
test_bkp = test_data.copy()
# Regression target, extracted before SalePrice is removed from the features.
train_y = train_data['SalePrice']

In [6]:
# Strip the row identifier from both frames, and the target from the training
# frame, so that neither ends up among the model features.
train_data = train_data.drop(columns=['Id', 'SalePrice'])
test_data = test_data.drop(columns=['Id'])

In [7]:
# Show the dtype of every remaining feature column (object vs. numeric).
train_data.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
                  ...   
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object


In [8]:
# Object-typed (categorical) training columns that contain at least one null.
# Only the training frame is inspected here; nulls that occur solely in the
# test frame are not detected by this check.
cols_with_missing = [
    name
    for name in train_data.columns
    if train_data[name].dtype == 'object' and train_data[name].isnull().any()
]
print(cols_with_missing)

['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [9]:
# Discard the incomplete categorical columns from both frames so the two
# feature sets stay in step.
train_data = train_data.drop(columns=cols_with_missing)
test_data = test_data.drop(columns=cols_with_missing)

In [10]:
# Every remaining object-typed column, as seen in the test frame.
# NOTE: despite the variable name, no cardinality threshold is applied here --
# all object columns are kept, whatever their number of distinct values.
low_cardinality_cols = [
    name
    for name in test_data.columns
    if test_data[name].dtype == 'object'
]
print(low_cardinality_cols)

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'Heating', 'HeatingQC', 'CentralAir', 'KitchenQual', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']


In [11]:
# Columns whose dtype in the test frame is int64 or float64.
numeric_cols = [
    name
    for name in test_data.columns
    if test_data[name].dtype in ('int64', 'float64')
]
print(numeric_cols)

['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


In [12]:
# Final feature list: categorical columns first, then the numeric ones.
my_cols = [*low_cardinality_cols, *numeric_cols]

In [13]:
# Restrict both frames to the chosen feature columns, in the same order.
train_predictors, test_predictors = train_data[my_cols], test_data[my_cols]

In [14]:
# Spot-check the dtypes of 10 randomly chosen predictor columns.
# No random_state is passed, so the sampled columns can differ between runs.
train_predictors.dtypes.sample(10)

BsmtHalfBath       int64
BsmtFinSF2         int64
MasVnrArea       float64
HalfBath           int64
LandSlope         object
ExterCond         object
GarageArea         int64
Neighborhood      object
GrLivArea          int64
EnclosedPorch      int64
dtype: object

In [15]:
# One-hot encode the categorical columns of each frame independently.
one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)

# Give the test matrix exactly the training columns, in training order:
#   * dummy columns that exist only in the test frame are dropped;
#   * dummy columns for categories that never occur in the test frame are
#     created and filled with 0 ("category not present").
# The previous left-align left those new columns as NaN, so the imputer later
# replaced them with the training-column mean -- a fractional value that is
# not a valid one-hot indicator. `fill_value` only applies to newly created
# columns; NaNs already present in numeric features are left for the imputer.
final_train = one_hot_encoded_training_predictors
final_test = one_hot_encoded_test_predictors.reindex(
    columns=final_train.columns, fill_value=0
)

In [16]:
# Dimensions (rows, columns) of the encoded training matrix.
final_train.shape

(1460, 218)

In [17]:
# Dimensions of the encoded test matrix; the column count should match the training matrix.
final_test.shape

(1459, 218)

In [18]:
# Learn the fill values from the training matrix only (SimpleImputer's default
# strategy), then apply the same fitted imputer to both matrices.
# NOTE: from here on `train_data` and `test_data` are rebound to the imputer's
# array output and no longer refer to the DataFrames loaded earlier.
my_imputer = SimpleImputer()
my_imputer.fit(final_train)
train_data = my_imputer.transform(final_train)
test_data = my_imputer.transform(final_test)

In [19]:
# Compare the container types before and after imputation: the imputer output
# versus the encoded frame it was fitted on.
print(type(train_data))
print(type(final_train))

<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>


In [20]:
# Dimensions of the imputed training matrix.
train_data.shape

(1460, 218)

In [21]:
# Dimensions of the imputed test matrix.
test_data.shape

(1459, 218)

In [23]:
#train_X1, test_X1, train_y1, test_y1 = train_test_split(train_data, train_y, test_size=0.25)

In [48]:
# Gradient-boosted regressor with a fixed number of boosting rounds.
xg_model = XGBRegressor(
    n_estimators=115,
    learning_rate=0.05,
    n_jobs=8,
)
# Early-stopping variant against a hold-out split, kept for reference:
#xg_model.fit(train_data,train_y,early_stopping_rounds=5,eval_set=[(test_X1,test_y1)],verbose=False)
# Fit on the complete training matrix; no validation split is held out here.
xg_model.fit(train_data, train_y, verbose=False)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=115,
       n_jobs=8, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [49]:
# Predict the target for every row of the imputed test matrix.
test_preds = xg_model.predict(test_data)

In [51]:
# Pair each original test Id (from the untouched backup) with its prediction
# and write the result as a two-column CSV without the index.
submission_columns = {
    'Id': test_bkp['Id'],
    'SalePrice': test_preds,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission4.csv', index=False)

In [52]:
#pred = xg_model.predict(test_X1)
#mean_absolute_error(test_y1,pred)

In [53]:
#pred = xg_model.predict(test_X1)
#mean_absolute_error(test_y1,pred)