In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn_pandas import CategoricalImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, RFE
import lightgbm as lgb
from xgboost import XGBRegressor
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
from scipy.stats import skew
from scipy.special import boxcox1p
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasRegressor # KerasRegressor also works

In [3]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [4]:
target = train['SalePrice']
train = train.drop(['Id','SalePrice', 'Utilities'], axis=1)
test = test.drop(['Id', 'Utilities'], axis=1)
ntrain = train.shape[0]
ntest = test.shape[0]
all_data = pd.concat((train, test)).reset_index(drop=True)

In [5]:
all_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,0,,,,0,12,2008,WD,Normal


In [6]:
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")

In [7]:
all_data["MiscFeature"] = all_data['MiscFeature'].fillna("None")

In [8]:
all_data["Alley"] = all_data["Alley"].fillna("None")

In [9]:
all_data["Fence"] = all_data["Fence"].fillna("None")

In [10]:
all_data["FireplaceQu"] = all_data['FireplaceQu'].fillna("None")

In [11]:
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

In [12]:
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')

In [13]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)

In [14]:
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)

In [15]:
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

In [16]:
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

In [17]:
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

In [18]:
all_data["Functional"] = all_data["Functional"].fillna("Typ")

In [19]:
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])

In [20]:
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

In [21]:
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])

In [22]:
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

In [23]:
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")

In [24]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [25]:
#MSSubClass=The building class
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
all_data['OverallCond'] = all_data['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

In [26]:
# Adding total sqfootage feature 
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [27]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)


Skew in numerical features: 



Unnamed: 0,Skew
MiscVal,21.947195
PoolArea,16.898328
LotArea,12.822431
LowQualFinSF,12.088761
3SsnPorch,11.376065
KitchenAbvGr,4.302254
BsmtFinSF2,4.146143
EnclosedPorch,4.003891
ScreenPorch,3.946694
BsmtHalfBath,3.931594


In [28]:
skewness = skewness[abs(skewness) > 0.75]
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [29]:
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

# shape        
print('Shape all_data: {}'.format(all_data.shape))

Shape all_data: (2919, 79)


In [30]:
all_data = pd.get_dummies(all_data)

In [31]:
robust = RobustScaler()
all_data = robust.fit_transform(all_data)

In [32]:
train = all_data[:ntrain]
test = all_data[ntrain:]

In [35]:
print(train.shape)
print(test.shape)

(1460, 221)
(1459, 221)


In [None]:
def build_classifier(layer1, dropout1, layer2, dropout2, layer3, dropout3):
    classifier = Sequential()
    classifier.add(Dense(layer1, activation='relu', input_dim=221))
    classifier.add(Dropout(dropout1))
    classifier.add(Dense(layer2, activation='relu'))
    classifier.add(Dropout(dropout2))
    classifier.add(Dense(layer3, activation='relu'))
    classifier.add(Dropout(dropout3))
    classifier.add(Dense(1, activation='linear')) 
    classifier.compile(optimizer='adam',loss='mean_squared_error', metrics=['mean_squared_error']) 
    return classifier


In [32]:
params = {
        #'n_estimators': [300, 500, 1000],
        'batch_size': [48, 256, 512],
        'nb_epoch': [500, 1000, 5000],
        'layer1': [111, 256, 512],
        'dropout1': [0.1, 0.3],
        'layer2': [24, 48, 111],
        'dropout2': [0.1, 0.3],
        'layer3': [6, 12, 24],
        'dropout3': [0.1, 0.3]
        }
regressor = KerasRegressor(build_fn=build_classifier)
grid = GridSearchCV(regressor, params, verbose=1, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
grid.fit(train,target)
print(grid.best_params_)
print(grid.best_score_)

Fitting 3 folds for each of 1728 candidates, totalling 5184 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   59.0s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  6.5min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed: 15.6min
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed: 39.5min
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed: 50.9min
[Parallel(n_jobs=-1)]: Done 5184 out of 5184 | elapsed: 54.7min finished
  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


{'colsample_bytree': 0.75, 'gamma': 1, 'learning_rate': 0.01, 'max_depth': 6, 'min_child_weight': 1, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.75}
-647760975.3654748


In [42]:
sub = pd.read_csv("sample_submission.csv")
print(predictions2.shape)
print(sub.shape)

sub['SalePrice'] = predictions2
sub.to_csv("submit_results.csv", index=False)
print('done')

(1459,)
(1459, 2)
done
