In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn_pandas import CategoricalImputer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel, RFE
import lightgbm as lgb
from xgboost import XGBRegressor
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
from scipy.stats import skew
from scipy.special import boxcox1p

In [36]:
train = pd.read_csv("./train.csv")
test = pd.read_csv("./test.csv")

In [37]:
target = train['SalePrice']
train = train.drop(['Id','SalePrice', 'Utilities'], axis=1)
test = test.drop(['Id', 'Utilities'], axis=1)
ntrain = train.shape[0]
ntest = test.shape[0]
all_data = pd.concat((train, test)).reset_index(drop=True)

In [38]:
all_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,,Reg,Lvl,Inside,Gtl,...,0,0,,,,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,,Reg,Lvl,FR2,Gtl,...,0,0,,,,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,,IR1,Lvl,Inside,Gtl,...,0,0,,,,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,,IR1,Lvl,Corner,Gtl,...,0,0,,,,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,,IR1,Lvl,FR2,Gtl,...,0,0,,,,0,12,2008,WD,Normal


In [39]:
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")

In [40]:
all_data["MiscFeature"] = all_data['MiscFeature'].fillna("None")

In [41]:
all_data["Alley"] = all_data["Alley"].fillna("None")

In [42]:
all_data["Fence"] = all_data["Fence"].fillna("None")

In [43]:
all_data["FireplaceQu"] = all_data['FireplaceQu'].fillna("None")

In [44]:
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(lambda x: x.fillna(x.median()))

In [45]:
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')

In [46]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)

In [47]:
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)

In [48]:
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

In [49]:
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

In [50]:
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

In [51]:
all_data["Functional"] = all_data["Functional"].fillna("Typ")

In [52]:
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])

In [53]:
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

In [54]:
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])

In [55]:
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

In [56]:
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")

In [57]:
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)[:30]
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head(20)

Unnamed: 0,Missing Ratio


In [58]:
#MSSubClass=The building class
all_data['MSSubClass'] = all_data['MSSubClass'].apply(str)


#Changing OverallCond into a categorical variable
all_data['OverallCond'] = all_data['OverallCond'].astype(str)


#Year and month sold are transformed into categorical features.
all_data['YrSold'] = all_data['YrSold'].astype(str)
all_data['MoSold'] = all_data['MoSold'].astype(str)

In [59]:
# Adding total sqfootage feature 
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [60]:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

# Check the skew of all numerical features
skewed_feats = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")
skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness.head(10)


Skew in numerical features: 



Unnamed: 0,Skew
MiscVal,21.947195
PoolArea,16.898328
LotArea,12.822431
LowQualFinSF,12.088761
3SsnPorch,11.376065
KitchenAbvGr,4.302254
BsmtFinSF2,4.146143
EnclosedPorch,4.003891
ScreenPorch,3.946694
BsmtHalfBath,3.931594


In [61]:
skewness = skewness[abs(skewness) > 0.75]
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #all_data[feat] += 1
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [62]:
cols = ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold')
# process columns, apply LabelEncoder to categorical features
for c in cols:
    lbl = LabelEncoder() 
    lbl.fit(list(all_data[c].values)) 
    all_data[c] = lbl.transform(list(all_data[c].values))

# shape        
print('Shape all_data: {}'.format(all_data.shape))

Shape all_data: (2919, 79)


In [63]:
all_data = pd.get_dummies(all_data)

In [64]:
train = all_data[:ntrain]
test = all_data[ntrain:]

In [65]:
train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,10,5.831328,19.212182,1,1,3,0,2.440268,4,14.187527,...,0,0,0,1,0,0,0,0,1,0
1,5,6.221214,19.712205,1,1,3,0,2.259674,7,14.145138,...,0,0,0,1,0,0,0,0,1,0
2,10,5.91494,20.347241,1,1,0,0,2.440268,4,14.184404,...,0,0,0,1,0,0,0,0,1,0
3,11,5.684507,19.691553,1,1,0,0,2.440268,4,14.047529,...,0,0,0,1,1,0,0,0,0,0
4,10,6.314735,21.32516,1,1,0,0,2.602594,4,14.182841,...,0,0,0,1,0,0,0,0,1,0


In [70]:
params = {
    #'application': 'binary', # for binary classification
#     'num_class' : 1, # used for multi-classes
    'boosting': 'gbdt', # traditional gradient boosting decision tree
    'num_iterations': 100, 
    'learning_rate': 0.1,
    'num_leaves': 16,
    'device': 'cpu', # you can use GPU to achieve faster learning
    'max_depth': -1, # <0 means no limit
    'max_bin': 510, # Small number of bins may reduce training accuracy but can deal with over-fitting
    'lambda_l1': 1, # L1 regularization
    'lambda_l2': 3, # L2 regularization
    'metric' : 'binary_error',
    'subsample_for_bin': 200, # number of samples for constructing bins
    'subsample': 0.75, # subsample ratio of the training instance
    'colsample_bytree': 0.65, # subsample ratio of columns when constructing the tree
    'min_split_gain': 0.2, # minimum loss reduction required to make further partition on a leaf node of the tree
    'min_child_weight': 1, # minimum sum of instance weight (hessian) needed in a leaf
    'min_child_samples': 6# minimum number of data needed in a leaf
}

# Initiate classifier to use
mdl = lgb.LGBMRegressor(boosting_type= 'gbdt',
          n_estimators = 1000,
          objective = 'regression', 
          n_jobs = -1, 
          silent = True,
          max_depth = params['max_depth'],
          max_bin = params['max_bin'], 
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'], 
          min_split_gain = params['min_split_gain'], 
          min_child_weight = params['min_child_weight'], 
          min_child_samples = params['min_child_samples'],
          lambda_l1 = params['lambda_l1'],
          lambda_l2 = params['lambda_l2'],
          colsample_bytree = params['colsample_bytree'],
          num_leaves = params['num_leaves'],
          learning_rate = params['learning_rate'])

# To view the default model parameters:
mdl.get_params().keys()

dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'importance_type', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'max_bin', 'lambda_l1', 'lambda_l2'])

In [71]:
mdl.fit(train,target)
predictions = mdl.predict(test)

In [26]:
param_grid = {
    'learning_rate': [0.1, 0.3, 0.5],
    #'n_estimators': [8,16,24, 100, 200, 300, 500, 1000],
    'num_leaves': [8,12,14,16], # large num_leaves helps improve accuracy but might lead to over-fitting
    #'boosting_type' : ['gbdt'], # for better accuracy -> try dart
    #'objective' : ['binary'],
    #'max_bin':[255, 510], # large max_bin helps improve accuracy but might slow down training progress
    #'random_state' : [500],
    'colsample_bytree' : [0.5, 0.55, 0.6, 0.65],
    'subsample' : [0.5,0.65,0.75,0.85],
    #'reg_alpha' : [1,1.2],
    #'reg_lambda' : [1,1.2,1.4],
    'lambda_l1' : [0,0.5,1],
    'lambda_l2' : [2,3,4],
    'min_split_gain' : [0.1,0.2,0.3],
    'min_child_samples': [5,6,7,8]
    }

grid = GridSearchCV(mdl, param_grid, verbose=1, cv=3, scoring="neg_mean_squared_error", n_jobs=-1)
# Run the grid
grid.fit(train, target)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

Fitting 3 folds for each of 13824 candidates, totalling 41472 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   45.0s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 1218 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 1768 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 2418 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done 4018 tasks      | elapsed: 14.2min
[Parallel(n_jobs=-1)]: Done 4968 tasks      | elapsed: 17.4min
[Parallel(n_jobs=-1)]: Done 6018 tasks      | elapsed: 20.9min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed: 24.8min
[Parallel(n_jobs=-1)]: Done 8418 tasks      | elapsed: 29.1min
[Parallel(n_jobs=-1)]: Done 9768 tasks      | elapsed: 33.6min
[Parallel(n_jobs=-1)]: Done 11218 tasks      

{'colsample_bytree': 0.65, 'lambda_l1': 1, 'lambda_l2': 3, 'learning_rate': 0.1, 'min_child_samples': 6, 'min_split_gain': 0.2, 'num_leaves': 16, 'subsample': 0.75}
-698641012.4042611


In [34]:
sorted(SCORERS.keys())

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'brier_score_loss',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'mutual_info_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'v_measure_score']

In [72]:
sub = pd.read_csv("sample_submission.csv")
print(predictions.shape)
print(sub.shape)

sub['SalePrice'] = predictions
sub.to_csv("submit_results.csv", index=False)
print('done')

(1459,)
(1459, 2)
done
