In [73]:
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from scipy.stats import skew

In [43]:
df = pd.read_csv('input/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
def mean_squared_error_(ground_truth, predictions):
    return mean_squared_error(ground_truth, predictions) ** 0.5
RMSE = make_scorer(mean_squared_error_, greater_is_better=False)

In [17]:
def data_preprocess(train, test):
    
    print(train.shape)
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    train.drop(train.index[outlier_idx], inplace = True)
    
    ntrain = train.shape[0]
    
    all_data = pd.concat((
        train.loc[:, 'MSSubClass':'SaleCondition'],
        test.loc[:, 'MSSubClass':'SaleCondition']
    ))
    
    print('Missing data before = ', all_data.isna().sum().sum())
    
    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data.drop(to_delete, axis = 1, inplace = True)
    train['SalePrice'] = np.log1p(train['SalePrice'])
    
    num_feats = all_data.dtypes[all_data.dtypes != 'object'].index
    skewed_feats = train[num_feats].apply(lambda x : skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    
    print('Missing data now = ', all_data.isna().sum().sum())
    
    X_train = all_data[:ntrain]
    X_test = all_data[ntrain:]
    y = train.SalePrice
    
    return X_train, X_test, y

In [24]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')

In [25]:
Xtrain, Xtest, ytrain = data_preprocess(train, test)

(1460, 81)
Missing data before =  13756
Missing data now =  0


In [30]:
Xtrain.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,4.110874,4.189655,9.04204,7,5,2003,2003,5.283204,706.0,0.0,...,0,0,0,1,0,0,0,0,1,0
1,3.044522,4.394449,9.169623,6,8,1976,1976,0.0,978.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,4.110874,4.234107,9.328212,7,5,2001,2002,5.09375,486.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,4.26268,4.110874,9.164401,7,5,1915,1970,0.0,216.0,0.0,...,0,0,0,1,1,0,0,0,0,0
5,3.931826,4.454347,9.555064,5,5,1993,1995,0.0,732.0,0.0,...,0,0,0,1,0,0,0,0,1,0


In [33]:
def random_forest(Xtrain, Xtest, ytrain):
    X_train, y_train = Xtrain, ytrain
    
    rf_model = RandomForestRegressor(n_jobs=1,random_state=42)
    
    param_grid = {
        'n_estimators': [500],
        'max_features': [10,15],
        'max_depth':[3,5,7]
    }
    
    model = GridSearchCV(estimator = rf_model, param_grid=param_grid, n_jobs=10, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Random forest with GridSearch...')
    print('Best params : ', model.best_params_)
    print('Best CV Score:', model.best_score_)
    
    y_pred = model.predict(Xtest)
    return y_pred, model.best_score_
    

In [62]:
# Random forest
test_predict, score = random_forest(Xtrain, Xtest, ytrain)

Random forest with GridSearch...
Best params :  {'max_depth': 7, 'max_features': 15, 'n_estimators': 500}
Best CV Score: -0.15309353985230065


In [63]:
def create_submission(model_name, predictions):
    now = datetime.datetime.now()
    file_name = 'output/' + str(now.strftime("%Y-%m-%d-%H-%M")) + '_submission_' + model_name + '.csv'
    out = pd.DataFrame(
        {
            'Id' : test['Id'].values,
            'SalePrice' : np.exp(predictions)
        }
    )
    out.to_csv(file_name, index = False)
    print('File ', file_name, ' created in output/')

In [64]:
create_submission('random_forest', test_predict)

File  output/2021-02-07-18-24_submission_random_forest.csv  created in output/


In [65]:
def model_gradient_boosting_tree(Xtrain,Xtest,ytrain):
    
    X_train = Xtrain
    y_train = ytrain 
    gbr = GradientBoostingRegressor(random_state=0)
    param_grid = {
 #       'n_estimators': [500],
 #       'max_features': [10,15],
#	'max_depth': [6,8,10],
 #       'learning_rate': [0.05,0.1,0.15],
  #      'subsample': [0.8]
    }
    model = GridSearchCV(estimator=gbr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Gradient boosted tree regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_

In [66]:
test_predict,score = model_gradient_boosting_tree(Xtrain,Xtest,ytrain)

Gradient boosted tree regression...
Best Params:
{}
Best CV Score:
0.11299763387193003


In [67]:
create_submission('gradient_boosting', test_predict)

File  output/2021-02-07-18-26_submission_gradient_boosting.csv  created in output/


In [68]:
def model_xgb_regression(Xtrain,Xtest,ytrain):
    
    X_train = Xtrain
    y_train = ytrain 
    
    xgbreg = xgb.XGBRegressor(seed=0)
    param_grid = {
#        'n_estimators': [500],
#        'learning_rate': [ 0.05],
#        'max_depth': [ 7, 9, 11],
#        'subsample': [ 0.8],
#        'colsample_bytree': [0.75,0.8,0.85],
    }
    model = GridSearchCV(estimator=xgbreg, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('eXtreme Gradient Boosting regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_

In [69]:
test_predict,score = model_xgb_regression(Xtrain,Xtest,ytrain)

eXtreme Gradient Boosting regression...
Best Params:
{}
Best CV Score:
0.12643398832521577


In [70]:
create_submission('xgb_regression', test_predict)

File  output/2021-02-07-18-30_submission_xgb_regression.csv  created in output/


In [74]:
def model_extra_trees_regression(Xtrain,Xtest,ytrain):
    
    X_train = Xtrain
    y_train = ytrain
    
    etr = ExtraTreesRegressor(n_jobs=1, random_state=0)
    param_grid = {}#'n_estimators': [500], 'max_features': [10,15,20]}
    model = GridSearchCV(estimator=etr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Extra trees regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_

In [75]:
test_predict,score = model_extra_trees_regression(Xtrain,Xtest,ytrain)

Extra trees regression...
Best Params:
{}
Best CV Score:
0.13023272257362453


In [76]:
create_submission('extra_trees_regression', test_predict)

File  output/2021-02-07-18-33_submission_extra_trees_regression.csv  created in output/
