In [96]:
import pandas as pd

In [212]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
y_train = train['SalePrice']
x_train = train.drop(['SalePrice'],axis=1)

Replace Nan with most frequent value of the feature for train and test

In [213]:
import numpy as np
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy="most_frequent")
x_train = imp.fit_transform(x_train)
test_id = test['Id']
test = imp.fit_transform(test)

### Applied ordinal encoder on train and test


In [194]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
onehot = OneHotEncoder()
ordinal = OrdinalEncoder()
hotVtrain = ordinal.fit_transform(x_train)
hotVtest = ordinal.fit_transform(test)

### Applied Random Forest Regression

In [195]:
from sklearn.ensemble import RandomForestRegressor

In [196]:
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(hotVtrain, hotVtest)

RandomForestRegressor(n_estimators=1000, random_state=42)

### Saved to CSV
and Converted the numpy hotVtest to pandas dataframe

In [150]:
predicted = rf.predict(hotVtest)
#np.savetxt("output.csv",predicted_test,delimiter=",", header="Id, SalePrice")
predicted_pandas = pd.DataFrame(predicted)

frames = [test_id, predicted_pandas]
result = pd.concat(frames, axis=1)
result.columns = ['Id','SalePrice']
result.to_csv("output.csv",index=False)

### r2 Score is good
the Kaggle score is 0.149

In [132]:
predict_train = rf.predict(hotVtrain)
from sklearn import metrics
metrics.r2_score(predict_train, y_train)

0.9796857833247454

## Parameters Tuning

In [161]:
from pprint import pprint
#pprint(rf.get_params())
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [164]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
from sklearn.model_selection import RandomizedSearchCV
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(hotVtrain,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [168]:
rf_random.best_params_


{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

In [169]:
# Output to CSV again
predictedRandom = rf_random.predict(hotVtest)
#np.savetxt("output.csv",predicted_test,delimiter=",", header="Id, SalePrice")
predicted_pandas = pd.DataFrame(predictedRandom)

frames = [test_id, predicted_pandas]
result = pd.concat(frames, axis=1)
result.columns = ['Id','SalePrice']
result.to_csv("output2.csv",index=False)

### R2 Score is very Good!
the Kaggle score is 0.148

In [171]:
predict_train = rf_random.predict(hotVtrain)
from sklearn import metrics
metrics.r2_score(predict_train, y_train)

0.9999999993388834

### XGBoost

In [197]:
from xgboost import XGBRegressor
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from numpy import absolute
# define model
xgbmodel = XGBRegressor()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(xgbmodel, hotVtrain, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()) )

Mean MAE: 17307.632 (1705.613)


In [198]:
xgbmodel.fit(hotVtrain,y_train)
predicted_xgb = xgbmodel.predict(hotVtest)

In [199]:
# Output to CSV again
predicted_pandas = pd.DataFrame(predicted_xgb)

frames = [test_id, predicted_pandas]
result = pd.concat(frames, axis=1)
result.columns = ['Id','SalePrice']
result.to_csv("output3.csv",index=False)

Kaggle score is 0.147