In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
housing = train.drop("SalePrice",axis=1)
housing_labels = train["SalePrice"].copy()

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X[self.attribute_names].values

In [5]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

housing_num_cols = housing.select_dtypes(include=np.number)
housing_cat_cols = housing.select_dtypes(exclude=np.number).astype('category')

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(list(housing_num_cols))),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler',StandardScaler()),
])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(list(housing_cat_cols))),
        ('imputer',SimpleImputer(strategy="constant",fill_value='none')),
        ('oneHot', OneHotEncoder(handle_unknown='ignore')),
])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
])

In [6]:
housing_prepared = full_pipeline.fit_transform(housing)

In [7]:
X = housing_prepared
y = housing_labels

train_X,val_X,train_y,val_y = train_test_split(X,y,random_state=1)

rf_model = RandomForestRegressor(n_estimators=100, random_state=1)
rf_model.fit(train_X,train_y)

RandomForestRegressor(random_state=1)

In [8]:
test_prepared = full_pipeline.transform(test)

In [9]:
cross_val_score(rf_model, train_X, train_y)

array([0.80960929, 0.83386384, 0.80485253, 0.80183762, 0.89097111])

In [13]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(train_X,train_y)
base_accuracy = evaluate(base_model, train_X,train_y)

Model Performance
Average Error: 8153.4712 degrees.
Accuracy = 95.35%.


In [14]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, train_X,train_y)

Model Performance
Average Error: 3433.2030 degrees.
Accuracy = 98.02%.


In [21]:
from sklearn.ensemble import GradientBoostingRegressor

est = GradientBoostingRegressor(n_estimators=1000,learning_rate=0.05).fit(train_X,train_y)


In [23]:
gb_accuracy = evaluate(est, train_X,train_y)

Model Performance
Average Error: 4318.3256 degrees.
Accuracy = 97.34%.


In [22]:
val_predictions = est.predict(test_prepared)
test['SalePrice'] = val_predictions

test[['Id','SalePrice']].to_csv('submission.csv',index=False)