In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the labelled training data and the unlabelled test data from CSV files
# located in the current working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
test = pd.read_csv(filepath_or_buffer='test.csv')

In [4]:
# Separate the target column from the predictors. The labels are copied so
# they are independent of `train`.
housing_labels = train["SalePrice"].copy()
housing = train.drop(columns=["SalePrice"])

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks a fixed set of columns out of a DataFrame.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the input in ``transform``; stored
        unchanged on the instance.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the selector itself so it can be chained."""
        return self

    def transform(self, X, y=None):
        """Return the ``.values`` of ``X`` restricted to ``attribute_names``."""
        selected = X[self.attribute_names]
        return selected.values

In [6]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Partition the feature frame by dtype. Only the column labels of these two
# frames are consumed by the selectors below.
housing_num_cols = housing.select_dtypes(include=np.number)
housing_cat_cols = housing.select_dtypes(exclude=np.number).astype('category')

# Numeric branch: pick numeric columns -> fill gaps with the column median ->
# standardise.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(housing_num_cols.columns.tolist())),
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: pick non-numeric columns -> fill gaps with the literal
# 'none' -> one-hot encode, ignoring categories not seen during fit.
cat_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(housing_cat_cols.columns.tolist())),
    ('imputer', SimpleImputer(strategy="constant", fill_value='none')),
    ('oneHot', OneHotEncoder(handle_unknown='ignore')),
])

# Run both branches on the same input and concatenate their outputs
# column-wise.
full_pipeline = FeatureUnion([
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

In [7]:
# Fit the preprocessing pipeline (imputers, scaler, one-hot encoder) on the
# training features and transform them in the same call.
housing_prepared = full_pipeline.fit_transform(housing)

In [8]:
# Sanity check: dimensions of the preprocessed training matrix.
housing_prepared.shape

(1460, 305)

In [9]:
X = housing_prepared
y = housing_labels

# Hold out part of the labelled data (train_test_split's default split size)
# so the model can be scored on rows it was not trained on. The seed keeps the
# split reproducible.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

rf_model = RandomForestRegressor(n_estimators=100, random_state=1)
rf_model.fit(train_X, train_y)

# Score the model on the hold-out rows; without this the split above serves
# no purpose.
# NOTE: the preprocessing pipeline was fitted on all labelled rows before this
# split, so imputer/scaler statistics include the validation rows and this
# estimate is slightly optimistic.
val_mae = mean_absolute_error(val_y, rf_model.predict(val_X))
print(f"Validation MAE: {val_mae:,.0f}")

RandomForestRegressor(random_state=1)

In [10]:
# Apply the already-fitted preprocessing to the test set (transform only, no
# refit). The one-hot encoder was built with handle_unknown='ignore', so
# categories absent from the training data do not raise here.
test_prepared = full_pipeline.transform(test)

In [11]:
# Train the submission model on every labelled row. The hyper-parameters and
# the seed mirror the validated `rf_model`, so re-running the notebook
# reproduces the same submission file.
final_model = RandomForestRegressor(n_estimators=100, random_state=1)
final_model.fit(housing_prepared, housing_labels)

# Predictions for the unlabelled test set (these are not validation scores).
test_predictions = final_model.predict(test_prepared)

# Build the submission as its own frame instead of adding a column to `test`,
# so the raw test data stays untouched if cells are re-run.
submission = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_predictions})
submission.to_csv('submission.csv', index=False)