In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
X_full = pd.read_csv('train.csv')
X_full_test = pd.read_csv('test.csv')

X_full.dropna(subset=['SalePrice'], axis=0, inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

X_train_full, X_val_full, y_train, y_val = train_test_split(X_full, y, train_size=0.8, test_size=0.2, random_state=0)

categorical_cols = [col for col in X_train_full.columns if
                   X_train_full[col].dtype == 'object' and
                   X_train_full[col].nunique() < 10]
numerical_cols = [col for col in X_train_full.columns if
                 X_train_full[col].dtype in ['float64', 'int64']]

my_cols = categorical_cols + numerical_cols

X_train = X_train_full[my_cols].copy()
X_val = X_val_full[my_cols].copy()
X_test = X_full_test[my_cols].copy()

In [5]:
X_train.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Norm,Norm,...,774,0,108,0,0,260,0,0,7,2007
870,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,PosN,Norm,...,308,0,0,0,0,0,0,0,8,2009
92,RL,Pave,Grvl,IR1,HLS,AllPub,Inside,Gtl,Norm,Norm,...,432,0,0,44,0,0,0,0,8,2009
817,RL,Pave,,IR1,Lvl,AllPub,CulDSac,Gtl,Norm,Norm,...,857,150,59,0,0,0,0,0,7,2008
302,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Norm,Norm,...,843,468,81,0,0,0,0,0,1,2006


In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [7]:
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
#Preprocessing using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [8]:
my_model = RandomForestRegressor(n_estimators=100, random_state=0)

#Bundling Preprocessing and model in Pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('model', my_model)])
clf.fit(X_train, y_train)
val_preds = clf.predict(X_val)

print("MAE: ")
print(mean_absolute_error(y_val, val_preds))

MAE: 
17769.80962328767


In [9]:
test_preds = clf.predict(X_test)

output = pd.DataFrame({'Id': X_test.index,
                      'SalePrice': test_preds})
output.to_csv('Pipeline-submission.csv', index=False)