In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [3]:
train_data = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv')

X = train_data.drop('SalePrice', axis=1)
y = train_data.SalePrice

object_cols = [col for col in X.columns if X[col].dtype=='object']
numerical_cols = list(set(X.columns) - set(object_cols))

In [16]:
ordinal = OrdinalEncoder()
X[object_cols] = pd.DataFrame(ordinal.fit_transform(X[object_cols]))

imputer_mean = SimpleImputer()
imputer_mode = SimpleImputer(strategy='most_frequent')

preprocessor = ColumnTransformer(transformers=[('num', imputer_mean, numerical_cols),
                                               ('cat', imputer_mode, object_cols)])

model = RandomForestRegressor()

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])


scores = -1 * cross_val_score(my_pipeline, X, y, cv=5, scoring='neg_mean_absolute_error')
print(scores.mean())


17639.799424657533
