In [44]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [26]:
# Read the housing data set from a CSV file in the working directory.
houses = pd.read_csv(filepath_or_buffer='melb_data.csv')

# Print column names, non-null counts and dtypes to see which columns have gaps.
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [31]:
# Separate the target column (Price) from the candidate feature columns.
y = houses['Price']
x = houses.drop(columns='Price')

# Hold out a validation set; the fixed seed keeps the split reproducible.
x_train, x_valid, y_train, y_valid = train_test_split(x, y, random_state=1)

# Route columns by dtype family rather than by exact dtype name: matching only
# 'int64'/'float64' would leave numeric columns of any other width (e.g. int32,
# float32) out of both lists, and columns in neither list never reach the model.
num_columns = x_train.select_dtypes(include='number').columns.tolist()

# NOTE(review): every object-dtype column is treated as categorical, including
# ones such as Address that are presumably near-unique per row — confirm that
# one-hot encoding them downstream is intended.
cat_columns = x_train.select_dtypes(include='object').columns.tolist()

In [40]:
# Build a preprocessing + model pipeline with ColumnTransformer.

# Numeric columns: fill missing values with the column mean.
numerical_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='mean'))
])

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps prediction from failing on
# categories that were not present when the encoder was fitted.
categorical_transform = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Apply each sub-pipeline to its own list of columns and concatenate the results.
pre_processor = ColumnTransformer([
    ('num', numerical_transform, num_columns),
    ('cat', categorical_transform, cat_columns),
])

pipeline = Pipeline([
    ('preprocessor', pre_processor),
    # The step name 'classifier' is kept so existing references to it keep
    # working, although the estimator is a regressor.
    # random_state is fixed (same seed as the train/validation split) so the
    # forest, and therefore the predictions and scores, are reproducible.
    ('classifier', RandomForestRegressor(random_state=1))
])

In [41]:
# Fit the preprocessing steps and the model on the training split only.
pipeline.fit(X=x_train, y=y_train)

In [42]:
# Predict on the held-out validation features with the fitted pipeline
# (the pipeline applies the same preprocessing before the model predicts).
pipeline.predict(X=x_valid)

array([1743326.,  787616., 2300340., ..., 1433655., 1289115.,  518187.])

In [45]:
# Evaluate with cross-validation.
# No manual train/test split is needed here: cross_val_score partitions the
# data into folds itself.

x = houses.drop(columns='Price')
y = houses['Price']

scores = cross_val_score(
    estimator=pipeline,
    X=x,
    y=y,
    scoring='neg_mean_absolute_error',
    cv=5,
)

In [47]:
# The scorer is 'neg_mean_absolute_error', so the values are negated MAE;
# flip the sign to report the per-fold error as a positive number.
print(-scores)

[201175.79964286 183345.01949558 181298.7529676  151695.04337629
 157507.15084315]
