In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Location of the melb_data.csv file (raw GitHub content).
url = (
    "https://raw.githubusercontent.com/"
    "mirokr/ml/main/melb_data.csv"
)


In [3]:
# Fetch the CSV referenced by `url` and parse it into a DataFrame.
data = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Features are every column except the target; the target is the Price column.
X = data.drop(columns='Price')
y = data['Price']

In [9]:
# 85% of the rows go to training, 15% to validation; the fixed random_state
# makes the split repeatable across runs.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.85,
    test_size=0.15,
    random_state=0,
)

In [10]:
# Column selection is derived from the training split only.
#
# categorical_col: text columns with fewer than 15 distinct values (kept small
# so that one-hot encoding does not explode the feature count). The dtype test
# accepts both the legacy `object` dtype and pandas' dedicated string dtype, so
# text columns are still found when pandas stores them as strings rather than
# objects. The cheap dtype test runs first so `nunique()` is only computed for
# text columns.
categorical_col = [
    colname
    for colname, dtype in X_train_full.dtypes.items()
    if (pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype))
    and X_train_full[colname].nunique() < 15
]

# num_col: every numeric column, whatever its width (int32/int64/float64 as
# before, plus float32, int16, unsigned ints, ...). Column order is preserved.
num_col = X_train_full.select_dtypes(include='number').columns.tolist()

In [11]:
# Keep only the selected columns (categorical first, then numeric) and work on
# independent copies so the *_full frames are left untouched.
m_cols = [*categorical_col, *num_col]
X_train, X_valid = (
    frame[m_cols].copy() for frame in (X_train_full, X_valid_full)
)

In [12]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
13218,h,VB,Southern Metropolitan,3,5.4,3101.0,3.0,2.0,1.0,275.0,,1992.0,-37.80442,145.04085,10331.0
42,t,S,Western Metropolitan,3,13.5,3042.0,3.0,2.0,2.0,239.0,134.0,2009.0,-37.7218,144.8837,3464.0
2162,t,S,Eastern Metropolitan,4,13.9,3108.0,4.0,2.0,2.0,182.0,160.0,1998.0,-37.7888,145.138,9028.0
6125,u,S,Southern Metropolitan,3,11.2,3127.0,3.0,1.0,2.0,195.0,,,-37.8287,145.1022,5457.0
9981,h,SP,Western Metropolitan,3,6.2,3015.0,3.0,1.0,2.0,492.0,103.0,1940.0,-37.83529,144.87893,5498.0


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

In [14]:
# Numeric columns: replace missing entries with a constant. No fill_value is
# passed, so the imputer's default constant is used.
num_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [15]:
# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer([
    ('num', num_transformer, num_col),
    ('cat', cat_transformer, categorical_col),
])

In [16]:
# Define the model.

In [17]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 110 trees; the fixed random_state makes training repeatable.
model = RandomForestRegressor(
    n_estimators=110,
    random_state=0,
)

In [18]:
from sklearn.metrics import mean_absolute_error

In [19]:
# Chain preprocessing and the model so both are fitted and applied together.
m_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [20]:
# Fit the preprocessing steps and the model on the training split.
m_pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='constant',
                                                                verbose=0),
                                                  ['Rooms', 'Distance',
                                                   'Postcode', 'Bedroom2',
                                                   'Bathroom', 'Car',
 

In [21]:
# Run the validation rows through the fitted pipeline to get predictions.
predictions = m_pipeline.predict(X=X_valid)

In [23]:
# Mean absolute error between the validation targets and the predictions.
score = mean_absolute_error(y_true=y_valid, y_pred=predictions)
print('MAE score:', score)

MAE score: 157696.1482508655
