In [10]:
# importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

In [14]:
# reading data: load 'Data/melb_data.csv' (path is relative to the working directory) into a DataFrame
melbourne_data = pd.read_csv('Data/melb_data.csv')

In [13]:
# examining data: preview the first rows, then report (rows, columns)
# A notebook cell only renders its final expression, so the preview has to be
# displayed explicitly; otherwise the head() result is silently discarded.
display(melbourne_data.head())
melbourne_data.shape

(13580, 21)

In [15]:
# target vector: the 'Price' column is what the model will predict
y = melbourne_data['Price']

# work on a copy of the original dataset and strip the target out of the features
melb_data = melbourne_data.copy()
features = melb_data.drop(columns=['Price'])

# keep only non-object columns, used here as the numerical predictors
X = features.select_dtypes(exclude=['object'])

# hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [16]:
# function for comparing different approaches to handling missing values

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Return the validation mean absolute error of a random forest.

    A RandomForestRegressor with 10 trees and a fixed seed is fitted on
    (X_train, y_train); its predictions for X_valid are compared with
    y_valid. The fixed seed keeps scores comparable between approaches.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=10)
    predictions = forest.fit(X_train, y_train).predict(X_valid)
    return mean_absolute_error(y_valid, predictions)

In [17]:
# approach 1 - drop columns with missing values

def drop_columns(X_train, X_valid):
    """Drop every column that has a missing value in the training frame.

    The columns to remove are chosen by inspecting X_train only; the same
    columns are then removed from X_valid. Neither input is modified.

    Returns a two-element list: [reduced_X_train, reduced_X_valid].
    """
    # boolean flag per column: True where the training data has at least one null
    has_missing = X_train.isnull().any(axis=0).to_numpy(dtype=bool)
    incomplete_cols = list(X_train.columns[has_missing])

    # drop the same set of columns from both frames
    return [frame.drop(columns=incomplete_cols) for frame in (X_train, X_valid)]

In [18]:
# scoring of 1st approach (dropping columns with missing values)

# remove from both splits every column that has a missing value in the training split
reduced_X_train, reduced_X_valid = drop_columns(X_train, X_valid)

# fit the random forest on the reduced features and show the validation MAE
mae = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
mae

181715.7551528859

In [None]:
# approach 2 - replacing missing values with the mean value along each column

def imputation_dataset(X_train, X_valid):
    """Fill missing values in both frames using statistics from the training frame.

    The imputer is fitted on X_train only and then applied to X_valid, so
    nothing computed from the validation data influences the fill values.

    Parameters
    ----------
    X_train, X_valid : pandas.DataFrame
        Feature frames to impute. Neither is modified.

    Returns
    -------
    list
        [imputed_X_train, imputed_X_valid], each a DataFrame that keeps the
        column names and the row index of its input.
    """
    # define imputer (default settings)
    imputer = SimpleImputer()

    # The imputer returns bare arrays. Rebuild the frames with the original
    # column names AND the original row index; without the index the frames
    # get a fresh 0..n-1 index and no longer align by label with the target
    # Series produced by the train/validation split.
    # NOTE(review): per the scikit-learn docs, columns that are entirely
    # missing in X_train may be dropped by the imputer, which would make the
    # column labels below mismatch - confirm no such columns exist.
    imputed_X_train = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index,
    )
    imputed_X_valid = pd.DataFrame(
        imputer.transform(X_valid),
        columns=X_valid.columns,
        index=X_valid.index,
    )

    return [imputed_X_train, imputed_X_valid]