In [1]:
import pandas as pd

In [3]:
data_filepath = '/home/oktavianu/my-Ai/ML-basics/melb_data.csv'
data = pd.read_csv(data_filepath)

In [4]:
# Select Target 
y = data.Price

In [5]:
# Use only numerical predictors to make things simple
melb_predictors = data.drop(['Price'], axis=1) # drop or exclude 'Price' from our predictors
X = melb_predictors.select_dtypes(exclude=['object']) # exlude object so that X will only contains column with numbers as data types


In [6]:
from sklearn.model_selection import train_test_split

# Divide data into training and testing or validations subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Define function to measure each approach 
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(preds, y_valid)

In [18]:
# Obtain score mae from approach 1:

# get columns' names with missing values
cols_with_mv = [col for col in X_train.columns if X_train[col].isnull().any()]
cols_with_mv

['Car', 'BuildingArea', 'YearBuilt']

In [20]:
# Now we already have the columns with msiing values which are ['Car', 'BuildingArea', 'YearBuilt']
# Drop columns in training and validation data with missing values
reduced_X_train = X_train.drop(cols_with_mv, axis=1)
reduced_X_valid = X_valid.drop(cols_with_mv, axis=1) 

#print("X before missing values dropped:")
#X_train.head()
#print("X after dropping mssing values")
#reduced_X_train.head()

print("MAE from approach 1 (Drop columns with mssing values: ")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE from approach 1 (Drop columns with mssing values: 
186029.25597692686


In [23]:
# score from approach 2, imputation
# We will use SimpleImputer to to replace mssing values with the mean along each column.
from sklearn.impute import SimpleImputer

# Imputation
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# Imputation removes columns name, we need to put 'em back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print('MAE from approach 2 (Imputation):')
print(score_dataset(X_train, X_valid, y_train, y_valid))



MAE from approach 2 (Imputation):
178605.2755768287


In [None]:
# Score from approach 3 (extension of imputation):

# make a copy to avoid 