In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [45]:
url = 'https://raw.githubusercontent.com/mirokr/ml/main/melb_data.csv'


In [46]:
data = pd.read_csv(url)

In [48]:
y = data.Price
melb_predict = data.drop(['Price'], axis=1)

In [49]:
X = melb_predict.select_dtypes(exclude=['object'])

In [50]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size = 0.2, random_state=0) 

In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [52]:
def score_data(X_train,X_valid, y_train, y_valid):
  model = RandomForestRegressor(n_estimators=15, random_state=0)
  model.fit(X_train, y_train)
  pred = model.predict(X_valid)
  return mean_absolute_error(y_valid, pred)

In [54]:
columns_with_missing = [column for column in X_train.columns if X_train[column].isnull().any()]

In [56]:
clean_X_train = X_train.drop(columns_with_missing, axis=1)
clean_X_valid = X_valid.drop(columns_with_missing, axis=1)
print('MAE from Removing Columns Method:')
print(score_data(clean_X_train, clean_X_valid, y_train, y_valid))

MAE from Removing Columns Method:
181310.27503448114


In [None]:
#imputation

In [57]:
from sklearn.impute import SimpleImputer

In [58]:
imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(imputer.fit_transform(X_valid))
print('MAE using imputation')
print(score_data(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE using imputation
176285.68010145638


In [59]:
#extended imputation

In [60]:
X_train_ext = X_train.copy()
X_valid_ext = X_valid.copy()

In [61]:
for column in columns_with_missing:
  X_train_ext[column + '_was_missing'] = X_train_ext[column].isnull()
  X_valid_ext[column + '_was_missing'] = X_valid_ext[column].isnull()

In [62]:
imputer = SimpleImputer()

In [64]:
imputed_X_train_ext = pd.DataFrame(imputer.fit_transform(X_train_ext))
imputed_X_valid_ext = pd.DataFrame(imputer.fit_transform(X_valid_ext))

In [66]:
imputed_X_train_ext.columns = X_train_ext.columns
imputed_X_valid_ext.columns = X_valid_ext.columns
print('MAE with extended imputatiom')
print(score_data(imputed_X_train_ext, imputed_X_valid_ext, y_train, y_valid))

MAE with extended imputatiom
176134.6387778479
