In [None]:
import pandas as pd

# Location of the dataset CSV, relative to the notebook's working directory.
melbourne_file_path = '../data/melb_data.csv'

# Read the whole file into a DataFrame, then report its (rows, columns) shape
# followed by its column labels, one per line.
melbourne_data = pd.read_csv(melbourne_file_path)
print(melbourne_data.shape, melbourne_data.columns, sep="\n")

In [None]:
# Discard every row that has at least one missing value, then show what is left.
melbourne_data = melbourne_data.dropna()
print(melbourne_data.shape, melbourne_data.columns, sep="\n")

# Prediction target: the Price column.
y = melbourne_data["Price"]
print("y", y)

# Feature columns handed to the model.
# NOTE(review): 'Lattitude' / 'Longtitude' are presumably the dataset's own
# (misspelled) column labels - confirm before "correcting" them.
melbourne_features = ['Rooms', 'Bathroom', 'Landsize', 'Lattitude', 'Longtitude']
X = melbourne_data[melbourne_features]
print('x description', X.describe())
print('x head', X.head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Hold out part of the rows for validation; the fixed seed keeps the split
# identical from run to run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Decision-tree regressor with a pinned random_state so repeated runs agree.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Train on the training split only; the validation split stays unseen.
melbourne_model.fit(train_X, train_y)

In [None]:
from sklearn.metrics import mean_absolute_error


# Predict on the held-out validation split (rows the model was not fitted on).
predicted_home_prices = melbourne_model.predict(val_X)

print("Making predictions for the following 5 houses:")
# Show the validation rows themselves: the predictions printed below are made
# for val_X, so displaying X.head() would list houses that do not correspond
# to them (the split shuffles the rows).
print(val_X.head())
print("The predictions are")
# First five true prices next to the first five predicted prices.
print("top value: ", val_y.head(), predicted_home_prices[0:5])

# Mean absolute error over the whole validation split; left as the cell's last
# expression so the notebook displays it.
mean_absolute_error(val_y, predicted_home_prices)

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Reload the raw CSV: unlike the earlier cells, rows with missing values are
# kept here and the gaps are filled by imputation instead.
path = '../data/melb_data.csv'
_data = pd.read_csv(path)
print("shape", _data.shape)
print("column", _data.columns)

_feature = ["Rooms", "Bathroom", "Landsize", "Lattitude", "Longtitude"]
X = _data[_feature]
print("X columns description", X.columns)
y = _data.Price

train_x, val_x, train_y, val_y = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Work on explicit copies: the split frames are derived from a slice of _data,
# and adding columns to them directly triggers pandas' SettingWithCopyWarning.
train_x = train_x.copy()
val_x = val_x.copy()

model = DecisionTreeRegressor(random_state=1)

# Columns with at least one missing value in the TRAINING split; the same list
# is applied to the validation split so both frames end up with equal columns.
cols_with_missing = [col for col in train_x.columns
                     if train_x[col].isnull().any()]

# Record where values were missing before the imputer overwrites that fact.
for col in cols_with_missing:
    train_x[col + '_was_missing'] = train_x[col].isnull()
    val_x[col + '_was_missing'] = val_x[col].isnull()

# Fit the imputer on the training split only, then reuse it on validation.
# The transformer returns bare arrays, so the column labels and row index are
# put back to keep the frames aligned with train_y / val_y and self-describing.
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(train_x),
                               columns=train_x.columns, index=train_x.index)
imputed_X_valid = pd.DataFrame(my_imputer.transform(val_x),
                               columns=val_x.columns, index=val_x.index)

model.fit(imputed_X_train, train_y)
prediction = model.predict(imputed_X_valid)

print("evaluation: ", mean_absolute_error(val_y, prediction))


In [None]:
# Split every column EXCEPT the target. Price is numeric, so leaving it in the
# frame would put it into numerical_cols and leak the answer into the features.
# NOTE(review): _data has had no missing-value handling at this point, so the
# numeric columns may still contain NaN - confirm the downstream estimator
# accepts them.
train_x, val_x, train_y, val_y = train_test_split(
    _data.drop(columns=['Price']), y, train_size=0.8, test_size=0.2, random_state=0)

# Text columns with fewer than 10 distinct values in the training split.
low_cardinality_cols = [cname for cname in train_x.columns if train_x[cname].nunique() < 10 and
                        train_x[cname].dtype == "object"]

print('low_cardinality_cols', low_cardinality_cols)

# Columns stored as 64-bit integers or floats.
numerical_cols = [cname for cname in train_x.columns if train_x[cname].dtype in ['int64', 'float64']]

print('numerical_cols', numerical_cols)

# Keep selected columns only
my_cols = low_cardinality_cols + numerical_cols
print('my_cols', my_cols)
X_train = train_x[my_cols].copy()
X_valid = val_x[my_cols].copy()

print("----------")
print(_data.head())
print('----------')

# Boolean mask over X_train's columns: True where the dtype is object.
s = (X_train.dtypes == 'object')
print('ssss', s)
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training data and score it on the validation data.

    A fresh ``RandomForestRegressor`` (100 estimators, ``random_state=0``) is
    built on every call, so successive calls differ only in the data passed in.

    Args:
        X_train: Training features.
        X_valid: Validation features.
        y_train: Training targets.
        y_valid: Validation targets.

    Returns:
        The mean absolute error between ``y_valid`` and the predictions made
        for ``X_valid``.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    validation_predictions = forest.predict(X_valid)
    return mean_absolute_error(y_valid, validation_predictions)

In [None]:
# Approach 1: discard the categorical (object-dtype) columns entirely and
# score the model on whatever columns remain.
drop_X_train, drop_X_valid = (
    frame.select_dtypes(exclude=['object']) for frame in (X_train, X_valid)
)

print("MAE from Approach 1 (Drop categorical variables):")
print(score_dataset(drop_X_train, drop_X_valid, train_y, val_y))

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Start from independent copies so the encoded values never overwrite
# X_train / X_valid themselves.
label_X_train, label_X_valid = X_train.copy(), X_valid.copy()

# Fit the encoder on the training split only, then apply that same fitted
# encoder to both splits' categorical columns.
ordinal_encoder = OrdinalEncoder().fit(X_train[object_cols])
label_X_train[object_cols] = ordinal_encoder.transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])

print("MAE from Approach 2 (Ordinal Encoding):") 
print(score_dataset(label_X_train, label_X_valid, train_y, val_y))

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns. Categories unseen during fitting are
# ignored at transform time, and the encoder returns dense arrays.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# The encoder returns plain arrays with no row labels, so each encoded frame is
# built with the index of the frame it came from.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]),
                             index=X_train.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]),
                             index=X_valid.index)

# Numerical part: the original frames minus the categorical columns that the
# one-hot columns replace.
num_X_train = X_train.drop(columns=object_cols)
num_X_valid = X_valid.drop(columns=object_cols)

# Stitch numerical and one-hot columns together side by side.
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# The one-hot columns carry integer labels; convert every label to str so the
# combined frames have uniformly typed column names.
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):") 
print(score_dataset(OH_X_train, OH_X_valid, train_y, val_y))