In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
url = 'https://raw.githubusercontent.com/mirokr/ml/main/melb_data.csv'


In [None]:
data = pd.read_csv(url)

In [None]:
y = data.Price
melb_predict = data.drop(['Price'], axis=1)

In [None]:
X = melb_predict.select_dtypes(exclude=['object'])

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size = 0.2, random_state=0) 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [None]:
def score_data(X_train,X_valid, y_train, y_valid):
  model = RandomForestRegressor(n_estimators=15, random_state=0)
  model.fit(X_train, y_train)
  pred = model.predict(X_valid)
  return mean_absolute_error(y_valid, pred)

In [None]:
columns_with_missing = [column for column in X_train.columns if X_train[column].isnull().any()]

In [None]:
clean_X_train = X_train.drop(columns_with_missing, axis=1)
clean_X_valid = X_valid.drop(columns_with_missing, axis=1)
print('MAE from Removing Columns Method:')
print(score_data(clean_X_train, clean_X_valid, y_train, y_valid))

MAE from Removing Columns Method:
181310.27503448114


In [None]:
#imputation

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(imputer.fit_transform(X_valid))
print('MAE using imputation')
print(score_data(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE using imputation
176285.68010145638


In [None]:
#extended imputation

In [None]:
X_train_ext = X_train.copy()
X_valid_ext = X_valid.copy()

In [None]:
for column in columns_with_missing:
  X_train_ext[column + '_was_missing'] = X_train_ext[column].isnull()
  X_valid_ext[column + '_was_missing'] = X_valid_ext[column].isnull()

In [None]:
imputer = SimpleImputer()

In [None]:
imputed_X_train_ext = pd.DataFrame(imputer.fit_transform(X_train_ext))
imputed_X_valid_ext = pd.DataFrame(imputer.fit_transform(X_valid_ext))

In [None]:
imputed_X_train_ext.columns = X_train_ext.columns
imputed_X_valid_ext.columns = X_valid_ext.columns
print('MAE with extended imputatiom')
print(score_data(imputed_X_train_ext, imputed_X_valid_ext, y_train, y_valid))

MAE with extended imputatiom
176134.6387778479


In [1]:
#categorical

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
url = 'https://raw.githubusercontent.com/mirokr/ml/main/melb_data.csv'
data = pd.read_csv(url)

In [5]:
y = data.Price
X = data.drop (['Price'], axis=1)

In [7]:
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size=0.75, test_size=0.25, random_state=0)

In [8]:
#droping columns with missing values
cols_with_missing = [col for col in X_train_full.columns if X_train_full[col].isnull().any()]
X_train_full.drop(cols_with_missing, axis=1, inplace= True)
X_valid_full.drop(cols_with_missing, axis=1, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [9]:
low_card_columns = [colname for colname in X_train_full.columns  if X_train_full[colname].nunique() < 15 and X_train_full[colname].dtype == 'object']

In [11]:
num_cols = [colname for colname in X_train_full.columns if X_train_full[colname].dtype in ['int32', 'int64', 'float64']]

In [12]:
m_columns = low_card_columns + num_cols
X_train = X_train_full[m_columns].copy()
X_valid = X_valid_full[m_columns].copy()

In [13]:
X_train.head()

Unnamed: 0,Type,Method,Regionname,Rooms,Distance,Postcode,Bedroom2,Bathroom,Landsize,Lattitude,Longtitude,Propertycount
664,h,S,Southern Metropolitan,3,9.2,3104.0,3.0,2.0,368.0,-37.7846,145.0935,7809.0
3270,h,S,Eastern Metropolitan,2,10.5,3081.0,2.0,1.0,586.0,-37.7435,145.0486,2947.0
3873,h,S,Southern Metropolitan,2,11.2,3145.0,2.0,1.0,348.0,-37.8672,145.0432,8801.0
13170,h,S,Northern Metropolitan,3,19.6,3076.0,3.0,1.0,521.0,-37.63854,145.05179,10926.0
1730,h,S,Southern Metropolitan,4,11.4,3163.0,3.0,2.0,687.0,-37.8931,145.0479,7822.0


In [14]:
#list of categorical 
l = (X_train.dtypes == 'object')
object_col = list(l[l].index)

print('Categorical variables:')
print(object_col)

Categorical variables:
['Type', 'Method', 'Regionname']


In [15]:
#check quality

In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [17]:
def score_data(X_train, X_valid, y_train, y_valid):
  model = RandomForestRegressor(n_estimators=110, random_state=0)
  model.fit(X_train, y_train)
  pred = model.predict(X_valid)
  return mean_absolute_error(y_valid, pred)


In [18]:
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

print('MAE when droping categorical:')
print(score_data(drop_X_train, drop_X_valid, y_train, y_valid))

MAE when droping categorical:
179228.6127520524


In [19]:
#ordinal encoding

In [20]:
from sklearn.preprocessing import OrdinalEncoder

In [26]:
lab_X_train = X_train.copy()
lab_X_valid = X_valid.copy()

ordinal_encoder = OrdinalEncoder()
lab_X_train[object_col] = ordinal_encoder.fit_transform(X_train[object_col])
lab_X_valid[object_col] = ordinal_encoder.transform(X_valid[object_col])

print('MAE with ordinal encoding')
print(score_data(lab_X_train, lab_X_valid, y_train, y_valid))

MAE with ordinal encoding
169992.70913043755


In [22]:
#hot encoder

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [25]:
oh_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
oh_cols_train = pd.DataFrame(oh_encoder.fit_transform(X_train[object_col]))
oh_cols_valid = pd.DataFrame(oh_encoder.transform(X_valid[object_col]))

#put back index
oh_cols_train.index = X_train.index
oh_cols_valid.index = X_valid.index

In [28]:
#remove categorical
n_X_train = X_train.drop(object_col, axis=1)
n_X_valid = X_valid.drop(object_col, axis=1)
#add oh encoded columns
oh_X_train = pd.concat([n_X_train, oh_cols_train], axis=1)
oh_X_valid = pd.concat([n_X_valid, oh_cols_valid], axis=1)

print('MAE with OH Encoding:')
print(score_data(oh_X_train, oh_X_valid, y_train, y_valid))

MAE with OH Encoding:
168987.16114313767


In [29]:
#pipelines