In [71]:
import pandas as pd

# Location of the Melbourne housing CSV, given relative to the working directory.
melbourne_file_path = "melb_data.csv"
# Read the whole file into a DataFrame.
melbourne_data = pd.read_csv(melbourne_file_path)
# Optional quick look at summary statistics:
#melbourne_data.describe()

In [72]:
 # List the column names available in the loaded dataset.
 melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [73]:
# Prediction target: the 'Price' column of the dataset.
y = melbourne_data["Price"]

In [74]:
# Names of the columns used as model inputs.
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [75]:
# Feature matrix: only the columns named in melbourne_features.
X = melbourne_data[melbourne_features]
# Optional preview of the first rows:
#X.head()

In [76]:
from sklearn.tree import DecisionTreeRegressor

# Define the model. Fixing random_state makes the fitted tree reproducible
# from one run to the next.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit the model on all of X and y (no rows are held out at this point).
melbourne_model.fit(X,y)

DecisionTreeRegressor(random_state=1)

In [77]:
# Show the first five feature rows, then the model's predictions for them.
print('making predictions for the following five houses', X.head(), sep='\n')
print("The predictions are...", melbourne_model.predict(X.head()), sep='\n')

# A naive accuracy check is to eyeball these against the actual prices:
# y.head()

making predictions for the following five houses
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
0      2       1.0     202.0   -37.7996    144.9984
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
3      3       2.0      94.0   -37.7969    144.9969
4      4       1.0     120.0   -37.8072    144.9941
The predictions are...
[1480000. 1035000. 1465000.  850000. 1600000.]


In [78]:
# evaluating the quality of the model using Mean Absolute Error
from sklearn.metrics import mean_absolute_error

# NOTE: the predictions are made on X, the same data melbourne_model was
# fitted on, so this is an in-sample error rather than a validation score.
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

1125.1804614629357

In [79]:
from sklearn.model_selection import train_test_split

# Split both features and target into training and validation parts.
# train_test_split shuffles with a random number generator; passing a numeric
# random_state yields the same split every time the notebook is run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define the model. random_state is pinned (same seed as the first tree in this
# notebook) because an unseeded tree can choose different splits on each run,
# which would make the validation MAE below change between runs.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit on the training part only, so the validation rows stay unseen.
melbourne_model.fit(train_X, train_y)

# Predict prices for the validation rows and report the mean absolute error.
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))
print(val_predictions)

250777.73235149728
[1325000.  950000.  571000. ... 1400000.  819000. 1285000.]


In [80]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with a fixed seed, trained on the training split only.
# fit() returns the estimator itself, so construction and fitting are chained.
melbourne_model_2 = RandomForestRegressor(random_state=1).fit(train_X, train_y)

# Predict on the validation split and report the mean absolute error.
val_predictions_2 = melbourne_model_2.predict(val_X)
print(mean_absolute_error(y_true=val_y, y_pred=val_predictions_2))

180860.37877504269


In [81]:
# define a function that builds a model and makes predictions
    
from sklearn.metrics import mean_absolute_error

def Scoreall(train_X, val_X, train_y, val_y):
    """Fit a 10-tree random forest and return its validation MAE.

    Args:
        train_X: Features used to fit the model.
        val_X: Features the fitted model predicts on.
        train_y: Target values matching ``train_X``.
        val_y: Target values the predictions for ``val_X`` are scored against.

    Returns:
        The mean absolute error between ``val_y`` and the predictions.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=1)
    forest.fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Baseline score using every selected feature.
Scoreall(train_X=train_X, val_X=val_X, train_y=train_y, val_y=val_y)



191525.59192369733

In [82]:
# Names of the training columns that contain at least one missing value.
has_missing = train_X.isnull().any()
cols_with_missing = train_X.columns[has_missing].tolist()

# Drop those columns from both the training and the validation data.
reduced_X_train = train_X.drop(columns=cols_with_missing)
reduced_X_val = val_X.drop(columns=cols_with_missing)

print("MAE from approache 1(drop columns with missing data)")
print(Scoreall(reduced_X_train, reduced_X_val, train_y, val_y))

MAE from approache 1(drop columns with missing data)
191525.59192369733


In [83]:
# build a pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing values with a constant (no fill_value is
# given, so the imputer's default constant is used).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns whose dtype is int64 or float64.
numerical_cols = [
    name for name, dtype in train_X.dtypes.items()
    if dtype in ('int64', 'float64')
]

# Object-dtype columns with fewer than 10 distinct values.
categorical_cols = [
    name for name, dtype in train_X.dtypes.items()
    if dtype == "object" and train_X[name].nunique() < 10
]

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


#define the model. we'd be using randomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Random forest with 100 trees and a fixed seed.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Chain preprocessing and the model so both run in a single fit/predict call.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Preprocess the training split and fit the model on it.
my_pipeline.fit(train_X, train_y)

# Preprocess the validation split and predict on it.
preds = my_pipeline.predict(val_X)

# Score the predictions with mean absolute error.
score = mean_absolute_error(y_true=val_y, y_pred=preds)
print('MAE:', score)

MAE: 182369.6504033476


In [84]:
# Save the validation-set predictions to a CSV file, one row per validation index.
# NOTE(review): this writes val_predictions_2 (from melbourne_model_2), not the
# pipeline's `preds` - confirm that this is the intended model.
# NOTE(review): the column is labelled 'SalePrice' although the target column
# in this dataset is 'Price' - confirm the expected output schema.

output = pd.DataFrame({'Id':val_X.index, 'SalePrice': val_predictions_2})
output.to_csv('submission.csv', index=False)

In [85]:
# Using cross-validation to better measure the quality of our model.
# First, look at summary statistics of the feature matrix.

X.describe()

Unnamed: 0,Rooms,Bathroom,Landsize,Lattitude,Longtitude
count,13580.0,13580.0,13580.0,13580.0,13580.0
mean,2.937997,1.534242,558.416127,-37.809203,144.995216
std,0.955748,0.691712,3990.669241,0.07926,0.103916
min,1.0,0.0,0.0,-38.18255,144.43181
25%,2.0,1.0,177.0,-37.856822,144.9296
50%,3.0,1.0,440.0,-37.802355,145.0001
75%,3.0,2.0,651.0,-37.7564,145.058305
max,10.0,8.0,433014.0,-37.40853,145.52635


In [86]:
# Preview the first few values of the target.
y.head()

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64

In [87]:
# build a pipeline to preprocess and model the data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Columns whose dtype is int64 or float64.
numerical_cols = [
    name for name, dtype in X.dtypes.items()
    if dtype in ('int64', 'float64')
]

# Object-dtype columns with fewer than 10 distinct values.
categorical_cols = [
    name for name, dtype in X.dtypes.items()
    if dtype == 'object' and X[name].nunique() < 10
]

# Numerical columns: fill missing values with a constant (no fill_value is
# given, so the imputer's default constant is used).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit are ignored.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Random forest with 100 trees and a fixed seed.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Chain preprocessing and the model into one estimator.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])


# obtain cross-validation scores 

from sklearn.model_selection import cross_val_score

# The 'neg_mean_absolute_error' scorer returns MAE with its sign flipped,
# so negate the result to get a positive MAE for each of the 5 folds.
scores = -cross_val_score(
    my_pipeline,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

print("MAE scores: \n", scores)


MAE scores: 
 [220173.00417302 206653.76878901 207300.40187557 172659.62595651
 168304.71075373]


In [88]:
# Average the per-fold MAE values into a single figure.
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

Average MAE score (across experiments):
195018.30230956824


In [89]:
# evaluate the random forest regressor with 8 different numbers of trees:
# 50, 100, 150, 200, 250, 300, 350, 400

def get_score(n_estimators):
    """Return the mean 3-fold cross-validated MAE of a random forest.

    The forest sits behind a default ``SimpleImputer`` in a pipeline and is
    evaluated on the notebook-level ``X`` and ``y``.

    Args:
        n_estimators: Number of trees in the forest.

    Returns:
        The MAE averaged over the three folds.
    """
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=0)
    pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', forest),
    ])
    # The scorer reports negated MAE; multiply by -1 to make it positive.
    fold_mae = -1 * cross_val_score(
        pipeline, X, y, cv=3, scoring='neg_mean_absolute_error'
    )
    return fold_mae.mean()
    
# Mean cross-validated MAE keyed by tree count: 50, 100, ..., 400.
results = {n_trees: get_score(n_trees) for n_trees in range(50, 401, 50)}

print(results)

{50: 209246.14944824608, 100: 207274.41900201212, 150: 206597.2181800314, 200: 206476.26672387394, 250: 206175.262666745, 300: 206120.82258765426, 350: 206295.23540777425, 400: 206366.79827257417}
