In [80]:
import pandas as pd

# Path of the Melbourne housing CSV (a relative path, so it is resolved
# against the notebook's working directory).
melbourne_file_path = "melb_data.csv"

# Read the whole file into a DataFrame; every later cell works from this frame.
melbourne_data = pd.read_csv(filepath_or_buffer=melbourne_file_path)
# Optional quick look at summary statistics: melbourne_data.describe()

In [81]:
 # Show the column labels of the loaded DataFrame; as the cell's only
 # expression, its value is echoed as the cell output.
 melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [82]:
# Prediction target: the 'Price' column of the loaded data.
y = melbourne_data["Price"]

In [83]:
# Columns used as model inputs. The spellings 'Lattitude' and 'Longtitude'
# match the dataset's own column labels and must not be "corrected".
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]

In [84]:
# Feature matrix: every row of the data, restricted to the chosen feature columns.
X = melbourne_data.loc[:, melbourne_features]
# Optional preview of the first rows: X.head()

In [85]:
from sklearn.tree import DecisionTreeRegressor

# Build the decision tree with a fixed random_state so that refitting on the
# same data yields the same model.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Train on the full feature matrix and target. Kept as the cell's final
# expression so the fitted estimator is echoed as the cell output.
melbourne_model.fit(X=X, y=y)

DecisionTreeRegressor(random_state=1)

In [86]:
# Predict prices for the first five houses of the feature matrix and show the
# inputs next to the model's outputs.
sample_houses = X.head()

print('making predictions for the following five houses')
print(sample_houses)

print("The predictions are...")
print(melbourne_model.predict(sample_houses))

# A crude accuracy check is to eyeball these predictions against the true
# prices of the same rows: y.head()

making predictions for the following five houses
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
0      2       1.0     202.0   -37.7996    144.9984
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
3      3       2.0      94.0   -37.7969    144.9969
4      4       1.0     120.0   -37.8072    144.9941
The predictions are...
[1480000. 1035000. 1465000.  850000. 1600000.]


In [87]:
# evaluating the quality of the model using Mean Absolute Error
from sklearn.metrics import mean_absolute_error

# In-sample score: the model predicts the very rows (X, y) it was fitted on,
# so this MAE says nothing about performance on unseen houses.
predicted_home_prices = melbourne_model.predict(X)
mean_absolute_error(y_true=y, y_pred=predicted_home_prices)

1125.1804614629357

In [88]:
from sklearn.model_selection import train_test_split

# Split features and target into training and validation subsets.
# The split is driven by a random number generator; passing a fixed
# random_state guarantees the same split on every run.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

# Define the model. It is seeded as well: an unseeded tree can pick different
# splits from run to run, which would make the validation MAE below change
# between runs even though the data split is fixed.
melbourne_model = DecisionTreeRegressor(random_state=1)

# Fit on the training subset only, so the validation rows stay unseen.
melbourne_model.fit(train_X, train_y)

# Predict prices for the validation rows and report the out-of-sample MAE.
val_predictions = melbourne_model.predict(val_X)
print(mean_absolute_error(val_y, val_predictions))
print(val_predictions)

247230.40157093765
[1325000.  955000.  571000. ... 1400000.  819000. 1285000.]


In [89]:
from sklearn.ensemble import RandomForestRegressor

# Second model: a seeded random forest trained on the same training split.
melbourne_model_2 = RandomForestRegressor(random_state=1)
melbourne_model_2.fit(train_X, train_y)

# Score the forest on the held-out validation rows.
val_predictions_2 = melbourne_model_2.predict(val_X)
forest_mae = mean_absolute_error(val_y, val_predictions_2)
print(forest_mae)

180860.37877504269


In [90]:
# define a function that builds a model and makes predictions
    
from sklearn.metrics import mean_absolute_error

def Scoreall(train_X, val_X, train_y, val_y, n_estimators=10, random_state=1):
    """Fit a random forest on the training split and score it on the validation split.

    Args:
        train_X: Training features passed to ``fit``.
        val_X: Validation features passed to ``predict``.
        train_y: Training target passed to ``fit``.
        val_y: True validation target compared against the predictions.
        n_estimators: Number of trees handed to ``RandomForestRegressor``.
            Defaults to 10, the value that was previously hard-coded.
        random_state: Seed handed to ``RandomForestRegressor``. Defaults to 1,
            the value that was previously hard-coded.

    Returns:
        The mean absolute error between ``val_y`` and the model's predictions
        for ``val_X``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators,
                                  random_state=random_state)
    model.fit(train_X, train_y)
    preds = model.predict(val_X)
    return mean_absolute_error(val_y, preds)

# Baseline score on the existing train/validation split; the returned MAE is
# echoed as the cell output.
Scoreall(train_X=train_X, val_X=val_X, train_y=train_y, val_y=val_y)



191525.59192369733

In [91]:
# Flag each training column that holds at least one missing value, then
# collect the flagged column names in their original order.
missing_flags = train_X.isnull().any()
cols_with_missing = [name for name in train_X.columns if missing_flags[name]]

# Drop those columns from both splits so training and validation keep the
# same set of features.
reduced_X_train = train_X.drop(columns=cols_with_missing)
reduced_X_val = val_X.drop(columns=cols_with_missing)

# NOTE(review): the recorded MAE for this cell equals the score recorded for
# the undropped split, which suggests none of the selected features has
# missing values - confirm against the data.
print("MAE from approache 1(drop columns with missing data)")
print(Scoreall(reduced_X_train, reduced_X_val, train_y, val_y))

MAE from approache 1(drop columns with missing data)
191525.59192369733


In [94]:
# build a pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical features: fill missing values with a constant. No fill_value is
# given, so SimpleImputer's default applies (documented as 0 for numeric data).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode; categories unseen during fit are ignored at transform
# time instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numerical columns: those whose dtype is int64 or float64.
numeric_dtypes = ['int64', 'float64']
numerical_cols = [
    name for name in train_X.columns
    if train_X[name].dtype in numeric_dtypes
]

# Categorical columns: object-typed columns with fewer than 10 distinct values.
categorical_cols = [
    name for name in train_X.columns
    if train_X[name].dtype == "object" and train_X[name].nunique() < 10
]

# Bundle both preprocessing branches, each routed to its own column list.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


# Define the model: a RandomForestRegressor.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Seeded forest of 100 trees used as the final estimator of the pipeline.
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Chain preprocessing and the model so both are fitted and applied together.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fitting the pipeline fits the preprocessing on the training split and then
# trains the model on the transformed result.
my_pipeline.fit(train_X, train_y)

# Predicting runs the validation split through the same fitted preprocessing.
preds = my_pipeline.predict(val_X)

# Report the validation mean absolute error.
score = mean_absolute_error(val_y, preds)
print('MAE:', score)

MAE: 182369.6504033476


In [None]:
# save output to a csv file

# Pair each validation row's index label with the random-forest prediction
# for that row and write the table to disk without the DataFrame index.
# NOTE(review): these are validation-set predictions from melbourne_model_2,
# and the 'SalePrice' header differs from the 'Price' target column name -
# confirm the expected submission format.
submission_columns = {'Id': val_X.index, 'SalePrice': val_predictions_2}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)