In [8]:
import pandas as pd
import numpy as np

from autogluon.tabular import TabularDataset, TabularPredictor

melb_data = pd.read_csv('melb_data.csv')

In [9]:
# # Getting a training and validation dataset
from sklearn.model_selection import train_test_split
X_train, X_valid = train_test_split(melb_data, test_size=0.2, random_state=0)
print("Number of training samples:", len(X_train))
print("Number of test samples:", len(X_valid))

Number of training samples: 10864
Number of test samples: 2716


In [10]:
# Importing the dataset
train_data = TabularDataset(X_train)

# Defining a sample of the data
subsample_size = 1000
train_data = train_data.sample(n=subsample_size, random_state=0)
print(train_data.head())

label = 'Price'
print("Summary of the label: ", train_data[label].describe())

               Suburb          Address  Rooms Type      Price Method  \
13374  Brunswick East    117 Barkly St      2    h   975000.0      S   
6613        Yallambie  36 Longacres Rd      4    h   953000.0      S   
10831      Strathmore  1/351 Napier St      3    u   750000.0     VB   
8204        Parkville     21 Morrah St      4    h  3970000.0      S   
5485           Seddon       38 Lily St      3    h  1325000.0     VB   

         SellerG        Date  Distance  Postcode  ...  Bathroom  Car  \
13374     Nelson  26/08/2017       4.0    3057.0  ...       1.0  1.0   
6613   Fletchers  15/10/2016      15.0    3085.0  ...       2.0  2.0   
10831     Nelson   8/07/2017       8.2    3041.0  ...       2.0  2.0   
8204     Collins  13/05/2017       2.6    3052.0  ...       3.0  1.0   
5485         Jas   4/06/2016       6.6    3011.0  ...       3.0  2.0   

       Landsize  BuildingArea  YearBuilt    CouncilArea Lattitude  Longtitude  \
13374     159.0           NaN        NaN            N

In [11]:
# Model Parameters
time_limit = 60
metric = "mean_absolute_error"
presets = "best_quality"
problem_type = "regression"

# Training the model
predictor = TabularPredictor(
    label=label, eval_metric=metric, path="autogluon-models", problem_type=problem_type
).fit(train_data, time_limit=time_limit, presets=presets)

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 60 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: autogluon-models/ds_sub_fit/sub_fit_ho.
Running the sub-fit in a ray process to avoid memory leakage.
Spend 18 seconds for the sub-fit(s) during dynamic stacking.
Time left for full fit of AutoGluon: 42 seconds.
Starting full fit now with num_stack_levels 1.
Beginning AutoGluon training ... Time limit = 42s
AutoGluon will save models to "autogluon-model

In [12]:
predictor.leaderboard(train_data, silent=True)

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RandomForestMSE_BAG_L1,-76421.542625,-208719.984125,mean_absolute_error,0.109656,0.140538,1.185037,0.109656,0.140538,1.185037,1,True,5
1,LightGBM_BAG_L1,-88681.367922,-199726.658,mean_absolute_error,0.258707,0.102533,6.254738,0.258707,0.102533,6.254738,1,True,4
2,LightGBM_BAG_L2,-91787.821781,-208398.518469,mean_absolute_error,0.932376,0.497279,28.069468,0.085823,0.051565,4.996762,2,True,9
3,WeightedEnsemble_L3,-92767.293969,-192580.784344,mean_absolute_error,0.913851,0.493434,28.455857,0.003359,0.000695,0.382854,3,True,10
4,WeightedEnsemble_L2,-93281.657219,-192596.68875,mean_absolute_error,0.840235,0.438571,23.400876,0.003107,0.000774,0.355527,2,True,7
5,LightGBMXT_BAG_L2,-101265.523062,-200564.490438,mean_absolute_error,0.910492,0.492738,28.073003,0.063939,0.047024,5.000297,2,True,8
6,LightGBMXT_BAG_L1,-115920.27375,-203786.166531,mean_absolute_error,0.421891,0.135854,7.041934,0.421891,0.135854,7.041934,1,True,3
7,CatBoost_BAG_L1,-124570.220734,-208093.928031,mean_absolute_error,0.046873,0.058872,8.56364,0.046873,0.058872,8.56364,1,True,6
8,KNeighborsDist_BAG_L1,-481067.166125,-514219.495125,mean_absolute_error,0.004597,0.00392,0.016177,0.004597,0.00392,0.016177,1,True,2
9,KNeighborsUnif_BAG_L1,-481932.6495,-513206.445125,mean_absolute_error,0.004828,0.003998,0.011178,0.004828,0.003998,0.011178,1,True,1


In [13]:
predictions = predictor.predict(X_valid)

In [18]:
csv = pd.DataFrame({'Actual Price': X_valid['Price'], 'Predicted Price': predictions})
csv.to_csv('predictions.csv', index=False)