**Import Necessary Library**

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,StackingRegressor
from xgboost import XGBRegressor

**Load Dataset**

In [2]:
df=pd.read_csv('/content/melb_data.csv')

In [3]:
df.head() # it show first five rows of dataset

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
df.shape # show no of rows and columns

(13580, 21)

In [5]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Lattitude,Longtitude,Propertycount
count,13580.0,13580.0,13580.0,13580.0,13580.0,13580.0,13518.0,13580.0,7130.0,8205.0,13580.0,13580.0,13580.0
mean,2.937997,1075684.0,10.137776,3105.301915,2.914728,1.534242,1.610075,558.416127,151.96765,1964.684217,-37.809203,144.995216,7454.417378
std,0.955748,639310.7,5.868725,90.676964,0.965921,0.691712,0.962634,3990.669241,541.014538,37.273762,0.07926,0.103916,4378.581772
min,1.0,85000.0,0.0,3000.0,0.0,0.0,0.0,0.0,0.0,1196.0,-38.18255,144.43181,249.0
25%,2.0,650000.0,6.1,3044.0,2.0,1.0,1.0,177.0,93.0,1940.0,-37.856822,144.9296,4380.0
50%,3.0,903000.0,9.2,3084.0,3.0,1.0,2.0,440.0,126.0,1970.0,-37.802355,145.0001,6555.0
75%,3.0,1330000.0,13.0,3148.0,3.0,2.0,2.0,651.0,174.0,1999.0,-37.7564,145.058305,10331.0
max,10.0,9000000.0,48.1,3977.0,20.0,8.0,10.0,433014.0,44515.0,2018.0,-37.40853,145.52635,21650.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [8]:
df.isnull().sum() # checking for null values in dataset

Unnamed: 0,0
Suburb,0
Address,0
Rooms,0
Type,0
Price,0
Method,0
SellerG,0
Date,0
Distance,0
Postcode,0


**Handling Null values**

In [10]:
#drop columns because they have more null values and its not useful for prediction
df.drop(columns=['BuildingArea','YearBuilt','CouncilArea','Address'],inplace=True)

In [12]:
df['Car'].fillna(df['Car'].median(),inplace=True) # handle car null values with median

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Car'].fillna(df['Car'].median(),inplace=True)


In [13]:
df.isnull().sum() # recheking for null values

Unnamed: 0,0
Suburb,0
Rooms,0
Type,0
Price,0
Method,0
SellerG,0
Date,0
Distance,0
Postcode,0
Bedroom2,0


In [14]:
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,144.9941,Northern Metropolitan,4019.0


In [15]:
# Encode Categorical Variables
cat_cols=df.select_dtypes(include='object').columns

# Apply Label Encoding
le=LabelEncoder()
for col in cat_cols:
    df[col]=le.fit_transform(df[col])

df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,Longtitude,Regionname,Propertycount
0,0,2,0,1480000.0,1,23,45,2.5,3067.0,2.0,1.0,1.0,202.0,-37.7996,144.9984,2,4019.0
1,0,2,0,1035000.0,1,23,47,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,144.9934,2,4019.0
2,0,3,0,1465000.0,3,23,48,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,144.9944,2,4019.0
3,0,3,0,850000.0,0,23,48,2.5,3067.0,3.0,2.0,1.0,94.0,-37.7969,144.9969,2,4019.0
4,0,4,0,1600000.0,4,155,49,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,144.9941,2,4019.0


**Train_Test_Split**

In [16]:
#split the features and label
X=df.drop('Price',axis=1)
y=df['Price']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Model Training**

In [18]:
# Train Random Forest Regressor
rf=RandomForestRegressor(n_estimators=100,random_state=42)
rf.fit(X_train,y_train)

In [19]:
# Train XGBRegressor Model
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
xgb.fit(X_train, y_train)

**Ensemble Model (Combining Multiple Model )**

In [20]:
#Ensemble Model using Stacking

stack = StackingRegressor(
    estimators=[('rf',rf),('xgb',xgb)],
    final_estimator=GradientBoostingRegressor(n_estimators=100)
)
stack.fit(X_train, y_train)

**Evaluate Model Performance**

In [22]:
def evaluation(model, name):
    y_pred =model.predict(X_test)
    rmse =np.sqrt(mean_squared_error(y_test, y_pred))
    r2 =r2_score(y_test, y_pred)
    print(f"{name}RMSE:{rmse:.2f},Rsquare:{r2:.3f}")

evaluation(rf,"Random Forest ")
evaluation(xgb,"XGBoost ")
evaluation(stack,"Stacking Regressor ")

Random Forest RMSE:272976.18,Rsquare:0.812
XGBoost RMSE:262685.73,Rsquare:0.826
Stacking Regressor RMSE:269998.69,Rsquare:0.816
