In [1]:
import pandas as pd

# Load the Melbourne housing data from a CSV located relative to the current
# working directory, then preview the first rows to eyeball columns and values.
df = pd.read_csv('Melbourne_housing_FULL17.csv')
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [2]:
# Number of distinct values per column; used to gauge how many levels each
# categorical column would produce if it were one-hot encoded.
df.nunique()

Suburb             351
Address          34009
Rooms               12
Type                 3
Price             2871
Method               9
SellerG            388
Date                78
Distance           215
Postcode           211
Bedroom2            15
Bathroom            11
Car                 15
Landsize          1684
BuildingArea       740
YearBuilt          160
CouncilArea         33
Lattitude        13402
Longtitude       14524
Regionname           8
Propertycount      342
dtype: int64

In [3]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

(34857, 21)

In [4]:
# Keep only the columns used for modelling. Everything not listed here
# (Address, Date, Postcode, YearBuilt, Lattitude, Longtitude) is discarded.
columns_kept = [
    'Suburb',
    'Rooms',
    'Type',
    'Price',
    'Method',
    'SellerG',
    'Distance',
    'Bedroom2',
    'Bathroom',
    'Car',
    'Landsize',
    'BuildingArea',
    'CouncilArea',
    'Regionname',
    'Propertycount',
]
df = df[columns_kept]
df.head()

Unnamed: 0,Suburb,Rooms,Type,Price,Method,SellerG,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,CouncilArea,Regionname,Propertycount
0,Abbotsford,2,h,,SS,Jellis,2.5,2.0,1.0,1.0,126.0,,Yarra City Council,Northern Metropolitan,4019.0
1,Abbotsford,2,h,1480000.0,S,Biggin,2.5,2.0,1.0,1.0,202.0,,Yarra City Council,Northern Metropolitan,4019.0
2,Abbotsford,2,h,1035000.0,S,Biggin,2.5,2.0,1.0,0.0,156.0,79.0,Yarra City Council,Northern Metropolitan,4019.0
3,Abbotsford,3,u,,VB,Rounds,2.5,3.0,2.0,1.0,0.0,,Yarra City Council,Northern Metropolitan,4019.0
4,Abbotsford,3,h,1465000.0,SP,Biggin,2.5,3.0,2.0,0.0,134.0,150.0,Yarra City Council,Northern Metropolitan,4019.0


In [5]:
# Sanity check after the column selection: same row count, 15 columns left.
df.shape

(34857, 15)

In [6]:
# Count the missing values in every column; the cells that follow choose a
# fill or drop strategy per column based on these counts.
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        3
dtype: int64

In [7]:
# Columns whose missing entries are replaced by 0.
# NOTE(review): for Bedroom2 / Bathroom / Car this treats "unknown" as "none",
# which affects roughly 8k rows each -- confirm that this is intended.
zero_na_columns = [
    'Distance',
    'Bedroom2',
    'Bathroom',
    'Car',
    'Propertycount',
]
df[zero_na_columns] = df[zero_na_columns].fillna(value=0)

# Re-count the NA's to verify that the listed columns are now complete.
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Distance             0
Bedroom2             0
Bathroom             0
Car                  0
Landsize         11810
BuildingArea     21115
CouncilArea          3
Regionname           3
Propertycount        0
dtype: int64

In [8]:
# Impute the two area columns with their respective column means.
# NOTE: the means are taken over every row, i.e. before the train/test split
# performed later in the notebook, so held-out rows influence the fill value.
df['Landsize'] = df['Landsize'].fillna(value=df['Landsize'].mean())
df['BuildingArea'] = df['BuildingArea'].fillna(value=df['BuildingArea'].mean())

# Re-count the NA's to verify that both columns are now complete.
df.isna().sum()

Suburb              0
Rooms               0
Type                0
Price            7610
Method              0
SellerG             0
Distance            0
Bedroom2            0
Bathroom            0
Car                 0
Landsize            0
BuildingArea        0
CouncilArea         3
Regionname          3
Propertycount       0
dtype: int64

In [9]:
# At this point only three columns still contain NA's:
#   * CouncilArea and Regionname have just 3 missing values each, so the
#     affected rows are simply dropped;
#   * Price is the regression target (`y = df.Price` further down), and a row
#     without a target cannot be used to fit or score a supervised model, so
#     those rows are dropped as well.
# Reassigning instead of using `inplace=True` leaves the previous frame object
# untouched and keeps the cell safe to re-run.
df = df.dropna()
df.isna().sum()

Suburb           0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Distance         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
CouncilArea      0
Regionname       0
Propertycount    0
dtype: int64

In [10]:
# One-hot encode the non-numeric columns; numeric columns pass through as-is.
# drop_first=True leaves out one indicator per encoded column, so the
# indicators of a column are not perfectly collinear with the intercept.
# NOTE: Suburb and SellerG have several hundred distinct values each, so this
# step adds several hundred indicator columns to the frame.
df = pd.get_dummies(data=df, drop_first=True)
df.head()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount,Suburb_Aberfeldie,...,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
1,2,1480000.0,2.5,2.0,1.0,1.0,202.0,160.2564,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
2,2,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
4,3,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
5,3,850000.0,2.5,3.0,2.0,1.0,94.0,160.2564,4019.0,0,...,0,1,0,0,1,0,0,0,0,0
6,4,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,4019.0,0,...,0,1,0,0,1,0,0,0,0,0


In [11]:
from sklearn.model_selection import train_test_split

# Price is the target; every other column is a predictor.
y = df['Price']
X = df.drop(labels='Price', axis='columns')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

In [15]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares with no regularization.
# fit() returns the estimator itself, so construction and fitting can be chained.
linear = LinearRegression().fit(X_train, y_train)

# R^2 on the held-out test set is only 0.1385, which is very low.
linear.score(X_test, y_test)

0.1385368316162079

In [16]:
# R^2 on the training set is 0.6827, far above the 0.1385 measured on the test
# set: the unregularized model is overfitting the training data.
linear.score(X_train, y_train)

0.6827792395792723

In [18]:
# use sklearn Lasso which is L1 regularization
from sklearn.linear_model import Lasso

# Fit an L1-regularized linear model with penalty strength alpha=50.
# NOTE(review): the recorded run printed the tail of a warning right after this
# cell -- presumably a convergence warning caused by the small iteration cap
# (max_iter=100) and loose tolerance (tol=0.1). Confirm, and consider raising
# max_iter; the score quoted below would then need to be re-measured.
lasso = Lasso(alpha=50, max_iter=100, tol=0.1).fit(X_train, y_train)

# The test score is now 0.6637, much better than the 0.1385 obtained without
# regularization.
lasso.score(X_test, y_test)

  positive)


0.6637669697137102

In [19]:
# The training score is 0.6767, approximately equal to the test score (0.6637),
# so the Lasso model no longer shows the train/test gap seen without regularization.
lasso.score(X_train, y_train)

0.6767356948457683

In [20]:
# now let's try L2 regularization, which is sklearn Ridge
from sklearn.linear_model import Ridge

# Fit an L2-regularized linear model with the same penalty strength as the Lasso.
# NOTE(review): max_iter and tol only matter for Ridge's iterative solvers; with
# the default solver choice they may have no effect here -- confirm.
ridge = Ridge(alpha=50, max_iter=100, tol=0.1).fit(X_train, y_train)

# The test score is 0.6670, on par with the Lasso result.
ridge.score(X_test, y_test)

0.6670848945194958

In [22]:
# The training score is 0.6622, slightly below the 0.6670 test score, so Ridge
# shows no sign of overfitting on this split.
ridge.score(X_train, y_train)

0.6622376739684328