### Importing Libraries

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings 
warnings.fiterwarings('ignore')

### Dataset

In [5]:
# Read the housing sales CSV from the working directory and preview it.
csv_path = 'MELBOURNE_HOUSE_PRICES_LESS.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Postcode,Regionname,Propertycount,Distance,CouncilArea
0,Abbotsford,49 Lithgow St,3,h,1490000.0,S,Jellis,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
1,Abbotsford,59A Turner St,3,h,1220000.0,S,Marshall,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
2,Abbotsford,119B Yarra St,3,h,1420000.0,S,Nelson,1/04/2017,3067,Northern Metropolitan,4019,3.0,Yarra City Council
3,Aberfeldie,68 Vida St,3,h,1515000.0,S,Barry,1/04/2017,3040,Western Metropolitan,1543,7.5,Moonee Valley City Council
4,Airport West,92 Clydesdale Rd,2,h,670000.0,S,Nelson,1/04/2017,3042,Western Metropolitan,3464,10.4,Moonee Valley City Council


In [7]:
# Number of distinct values per column, to see how high-cardinality each
# candidate feature is.
df.nunique()

Suburb             380
Address          57754
Rooms               14
Type                 3
Price             3417
Method               9
SellerG            476
Date               112
Postcode           225
Regionname           8
Propertycount      368
Distance           180
CouncilArea         34
dtype: int64

### Removing unnecessary columns

In [10]:
# Columns kept for modelling: the candidate predictors plus the target
# ('Price', listed last). Everything else in the raw frame is discarded.
cols_to_use = ['Suburb', 'Rooms', 'Type', 'Method', 'SellerG', 'Regionname', 'Propertycount', 
               'Distance', 'CouncilArea', 'Price']
# .copy() makes the subset an independent frame, so later writes to it do not
# raise SettingWithCopyWarning or depend on view-vs-copy semantics.
df = df[cols_to_use].copy()

In [11]:
# Preview the first rows to confirm only the selected columns remain.
df.head()

Unnamed: 0,Suburb,Rooms,Type,Method,SellerG,Regionname,Propertycount,Distance,CouncilArea,Price
0,Abbotsford,3,h,S,Jellis,Northern Metropolitan,4019,3.0,Yarra City Council,1490000.0
1,Abbotsford,3,h,S,Marshall,Northern Metropolitan,4019,3.0,Yarra City Council,1220000.0
2,Abbotsford,3,h,S,Nelson,Northern Metropolitan,4019,3.0,Yarra City Council,1420000.0
3,Aberfeldie,3,h,S,Barry,Western Metropolitan,1543,7.5,Moonee Valley City Council,1515000.0
4,Airport West,2,h,S,Nelson,Western Metropolitan,3464,10.4,Moonee Valley City Council,670000.0


In [13]:
# Count the missing values in each column.
df.isna().sum()

Suburb               0
Rooms                0
Type                 0
Method               0
SellerG              0
Regionname           0
Propertycount        0
Distance             0
CouncilArea          0
Price            14590
dtype: int64

### Filling NaN values, if any

In [15]:
# Numeric columns whose missing entries are replaced by 0.
cols_to_fill_zero = ['Propertycount', 'Distance']

# Fill on the column subset first, then write the result back into the frame.
filled = df[cols_to_fill_zero].fillna(0)
df[cols_to_fill_zero] = filled

In [16]:
# Drop every row that still has a missing value in any column. Rebinding `df`
# to the returned frame (instead of inplace=True) avoids mutating a frame
# other references may share.
df = df.dropna()

In [18]:
# One-hot encode the non-numeric columns (Suburb, CouncilArea, ...) into
# indicator columns. drop_first=True omits the first level of each encoded
# column, so k levels become k-1 dummies.
df = pd.get_dummies(df, drop_first=True)

In [19]:
# Preview the encoded frame: numeric columns plus one indicator column per
# retained category level.
df.head()

Unnamed: 0,Rooms,Propertycount,Distance,Price,Suburb_Aberfeldie,Suburb_Airport West,Suburb_Albanvale,Suburb_Albert Park,Suburb_Albion,Suburb_Alphington,...,CouncilArea_Moreland City Council,CouncilArea_Murrindindi Shire Council,CouncilArea_Nillumbik Shire Council,CouncilArea_Port Phillip City Council,CouncilArea_Stonnington City Council,CouncilArea_Whitehorse City Council,CouncilArea_Whittlesea City Council,CouncilArea_Wyndham City Council,CouncilArea_Yarra City Council,CouncilArea_Yarra Ranges Shire Council
0,3,4019,3.0,1490000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,3,4019,3.0,1220000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,3,4019,3.0,1420000.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,3,1543,7.5,1515000.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,3464,10.4,670000.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Target vector: the sale price.
y = df['Price']
# Feature matrix: every remaining column.
X = df.drop(columns=['Price'])

### Splitting dataset into training and testing data

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

### Linear Regression

In [22]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares with no regularization, fitted on the
# training split. The trailing semicolon keeps the cell output-free.
reg = LinearRegression()
reg.fit(train_X, train_y);

In [23]:
# R^2 of the unregularized model on the held-out test split.
reg.score(test_X, test_y)

-3650796389.216651

In [24]:
# R^2 of the unregularized model on the training split, for comparison with
# the test score.
reg.score(train_X, train_y)

0.660013083754533

### Lasso Regression

In [26]:
from sklearn import linear_model

# L1-regularized linear regression. alpha sets the penalty strength;
# max_iter and tol bound the solver's effort.
lasso_reg = linear_model.Lasso(
    alpha=100,
    max_iter=100,
    tol=0.1,
)
lasso_reg.fit(train_X, train_y)



Lasso(alpha=100, copy_X=True, fit_intercept=True, max_iter=100,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.1, warm_start=False)

In [27]:
# R^2 of the Lasso model on the held-out test split.
lasso_reg.score(test_X, test_y)

0.6598838996509366

In [28]:
# R^2 of the Lasso model on the training split.
lasso_reg.score(train_X, train_y)

0.6525769940765858

### Ridge Regression

In [29]:
from sklearn.linear_model import Ridge

# L2-regularized linear regression with penalty strength alpha=50, fitted on
# the training split.
ridge_reg = Ridge(
    alpha=50,
    max_iter=100,
    tol=0.1,
)
ridge_reg.fit(train_X, train_y)

Ridge(alpha=50, copy_X=True, fit_intercept=True, max_iter=100,
   normalize=False, random_state=None, solver='auto', tol=0.1)

In [30]:
# R^2 of the Ridge model on the held-out test split.
ridge_reg.score(test_X, test_y)

0.6573232856316463

In [31]:
# R^2 of the Ridge model on the training split.
ridge_reg.score(train_X, train_y)

0.6471379169766673