In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import confusion_matrix,mean_squared_error,r2_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_columns', None)
dataset = pd.read_csv('./Melbourne_housing_FULL.csv')
dataset

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,2.0,1.0,1.0,126.0,,,Yarra City Council,-37.80140,144.99580,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra City Council,-37.79960,144.99840,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.80790,144.99340,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,3.0,2.0,1.0,0.0,,,Yarra City Council,-37.81140,145.01160,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.80930,144.99440,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34852,Yarraville,13 Burns St,4,h,1480000.0,PI,Jas,24/02/2018,6.3,3013.0,4.0,1.0,3.0,593.0,,,Maribyrnong City Council,-37.81053,144.88467,Western Metropolitan,6543.0
34853,Yarraville,29A Murray St,2,h,888000.0,SP,Sweeney,24/02/2018,6.3,3013.0,2.0,2.0,1.0,98.0,104.0,2018.0,Maribyrnong City Council,-37.81551,144.88826,Western Metropolitan,6543.0
34854,Yarraville,147A Severn St,2,t,705000.0,S,Jas,24/02/2018,6.3,3013.0,2.0,1.0,2.0,220.0,120.0,2000.0,Maribyrnong City Council,-37.82286,144.87856,Western Metropolitan,6543.0
34855,Yarraville,12/37 Stephen St,3,h,1140000.0,SP,hockingstuart,24/02/2018,6.3,3013.0,,,,,,,Maribyrnong City Council,,,Western Metropolitan,6543.0


# Data Preprocessing

### Step 1: Feature Selection

In [4]:
cols_dropped = ['Address', 'SellerG', 'Date', 'Postcode', 'YearBuilt', 'Lattitude', 'Longtitude', 'CouncilArea', 'Regionname']
df = dataset.drop(columns=cols_dropped, axis=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34857 entries, 0 to 34856
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         34857 non-null  object 
 1   Rooms          34857 non-null  int64  
 2   Type           34857 non-null  object 
 3   Price          27247 non-null  float64
 4   Method         34857 non-null  object 
 5   Distance       34856 non-null  float64
 6   Bedroom2       26640 non-null  float64
 7   Bathroom       26631 non-null  float64
 8   Car            26129 non-null  float64
 9   Landsize       23047 non-null  float64
 10  BuildingArea   13742 non-null  float64
 11  Propertycount  34854 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 3.2+ MB


### Step 2: Handling NaN values

In [6]:
df = df.dropna(subset=['Price'], axis=0)  # axis=0 là chọn rows ấy!!!
df.shape

(27247, 12)

In [7]:
for i in df.columns:
    if df[i].dtype != 'object':
        print('{:<15}: {}'.format(i, df[i].isna().sum()))

Rooms          : 0
Price          : 0
Distance       : 1
Bedroom2       : 6441
Bathroom       : 6447
Car            : 6824
Landsize       : 9265
BuildingArea   : 16591
Propertycount  : 3


In [8]:
df = df.dropna(subset=['Distance', 'Propertycount'], axis=0)
df.shape

(27244, 12)

In [9]:
cols_to_fill_one = ['Bedroom2', 'Bathroom']
df[cols_to_fill_one] = df[cols_to_fill_one].fillna(1)

In [10]:
df['Car'] = df.Car.fillna(0)

In [11]:
df['Landsize'] = df['Landsize'].fillna(df.Landsize.mean())
df['BuildingArea'] = df['BuildingArea'].fillna(df.BuildingArea.mean())

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 27244 entries, 1 to 34856
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         27244 non-null  object 
 1   Rooms          27244 non-null  int64  
 2   Type           27244 non-null  object 
 3   Price          27244 non-null  float64
 4   Method         27244 non-null  object 
 5   Distance       27244 non-null  float64
 6   Bedroom2       27244 non-null  float64
 7   Bathroom       27244 non-null  float64
 8   Car            27244 non-null  float64
 9   Landsize       27244 non-null  float64
 10  BuildingArea   27244 non-null  float64
 11  Propertycount  27244 non-null  float64
dtypes: float64(8), int64(1), object(3)
memory usage: 2.7+ MB


### Step 3: Scaling

In [13]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount
count,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0
mean,2.992365,1050210.0,11.280634,2.562693,1.451732,1.285898,593.488933,156.834586,7566.781089
std,0.95481,641492.3,6.78758,1.205128,0.662012,1.137214,3052.470303,280.93836,4492.382418
min,1.0,85000.0,0.0,0.0,0.0,0.0,0.0,0.0,83.0
25%,2.0,635000.0,6.4,1.0,1.0,0.0,351.0,156.0,4294.0
50%,3.0,870000.0,10.5,3.0,1.0,1.0,593.488933,156.834586,6567.0
75%,4.0,1295000.0,14.0,3.0,2.0,2.0,593.488933,156.834586,10412.0
max,16.0,11200000.0,48.1,20.0,9.0,18.0,433014.0,44515.0,21650.0


In [14]:
scaler = MinMaxScaler()

cols_to_scale = []
for i in df.columns:
    if df[i].dtype != 'object':
        cols_to_scale.append(i)
   
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])
cols_to_scale

['Rooms',
 'Price',
 'Distance',
 'Bedroom2',
 'Bathroom',
 'Car',
 'Landsize',
 'BuildingArea',
 'Propertycount']

In [15]:
df.describe()

Unnamed: 0,Rooms,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,Propertycount
count,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0,27244.0
mean,0.132824,0.086839,0.234525,0.128135,0.161304,0.071439,0.001371,0.003523,0.347001
std,0.063654,0.057714,0.141114,0.060256,0.073557,0.063179,0.007049,0.006311,0.208299
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.066667,0.049483,0.133056,0.05,0.111111,0.0,0.000811,0.003504,0.195252
50%,0.133333,0.070625,0.218295,0.15,0.111111,0.055556,0.001371,0.003523,0.300645
75%,0.2,0.108862,0.29106,0.15,0.222222,0.111111,0.001371,0.003523,0.478926
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Step 4: One Hot Encoding

In [16]:
for i in df.columns:
    if df[i].dtype == 'object':
        print('{:<15}: {}'.format(i, df[i].nunique()))

Suburb         : 344
Type           : 3
Method         : 5


In [17]:
12 - 3 + 343 + 2 + 4

358

In [18]:
df_ohe = pd.get_dummies(df, drop_first=True, dtype='int16')
df_ohe.shape

(27244, 358)

### Step 5: Splitting Data

In [19]:
X = df_ohe.drop('Price', axis=1).values
Y = df_ohe['Price']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=2)
X_train.shape, X_test.shape

((21795, 357), (5449, 357))

# Linear

In [20]:
LR = LinearRegression()
LR.fit(X_train, Y_train)

In [21]:
LR.coef_.shape

(357,)

In [22]:
LR.intercept_

258417978.28590268

In [23]:
Yp_test = LR.predict(X_test)
Yp_train = LR.predict(X_train)

In [24]:
Yp_test.shape

(5449,)

In [25]:
print(mean_squared_error(Y_train, Yp_train))
print(mean_squared_error(Y_test, Yp_test))

0.001137061882154781
1.3599584694770926e+19


# Lasso (L1)

In [26]:
lasso = Lasso(alpha=50)
lasso.fit(X_train, Y_train)

In [27]:
Yp_train_L1 = lasso.predict(X_train)
Yp_test_L1 = lasso.predict(X_test)

In [28]:
print(mean_squared_error(Y_train, Yp_train_L1))
print(mean_squared_error(Y_test, Yp_test_L1))

0.003340963014536453
0.0032901503307811277


# Ridge (L2)

In [29]:
ridge = Ridge(alpha=50)
ridge.fit(X_train, Y_train)

In [30]:
Yp_train_L2 = ridge.predict(X_train)
Yp_test_L2 = ridge.predict(X_test)

In [31]:
print(mean_squared_error(Y_train, Yp_train_L2))
print(mean_squared_error(Y_test, Yp_test_L2))

0.0013287077210920515
0.001278200947125723


---