# House price predict

### Import pyspark module

In [1]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [2]:
# import SparkSession, 有 pip3 install pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql.functions import *
from pyspark.sql.types import StringType,DoubleType,IntegerType

In [3]:
import pyspark.pandas as ps



In [4]:
# spark ml module
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

### Set spark session

In [5]:
# Local mode
spark = SparkSession\
        .builder\
        .appName("price_predict")\
        .getOrCreate()

22/07/13 00:17:04 WARN Utils: Your hostname, ubuntu2204-desktop resolves to a loopback address: 127.0.1.1; using 192.168.161.130 instead (on interface ens33)
22/07/13 00:17:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/13 00:17:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Check spark app name
spark.sparkContext.appName

'price_predict'

In [7]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [8]:
ps.set_option("compute.default_index_type", "distributed")

In [None]:
# Spark version
spark.version

In [9]:
spark

## Feature Engineering

### Load data

In [10]:
# load data from pandas-output
df = spark.read.csv('data/all_taipei_A.csv',inferSchema=True, header=True)
df.createOrReplaceTempView("dfTable")

[Stage 1:>                                                          (0 + 4) / 4]

22/07/13 00:17:20 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [None]:
df.show(5)

### Inspect data

In [None]:
df.columns

In [None]:
df.count(),len(df.columns)

In [None]:
# print dataframe schema
df.printSchema()

In [None]:
df.describe().show()

In [None]:
# info about dataframe
df.summary().show()

### Select features

In [42]:
# select columns
# df = df.filter(coalesce('鄉鎮市區', '交易標的', '建物移轉總面積平方公尺','主建物面積', '建物現況格局-房', '車位總價元', '主要建材', '總價元').isNotNull())
features_df = df.select('鄉鎮市區', '交易標的', '建物移轉總面積平方公尺','主建物面積', '建物現況格局-房', '車位總價元', '總價元') # without '主要建材'
features_df.count(), len(features_df.columns)

(245086, 7)

In [43]:
# select target without garage & land
features_df = features_df.filter(~col('交易標的').isin(['車位', '土地']))
features_df.count(), len(features_df.columns)

(208671, 7)

In [27]:
features_df.show(10)

+--------+--------------------+----------------------+----------+---------------+----------+--------+
|鄉鎮市區|            交易標的|建物移轉總面積平方公尺|主建物面積|建物現況格局-房|車位總價元|  總價元|
+--------+--------------------+----------------------+----------+---------------+----------+--------+
|  文山區|     房地(土地+建物)|                 35.56|     21.97|              1|         0| 5750000|
|  文山區|     房地(土地+建物)|                115.48|     62.32|              2|         0|22600000|
|  文山區|     房地(土地+建物)|                 41.01|     26.01|              1|         0| 6000000|
|  文山區|房地(土地+建物)+車位|                219.08|    102.64|              3|         0|29200000|
|  文山區|     房地(土地+建物)|                 83.37|     57.07|              3|         0| 8000000|
|  文山區|房地(土地+建物)+車位|                228.23|    122.69|              4|         0|35000000|
|  萬華區|房地(土地+建物)+車位|                101.03|     60.54|              2|         0|12700000|
|  萬華區|     房地(土地+建物)|                 80.31|     55.51|              2|         0|12200

In [None]:
# print dataframe schema
features_df.printSchema()

In [None]:
features_df.describe().show()

In [None]:
# info about dataframe
features_df.summary().show()

### Feature engineering

In [44]:
def dummies_encoding(df, cols_list):
    for i in cols_list:
        categ = df.select(i).distinct().rdd.flatMap(lambda x:x).collect()
        exprs = [fn.when(fn.col(i) == cat,1).otherwise(0)\
                .alias(str(cat)) for cat in categ]
        df = df.select(exprs + df.columns)
    return df    

In [45]:
cols_list = ['鄉鎮市區', '交易標的'] # without '主要建材'
features_df = dummies_encoding(features_df, cols_list)

In [46]:
# delete a column
new_df = features_df.drop('鄉鎮市區', '交易標的')

In [31]:
new_df.show(5)

+---------------+--------------------+----+------+------+------+------+------+------+------+------+------+------+------+------+----------------------+----------+---------------+----------+--------+
|房地(土地+建物)|房地(土地+建物)+車位|建物|南港區|北投區|內湖區|萬華區|文山區|松山區|信義區|大同區|中正區|中山區|大安區|士林區|建物移轉總面積平方公尺|主建物面積|建物現況格局-房|車位總價元|  總價元|
+---------------+--------------------+----+------+------+------+------+------+------+------+------+------+------+------+------+----------------------+----------+---------------+----------+--------+
|              1|                   0|   0|     0|     0|     0|     0|     1|     0|     0|     0|     0|     0|     0|     0|                 35.56|     21.97|              1|         0| 5750000|
|              1|                   0|   0|     0|     0|     0|     0|     1|     0|     0|     0|     0|     0|     0|     0|                115.48|     62.32|              2|         0|22600000|
|              1|                   0|   0|     0|     0|     0|     0|     1|     0|     0| 

In [47]:
feature_cols = new_df.columns[:-1]
assembler = VectorAssembler(inputCols = feature_cols, outputCol = 'features')
new_df = assembler.transform(new_df)

In [None]:
# validate the presence of dense vectors 
new_df.printSchema()

In [48]:
# view the details of dense vector
new_df.select('features').show(5,False)

+-----------------------------------------------+
|features                                       |
+-----------------------------------------------+
|(19,[0,7,15,16,17],[1.0,1.0,35.56,21.97,1.0])  |
|(19,[0,7,15,16,17],[1.0,1.0,115.48,62.32,2.0]) |
|(19,[0,7,15,16,17],[1.0,1.0,41.01,26.01,1.0])  |
|(19,[1,7,15,16,17],[1.0,1.0,219.08,102.64,3.0])|
|(19,[0,7,15,16,17],[1.0,1.0,83.37,57.07,3.0])  |
+-----------------------------------------------+
only showing top 5 rows



In [50]:
# only select the features and label column
model_df = new_df.select(['features', '總價元'])

In [51]:
model_df = model_df.withColumnRenamed('總價元', 'price')

In [52]:
# Reading for machine learning
model_df.show(10,False)

+-----------------------------------------------+--------+
|features                                       |price   |
+-----------------------------------------------+--------+
|(19,[0,7,15,16,17],[1.0,1.0,35.56,21.97,1.0])  |5750000 |
|(19,[0,7,15,16,17],[1.0,1.0,115.48,62.32,2.0]) |22600000|
|(19,[0,7,15,16,17],[1.0,1.0,41.01,26.01,1.0])  |6000000 |
|(19,[1,7,15,16,17],[1.0,1.0,219.08,102.64,3.0])|29200000|
|(19,[0,7,15,16,17],[1.0,1.0,83.37,57.07,3.0])  |8000000 |
|(19,[1,7,15,16,17],[1.0,1.0,228.23,122.69,4.0])|35000000|
|(19,[1,6,15,16,17],[1.0,1.0,101.03,60.54,2.0]) |12700000|
|(19,[0,6,15,16,17],[1.0,1.0,80.31,55.51,2.0])  |12200000|
|(19,[0,6,15,16,17],[1.0,1.0,36.4,36.4,1.0])    |7400000 |
|(19,[0,5,15,16,17],[1.0,1.0,136.4,126.5,3.0])  |17300000|
+-----------------------------------------------+--------+
only showing top 10 rows



In [53]:
# size of model df
model_df.count(), len(model_df.columns)

(208671, 2)

### Split Data - Train & Test sets

In [54]:
# use Logistic Regression to train on the training set
train_df, test_df = model_df.randomSplit([0.70, 0.30], seed=42)

### Build Linear Regression Model 

In [None]:
LinearRegression?

In [55]:
reg = 0.05

In [56]:
# Build Linear Regression model 
lin_Reg=LinearRegression(labelCol='price', regParam=reg)

In [57]:
# fit the linear regression model on training data set 
lr_model=lin_Reg.fit(train_df)

                                                                                

22/07/13 00:27:43 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [58]:
lr_model.intercept

388886.5596443912

In [59]:
lr_model.coefficients

DenseVector([-3130821.3894, 4025474.7752, -26017048.4941, -4070424.8977, -6546996.0003, -5537833.8555, -3319720.5441, -5810060.9278, 4437453.0005, 6603685.5624, -3124830.5487, 5394760.0699, 3789235.4531, 10209082.6741, -1054948.6585, 31790.8317, 298639.3859, -2219495.2255, 3.8132])

In [60]:
training_predictions=lr_model.evaluate(train_df)

                                                                                

In [61]:
training_predictions.meanSquaredError

1411242515275149.2

In [62]:
training_predictions.r2

0.8559234506875827

### Evaluate Model

In [63]:
# make predictions on test data 
test_results = lr_model.evaluate(test_df)

                                                                                

In [64]:
# view the residual errors based on predictions 
test_results.residuals.show(10,False)

+-------------------+
|residuals          |
+-------------------+
|1.726872996962302E7|
|8804123.710281715  |
|-2615546.7342291363|
|9835102.813935911  |
|5596266.719363058  |
|1725156.0844988679 |
|6153854.165668456  |
|-2468781.7764608506|
|8562956.568887698  |
|8538720.406577948  |
+-------------------+
only showing top 10 rows



In [65]:
# coefficient of determination value for model
test_results.r2

0.7492024451099824

In [66]:
# RMSE
test_results.rootMeanSquaredError

46967662.55893333

In [67]:
# MSE
test_results.meanSquaredError

2205961326249828.2