# House price predict

### Import pyspark module

In [None]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [3]:
# import SparkSession, 有 pip3 install pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql.types import StringType,DoubleType,IntegerType

In [1]:
import pyspark.pandas as ps

In [45]:
# spark ml module
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler, StringIndexer

### Set spark session

In [4]:
# Local mode
spark = SparkSession\
        .builder\
        .appName("price_predict")\
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/11 19:30:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Check spark app name
spark.sparkContext.appName

'price_predict'

In [6]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [7]:
ps.set_option("compute.default_index_type", "distributed")

In [8]:
# Spark version
spark.version

'3.3.0'

In [9]:
spark

## Feature Engineering

### Data loading & processing

In [49]:
# load data from pandas-output
df = spark.read.csv('dataset/all_A_台北市_A.csv',inferSchema=True, header=True)
df.show()

+---+--------+--------------------+-------------------------------------+----------------------+----------------+------------------+------------------+----------+---------------+------------------+--------+--------------------------+--------------+----------------+------------+----------------------+---------------+---------------+---------------+-----------------+------------+--------+--------------+--------+------------------------+----------+----------------------------------+-------------------+----------+------------+--------+----+--------+
|_c0|鄉鎮市區|            交易標的|                     土地位置建物門牌|土地移轉總面積平方公尺|都市土地使用分區|非都市土地使用分區|非都市土地使用編定|交易年月日|     交易筆棟數|          移轉層次|總樓層數|                  建物型態|      主要用途|        主要建材|建築完成年月|建物移轉總面積平方公尺|建物現況格局-房|建物現況格局-廳|建物現況格局-衛|建物現況格局-隔間|有無管理組織|  總價元|單價元平方公尺|車位類別|車位移轉總面積(平方公尺)|車位總價元|                              備註|               編號|主建物面積|附屬建物面積|陽台面積|電梯|移轉編號|
+---+--------+--------------------+-------------------------------------+-------------

In [50]:
df.count(),len(df.columns)

(245086, 34)

In [51]:
# print dataframe schema
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- 鄉鎮市區: string (nullable = true)
 |-- 交易標的: string (nullable = true)
 |-- 土地位置建物門牌: string (nullable = true)
 |-- 土地移轉總面積平方公尺: double (nullable = true)
 |-- 都市土地使用分區: string (nullable = true)
 |-- 非都市土地使用分區: string (nullable = true)
 |-- 非都市土地使用編定: string (nullable = true)
 |-- 交易年月日: integer (nullable = true)
 |-- 交易筆棟數: string (nullable = true)
 |-- 移轉層次: string (nullable = true)
 |-- 總樓層數: string (nullable = true)
 |-- 建物型態: string (nullable = true)
 |-- 主要用途: string (nullable = true)
 |-- 主要建材: string (nullable = true)
 |-- 建築完成年月: string (nullable = true)
 |-- 建物移轉總面積平方公尺: double (nullable = true)
 |-- 建物現況格局-房: integer (nullable = true)
 |-- 建物現況格局-廳: integer (nullable = true)
 |-- 建物現況格局-衛: integer (nullable = true)
 |-- 建物現況格局-隔間: string (nullable = true)
 |-- 有無管理組織: string (nullable = true)
 |-- 總價元: long (nullable = true)
 |-- 單價元平方公尺: integer (nullable = true)
 |-- 車位類別: string (nullable = true)
 |-- 車位移轉總面積(平方公尺): double (nullabl

In [34]:
df.describe().show()



+-------+-----------------+--------+--------+-------------------------------+----------------------+-----------------------+------------------+------------------+------------------+---------------+----------------+--------+--------------------------+--------------------------+--------------+------------------+----------------------+-----------------+-----------------+------------------+-----------------+------------+--------------------+------------------+--------+------------------------+----------------+------------------------+--------------------+------------------+------------------+------------------+-----+------------------+
|summary|              _c0|鄉鎮市區|交易標的|               土地位置建物門牌|土地移轉總面積平方公尺|       都市土地使用分區|非都市土地使用分區|非都市土地使用編定|        交易年月日|     交易筆棟數|        移轉層次|總樓層數|                  建物型態|                  主要用途|      主要建材|      建築完成年月|建物移轉總面積平方公尺|  建物現況格局-房|  建物現況格局-廳|   建物現況格局-衛|建物現況格局-隔間|有無管理組織|              總價元|    單價元平方公尺|車位類別|車位移轉總面積(平方公尺)|      車位總價元|              

                                                                                

In [35]:
# info about dataframe
df.summary().show()



+-------+-----------------+--------+--------+-------------------------------+----------------------+-----------------------+------------------+------------------+------------------+---------------+----------------+--------+--------------------------+--------------------------+--------------+------------------+----------------------+-----------------+-----------------+------------------+-----------------+------------+--------------------+------------------+--------+------------------------+----------------+------------------------+--------------------+------------------+------------------+------------------+-----+------------------+
|summary|              _c0|鄉鎮市區|交易標的|               土地位置建物門牌|土地移轉總面積平方公尺|       都市土地使用分區|非都市土地使用分區|非都市土地使用編定|        交易年月日|     交易筆棟數|        移轉層次|總樓層數|                  建物型態|                  主要用途|      主要建材|      建築完成年月|建物移轉總面積平方公尺|  建物現況格局-房|  建物現況格局-廳|   建物現況格局-衛|建物現況格局-隔間|有無管理組織|              總價元|    單價元平方公尺|車位類別|車位移轉總面積(平方公尺)|      車位總價元|              

                                                                                

### Select features

In [52]:
# select columns
features_df = df.select('鄉鎮市區', '交易標的', '建物移轉總面積平方公尺','主建物面積', '建物現況格局-房', '車位總價元', '主要建材', '總價元').show(5)

+--------+--------------------+----------------------+----------+---------------+----------+----------------+--------+
|鄉鎮市區|            交易標的|建物移轉總面積平方公尺|主建物面積|建物現況格局-房|車位總價元|        主要建材|  總價元|
+--------+--------------------+----------------------+----------+---------------+----------+----------------+--------+
|  文山區|     房地(土地+建物)|                 35.56|     21.97|              1|         0|    鋼筋混凝土造| 5750000|
|  文山區|     房地(土地+建物)|                115.48|     62.32|              2|         0|鋼骨鋼筋混凝土造|22600000|
|  文山區|     房地(土地+建物)|                 41.01|     26.01|              1|         0|    鋼筋混凝土造| 6000000|
|  文山區|房地(土地+建物)+車位|                219.08|    102.64|              3|         0|    鋼筋混凝土造|29200000|
|  文山區|     房地(土地+建物)|                 83.37|     57.07|              3|         0|    鋼筋混凝土造| 8000000|
+--------+--------------------+----------------------+----------+---------------+----------+----------------+--------+
only showing top 5 rows



In [46]:
feature_cols = df.columns[:-1]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
features_df = assembler.transform(df)

IllegalArgumentException: Data type string of column 鄉鎮市區 is not supported.
Data type string of column 交易標的 is not supported.
Data type string of column 土地位置建物門牌 is not supported.
Data type string of column 都市土地使用分區 is not supported.
Data type string of column 非都市土地使用分區 is not supported.
Data type string of column 非都市土地使用編定 is not supported.
Data type string of column 交易筆棟數 is not supported.
Data type string of column 移轉層次 is not supported.
Data type string of column 總樓層數 is not supported.
Data type string of column 建物型態 is not supported.
Data type string of column 主要用途 is not supported.
Data type string of column 主要建材 is not supported.
Data type string of column 建築完成年月 is not supported.
Data type string of column 建物現況格局-隔間 is not supported.
Data type string of column 有無管理組織 is not supported.
Data type string of column 車位類別 is not supported.
Data type string of column 備註 is not supported.
Data type string of column 編號 is not supported.
Data type string of column 電梯 is not supported.

In [None]:
# validate the presence of dense vectors 
features_df.printSchema()

In [None]:
# view the details of dense vector
features_df.select('features').show(5,False)

In [None]:
# only select the features and label column
model_df = features_df.select(['features', '總價元'])

In [None]:
# Reading for machine learning
model_df.show(10,False)

In [None]:
# size of model df
model_df.count(), len(model_df.columns)

### Split Data - Train & Test sets

In [None]:
# use Logistic Regression to train on the training set
train_df, test_df = model_df.randomSplit([0.80, 0.20], seed=42)

### Build Linear Regression Model 

In [None]:
LinearRegression?

In [None]:
reg = 0.05

In [None]:
# Build Linear Regression model 
lin_Reg=LinearRegression(labelCol='總價元', regParam=reg)

In [None]:
# fit the linear regression model on training data set 
lr_model=lin_Reg.fit(train_df)

In [None]:
lr_model.intercept

In [None]:
lr_model.coefficients

In [None]:
training_predictions=lr_model.evaluate(train_df)

In [None]:
training_predictions.meanSquaredError

In [None]:
training_predictions.r2

### Evaluate Model

In [None]:
# make predictions on test data 
test_results = lr_model.evaluate(test_df)

In [None]:
# view the residual errors based on predictions 
test_results.residuals.show(10,False)

In [None]:
# coefficient of determination value for model
test_results.r2

In [None]:
# RMSE
test_results.rootMeanSquaredError

In [None]:
# MSE
test_results.meanSquaredError

### Load data

In [None]:
# Load csv Dataset 
psdf_a = ps.read_csv('data/all_A_taipei_A.csv')
psdf_b = ps.read_csv('data/all_A_taipei_B.csv')
sdf_a = psdf_a.to_spark
sdf_b = psdf_b.to_spark

### Data processing

In [None]:
# combine two dataframe
psdf_concat = ps.concat([psdf_a, psdf_b], join="inner")
psdf_concat.index += 1

In [None]:
# select specific columns use fancy index
psdf_fi = psdf_concat[["鄉鎮市區", "交易標的", "土地移轉總面積平方公尺", "交易年月日", "移轉層次", "建物型態", "建物現況格局-房", "建物現況格局-廳", "建物現況格局-衛", "總價元"]]
psdf_fi.show()

### Delete useless columns (garage & land)

In [None]:
# delete garage and land rows

psdf_fi.filter(psdf_fi["交易標的"] == "車位").show(false)


In [None]:
psdf_fi.drop(useless_columns)
# psdf_main = psdf_fi.drop(useless_columns)
# psdf_main

In [None]:
df_main["移轉層次"] == None
df_main = df_main[df_main['移轉層次'].notna()]
df_main

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# load 
# df = pd.read_csv('./dataset/housing.csv', header = None, delim_whitespace=True)
df = test_df
# print(df)

data_y = df[10]
data_x = df.drop([10], axis = 1)
# print(data_y)
# print(data_x)

# split
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(data_x, data_y, test_size=0.1, random_state=1)

# transform
scaler = preprocessing.StandardScaler().fit(data_X_train)
data_X_train = scaler.transform(data_X_train)

# linear regression
model = linear_model.LinearRegression()
model.fit(data_X_train, data_y_train)
          
# make predictions
data_X_test = scaler.transform(data_X_test)
data_y_pred = model.predict(data_X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error
print("Mean squared error: {}".format((mean_squared_error(data_y_test, data_y_pred))))
# Explained variance score: 1 is perfect prediction
print('R2 score: {}'.format(r2_score(data_y_test, data_y_pred)))     