# House price prediction

### Import pyspark module

In [1]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [2]:
# Import SparkSession (requires `pip3 install pyspark`)
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql.types import StringType,DoubleType,IntegerType

In [3]:
import pyspark.pandas as ps

In [4]:
# spark ml module
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

### Set spark session

In [5]:
# Local mode: obtain the Spark session for this notebook, named "linear-regression"
spark = (
    SparkSession.builder
    .appName("linear-regression")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/01 18:04:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Check spark app name
spark.sparkContext.appName

'linear-regression'

In [7]:
# Enable Apache Arrow for data transfer between Spark and pandas
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [8]:
# Use the "distributed" default index for pandas-on-Spark frames: index values are
# generated without a global sequence, so they are not guaranteed to be consecutive
ps.set_option("compute.default_index_type", "distributed")

In [9]:
# Spark version
spark.version

'3.3.0'

In [10]:
spark

## Feature Engineering

### Data loading & processing

In [11]:
# Load the pre-processed dataset (CSV exported from pandas) as a pandas-on-Spark DataFrame.
# The file's first header field is empty, so Spark names that column `_c0`
# (presumably a saved pandas index — confirm); it is dropped in a later cell.
psdf = ps.read_csv("taipei_output.csv")



In [12]:
# Convert to a native Spark DataFrame (the pyspark.ml stages below operate on it)
# and preview the first rows
df = psdf.to_spark()
df.show()



22/07/01 18:05:42 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , 鄉鎮市區, 交易標的, 土地移轉總面積平方公尺, 交易年月日, 移轉層次, 建物型態, 建物現況格局-房, 建物現況格局-廳, 建物現況格局-衛, 總價元
 Schema: _c0, 鄉鎮市區, 交易標的, 土地移轉總面積平方公尺, 交易年月日, 移轉層次, 建物型態, 建物現況格局-房, 建物現況格局-廳, 建物現況格局-衛, 總價元
Expected: _c0 but found: 
CSV file: file:///home/dtsurfer07/00_final_project_tutorial/taipei_output.csv
+---+--------+--------+----------------------+----------+--------+--------+---------------+---------------+---------------+--------+
|_c0|鄉鎮市區|交易標的|土地移轉總面積平方公尺|交易年月日|移轉層次|建物型態|建物現況格局-房|建物現況格局-廳|建物現況格局-衛|  總價元|
+---+--------+--------+----------------------+----------+--------+--------+---------------+---------------+---------------+--------+
|  1|       9|       1|                  4.07|   1050418|       5|       4|              1|              1|              1| 5750000|
|  2|       9|       1|                  9.54|   1050327|       7|       0|              2|              2|              1|22600000|
|  3|       9|       1

In [13]:
# Drop `_c0` (the unnamed first CSV column) so it is not picked up as a model feature
df = df.drop('_c0')
df.show()

+--------+--------+----------------------+----------+--------+--------+---------------+---------------+---------------+--------+
|鄉鎮市區|交易標的|土地移轉總面積平方公尺|交易年月日|移轉層次|建物型態|建物現況格局-房|建物現況格局-廳|建物現況格局-衛|  總價元|
+--------+--------+----------------------+----------+--------+--------+---------------+---------------+---------------+--------+
|       9|       1|                  4.07|   1050418|       5|       4|              1|              1|              1| 5750000|
|       9|       1|                  9.54|   1050327|       7|       0|              2|              2|              1|22600000|
|       9|       1|                 11.53|   1050331|       2|       4|              1|              0|              1| 6000000|
|       9|       2|                 52.84|   1050327|       9|       0|              3|              2|              2|29200000|
|       9|       1|                 15.69|   1050502|       3|       8|              3|              2|              2| 8000000|
|       9|       2|     

### Select features

In [14]:
df.columns

['鄉鎮市區',
 '交易標的',
 '土地移轉總面積平方公尺',
 '交易年月日',
 '移轉層次',
 '建物型態',
 '建物現況格局-房',
 '建物現況格局-廳',
 '建物現況格局-衛',
 '總價元']

In [15]:
# Every column except the label is a model input. Selecting by name rather than
# by position keeps the label out of the feature vector even if the column order changes.
label_col = '總價元'
feature_cols = [col for col in df.columns if col != label_col]
# Pack the input columns into a single vector column named `features`
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
features_df = assembler.transform(df)

In [16]:
# validate the presence of dense vectors 
features_df.printSchema()

root
 |-- 鄉鎮市區: integer (nullable = true)
 |-- 交易標的: integer (nullable = true)
 |-- 土地移轉總面積平方公尺: double (nullable = true)
 |-- 交易年月日: integer (nullable = true)
 |-- 移轉層次: integer (nullable = true)
 |-- 建物型態: integer (nullable = true)
 |-- 建物現況格局-房: integer (nullable = true)
 |-- 建物現況格局-廳: integer (nullable = true)
 |-- 建物現況格局-衛: integer (nullable = true)
 |-- 總價元: long (nullable = true)
 |-- features: vector (nullable = true)



In [17]:
# view the details of dense vector
features_df.select('features').show(5,False)

+---------------------------------------------+
|features                                     |
+---------------------------------------------+
|[9.0,1.0,4.07,1050418.0,5.0,4.0,1.0,1.0,1.0] |
|[9.0,1.0,9.54,1050327.0,7.0,0.0,2.0,2.0,1.0] |
|[9.0,1.0,11.53,1050331.0,2.0,4.0,1.0,0.0,1.0]|
|[9.0,2.0,52.84,1050327.0,9.0,0.0,3.0,2.0,2.0]|
|[9.0,1.0,15.69,1050502.0,3.0,8.0,3.0,2.0,2.0]|
+---------------------------------------------+
only showing top 5 rows



In [18]:
# only select the features and label column
model_df = features_df.select(['features', '總價元'])

In [19]:
# Ready for machine learning: preview the assembled feature vectors next to the label
model_df.show(10,False)

+----------------------------------------------+--------+
|features                                      |總價元  |
+----------------------------------------------+--------+
|[9.0,1.0,4.07,1050418.0,5.0,4.0,1.0,1.0,1.0]  |5750000 |
|[9.0,1.0,9.54,1050327.0,7.0,0.0,2.0,2.0,1.0]  |22600000|
|[9.0,1.0,11.53,1050331.0,2.0,4.0,1.0,0.0,1.0] |6000000 |
|[9.0,2.0,52.84,1050327.0,9.0,0.0,3.0,2.0,2.0] |29200000|
|[9.0,1.0,15.69,1050502.0,3.0,8.0,3.0,2.0,2.0] |8000000 |
|[9.0,2.0,99.65,1050411.0,4.0,8.0,4.0,2.0,2.0] |35000000|
|[11.0,2.0,9.32,1050419.0,6.0,8.0,2.0,2.0,1.0] |12700000|
|[11.0,1.0,19.23,1050314.0,9.0,0.0,2.0,1.0,1.0]|12200000|
|[11.0,1.0,14.04,1050408.0,1.0,4.0,1.0,1.0,1.0]|7400000 |
|[3.0,1.0,41.79,1050412.0,5.0,2.0,3.0,2.0,2.0] |17300000|
+----------------------------------------------+--------+
only showing top 10 rows



In [20]:
# size of model df
model_df.count(), len(model_df.columns)

(215447, 2)

### Split Data - Train & Test sets

In [21]:
# Randomly split the rows into roughly 80% training and 20% test data (fixed seed 42)
train_df, test_df = model_df.randomSplit([0.80, 0.20], seed=42)

### Build Linear Regression Model 

In [22]:
LinearRegression?

[0;31mInit signature:[0m [0mLinearRegression[0m[0;34m([0m[0;34m*[0m[0margs[0m[0;34m,[0m [0;34m**[0m[0mkwds[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Linear regression.

The learning objective is to minimize the specified loss function, with regularization.
This supports two kinds of loss:

* squaredError (a.k.a squared loss)
* huber (a hybrid of squared error for relatively small errors and absolute error for     relatively large ones, and we estimate the scale parameter from training data)

This supports multiple types of regularization:

* none (a.k.a. ordinary least squares)
* L2 (ridge regression)
* L1 (Lasso)
* L2 + L1 (elastic net)

.. versionadded:: 1.4.0

Notes
-----
Fitting with huber loss only supports none and L2 regularization.

Examples
--------
>>> from pyspark.ml.linalg import Vectors
>>> df = spark.createDataFrame([
...     (1.0, 2.0, Vectors.dense(1.0)),
...     (0.0, 2.0, Vectors.sparse(1, [], []))], ["label", "weight", "featu

In [23]:
# Regularization strength, passed to LinearRegression as `regParam` in the next cell
reg = 0.05

In [24]:
# Configure the linear regression estimator: the label is the total price column
# and the regularization strength comes from `reg`
lin_Reg = LinearRegression(
    regParam=reg,
    labelCol='總價元',
)

In [25]:
# Fit the configured estimator on the training split; the result is the trained model
lr_model=lin_Reg.fit(train_df)

[Stage 9:>                                                          (0 + 2) / 2]

22/07/01 18:10:36 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/07/01 18:10:36 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

22/07/01 18:10:36 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [26]:
lr_model.intercept

-18646315.116238013

In [27]:
lr_model.coefficients

DenseVector([-523311.1159, 13989923.8726, 562669.5402, 10.7753, 1588984.7497, -618515.2602, -697429.4822, -5738602.6814, 3165412.6782])

In [28]:
# Score the fitted model on the training split. Despite the variable name, the result
# is read in the following cells as a metrics summary (meanSquaredError, r2).
training_predictions=lr_model.evaluate(train_df)

                                                                                

In [29]:
training_predictions.meanSquaredError

3217561394613352.0

In [30]:
training_predictions.r2

0.4509578232694469

### Evaluate Model

In [31]:
# Evaluate the fitted model on the held-out test split; the returned summary is
# read in the following cells for residuals, r2, RMSE and MSE
test_results = lr_model.evaluate(test_df)

                                                                                

In [32]:
# view the residual errors based on predictions 
test_results.residuals.show(10,False)

+--------------------+
|residuals           |
+--------------------+
|1.2262323379963584E7|
|6.348977440654529E7 |
|9932297.709839758   |
|-5731341.407355636  |
|-1409326.1189673245 |
|-5885775.398269951  |
|1.3789892194402333E7|
|446290.8342085257   |
|9978119.694161944   |
|9978119.694161944   |
+--------------------+
only showing top 10 rows



                                                                                

In [33]:
# coefficient of determination value for model
test_results.r2

0.12203767955346456

In [34]:
# RMSE
test_results.rootMeanSquaredError

43464067.17970187

In [35]:
# MSE
test_results.meanSquaredError

1889125135801637.8

### Load data

In [None]:
# Load the two raw CSV datasets as pandas-on-Spark DataFrames
psdf_a = ps.read_csv('data/all_A_taipei_A.csv')
psdf_b = ps.read_csv('data/all_A_taipei_B.csv')
# Convert to native Spark DataFrames. `to_spark` is a method: without the call
# parentheses these names would hold bound methods instead of DataFrames.
sdf_a = psdf_a.to_spark()
sdf_b = psdf_b.to_spark()

### Data processing

In [None]:
# Stack the two frames row-wise; join="inner" keeps only the columns present in both
psdf_concat = ps.concat([psdf_a, psdf_b], join="inner")
# Add 1 to every index label (presumably to make the index 1-based — confirm).
# NOTE(review): verify that pandas-on-Spark permits assigning to `.index` in place
# when this cell is first run.
psdf_concat.index += 1

In [None]:
# Keep only the columns used for modelling (nine inputs plus the total price label)
psdf_fi = psdf_concat[["鄉鎮市區", "交易標的", "土地移轉總面積平方公尺", "交易年月日", "移轉層次", "建物型態", "建物現況格局-房", "建物現況格局-廳", "建物現況格局-衛", "總價元"]]
# pandas-on-Spark DataFrames have no Spark-style .show(); preview with head() instead
psdf_fi.head()

### Delete useless rows (garage & land)

In [None]:
# Inspect the garage ("車位") rows that are candidates for removal.
# A boolean mask selects rows by value; DataFrame.filter() on a pandas-on-Spark
# frame selects by label instead, and it has no Spark-style .show().
psdf_fi[psdf_fi["交易標的"] == "車位"].head()


In [None]:
# NOTE(review): `useless_columns` is not defined in any visible cell, and the
# result of drop() is not assigned, so `psdf_fi` itself is left unchanged here.
psdf_fi.drop(useless_columns)
# psdf_main = psdf_fi.drop(useless_columns)
# psdf_main

In [None]:
# Keep only the rows that have a non-missing 移轉層次 value.
# Missing values are tested with notna(); an elementwise `== None` comparison
# does not detect them, and a bare comparison statement has no effect anyway.
# NOTE(review): `df_main` is not defined in any visible cell — confirm its source.
df_main = df_main[df_main['移轉層次'].notna()]
df_main

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Fit a scikit-learn linear regression on the held-out Spark split for comparison.
# `test_df` is a Spark DataFrame with a vector column `features` and the label
# column `總價元`, so it is collected to pandas and unpacked into arrays first;
# positional pandas indexing (df[10], df.drop([10], axis=1)) does not apply to it.
df = test_df
test_pdf = df.toPandas()

data_y = test_pdf['總價元']
# Each cell of `features` holds a Spark ML vector; stack them into one 2-D array
data_x = np.vstack([vec.toArray() for vec in test_pdf['features']])

# split: hold out 10% of these rows for scoring
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(data_x, data_y, test_size=0.1, random_state=1)

# transform: fit the scaler on the training rows only, then reuse it for the test rows
scaler = preprocessing.StandardScaler().fit(data_X_train)
data_X_train = scaler.transform(data_X_train)

# linear regression
model = linear_model.LinearRegression()
model.fit(data_X_train, data_y_train)

# make predictions
data_X_test = scaler.transform(data_X_test)
data_y_pred = model.predict(data_X_test)

# The coefficients
print('Coefficients: {}\n'.format(model.coef_))
# The mean squared error
print("Mean squared error: {}".format((mean_squared_error(data_y_test, data_y_pred))))
# R2 score: 1 is perfect prediction
print('R2 score: {}'.format(r2_score(data_y_test, data_y_pred)))