# House price prediction

### Import pyspark module

In [None]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [1]:
# Import SparkSession (requires `pip3 install pyspark`)
from pyspark.sql import SparkSession
import pyspark.sql.functions as fn
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, DoubleType, IntegerType

In [2]:
import pyspark.pandas as ps

In [3]:
# spark ml module
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

In [None]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

### Set spark session

In [4]:
# Create (or reuse) the SparkSession for this notebook under the app name
# "price_predict". Original note: intended for local mode; no master is set
# explicitly here.
session_builder = SparkSession.builder.appName("price_predict")
spark = session_builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/07/15 16:23:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Confirm the running SparkContext carries the expected application name.
spark.sparkContext.appName

'price_predict'

In [6]:
# Enable the Arrow setting for PySpark (per its key, this targets
# Spark <-> pandas data exchange).
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [7]:
# Make pandas-on-Spark use the "distributed" default index type.
ps.set_option("compute.default_index_type", "distributed")

In [None]:
# Show the Spark version of the running session.
spark.version

In [None]:
# Display the SparkSession object itself as the cell output.
spark

## Feature Engineering

### Load data

In [None]:
# df = df.to_spark()

In [9]:
# Location of the combined CSV on the local filesystem (absolute path; adjust
# it when running on another machine).
csv_path = 'file:///home/dtsurfer07/00_final_project_tutorial/dataset/all_combined_AB.csv'

# Read the file using its header row for column names and let Spark infer the
# column types, then register the result as the temp view "dfTable".
df = spark.read.csv(csv_path, header=True, inferSchema=True)
df.createOrReplaceTempView("dfTable")



22/07/15 16:24:27 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


                                                                                

In [None]:
# Preview the first 5 rows of the raw data.
df.show(5)

### Inspect data

In [None]:
# List the column names of the raw data.
df.columns

In [None]:
# (row count, column count) of the raw data.
df.count(),len(df.columns)

In [None]:
# Print the column names and data types of the raw DataFrame.
df.printSchema()

In [None]:
# Show describe() statistics for the raw data.
df.describe().show()

In [None]:
# Show summary() statistics for the raw data.
df.summary().show()

### Select features

In [25]:
# Columns kept for modelling ('主要建材' is intentionally left out).
# The label '總價元' stays last in the list.
selected_columns = [
    '城市代碼', '鄉鎮市區', '交易標的', '建物移轉總面積平方公尺',
    '主建物面積', '建物現況格局-房', '車位總價元', '總價元',
]
features_df = df.select(*selected_columns)

# (row count, column count) of the selection
features_df.count(), len(features_df.columns)

                                                                                

(3678001, 8)

In [None]:
# Print the column names and data types of the selected feature columns.
features_df.printSchema()

In [26]:
# Keep only transactions whose target ('交易標的') is neither a parking space
# ('車位') nor land ('土地'), and whose '主建物面積' value is not 0.
is_excluded_target = col('交易標的').isin(['車位', '土地'])
has_zero_main_area = col('主建物面積') == 0
features_df = features_df.filter(~is_excluded_target).filter(~has_zero_main_area)

# (row count, column count) after filtering
features_df.count(), len(features_df.columns)

                                                                                

(2398819, 8)

In [27]:
# Rescale both price columns by 1/10,000 (元 -> units of 10,000 元).
# The expressions are built with col(), so they resolve against features_df
# itself rather than against the parent DataFrame `df`; withColumn expects an
# expression over the DataFrame it is called on.
# NOTE: this cell overwrites the columns in place, so running it twice divides
# the prices twice.
features_df = features_df.withColumn("總價元", col("總價元") / 10000)
features_df = features_df.withColumn("車位總價元", col("車位總價元") / 10000)

In [28]:
# Target Spark SQL type for each numeric column; applied in this order.
# Casting the (already rescaled) price columns to IntegerType drops any
# fractional part.
column_types = {
    "建物移轉總面積平方公尺": DoubleType(),
    "主建物面積": DoubleType(),
    "建物現況格局-房": IntegerType(),
    "車位總價元": IntegerType(),
    "總價元": IntegerType(),
}
for column_name, spark_type in column_types.items():
    features_df = features_df.withColumn(column_name, features_df[column_name].cast(spark_type))

In [None]:
# Print the schema again to verify the column types after casting.
features_df.printSchema()

In [None]:
# Preview the first 10 rows of the prepared feature columns.
features_df.show(10)

In [29]:
# Drop rows (not columns) that contain a null value in any column, then count
# what is left. The recorded output equals the row count before this step, so
# no rows were removed in that run.
features_df = features_df.dropna()
features_df.count()

                                                                                

2398819

In [None]:
# For every column, count the entries that are NaN or null.
missing_counts = [
    count(when(isnan(name) | col(name).isNull(), name)).alias(name)
    for name in features_df.columns
]
features_df.select(missing_counts).show()

In [None]:
# Show describe() statistics for the cleaned feature columns.
features_df.describe().show()

In [None]:
# Show summary() statistics for the cleaned feature columns.
features_df.summary().show()

### Select City to build model

In [30]:
# ABDEFH  (presumably the city codes present in the data; TODO confirm)

def split_city(df, city_code):
    """Return the rows of ``df`` whose '城市代碼' column equals ``city_code``.

    Args:
        df: Spark DataFrame containing a '城市代碼' column.
        city_code: Value to match in that column.

    Returns:
        The filtered Spark DataFrame.
    """
    return df.filter(col('城市代碼') == city_code)

In [37]:
# Restrict the data to city code "H" and count the remaining rows.
city_df = split_city(features_df, "H")
city_df.count()

                                                                                

327369

### Feature engineering

In [38]:
def dummies_encoding(df, cols_list):
    """Add 0/1 indicator columns for each distinct value of the given columns.

    For every column named in ``cols_list``, one integer column is created per
    distinct value of that column. The new column is named ``str(value)`` and
    holds 1 where the row has that value and 0 otherwise. The indicator columns
    are placed in front of the existing columns; the source columns are kept.

    Categories are sorted by their string form before the indicators are
    built, so the resulting column order is the same on every run (the order
    returned by ``distinct().collect()`` is not guaranteed).

    Note: every category gets an indicator (no reference category is dropped).

    Args:
        df: Spark DataFrame to encode.
        cols_list: Iterable of categorical column names.

    Returns:
        A Spark DataFrame with the indicator columns prepended.
    """
    for column_name in cols_list:
        # The distinct values are collected to the driver, so this is only
        # suitable for low-cardinality columns.
        distinct_rows = df.select(column_name).distinct().collect()
        categories = sorted((row[0] for row in distinct_rows), key=str)
        indicators = [
            fn.when(fn.col(column_name) == category, 1).otherwise(0).alias(str(category))
            for category in categories
        ]
        df = df.select(indicators + df.columns)
    return df

In [39]:
# Categorical columns to expand into 0/1 indicator columns.
cols_list = ['鄉鎮市區', '交易標的'] # '主要建材' is not included
city_df = dummies_encoding(city_df, cols_list)

                                                                                

In [40]:
# Drop the two categorical source columns (now represented by indicator
# columns) together with the city-code column.
new_df = city_df.drop('鄉鎮市區', '交易標的', '城市代碼')

In [None]:
# Preview the first 5 rows of the encoded data.
new_df.show(5)

In [41]:
# Assemble every column except the label into a single 'features' vector.
# The label is excluded by name rather than by position, so the result does
# not depend on '總價元' being the last column.
label_col = '總價元'

# Dropping a possibly existing 'features' column first keeps this cell
# re-runnable (drop() is a no-op when the column is absent).
base_df = new_df.drop('features')
feature_cols = [name for name in base_df.columns if name != label_col]

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
new_df = assembler.transform(base_df)

In [None]:
# Print the schema to check that the assembled 'features' vector column is present.
new_df.printSchema()

In [None]:
# Show the first 5 assembled feature vectors without truncating the output.
new_df.select('features').show(5,False)

In [42]:
# Keep just the assembled feature vector and the label column for modelling.
model_columns = ('features', '總價元')
model_df = new_df.select(*model_columns)

In [43]:
# Rename the label column '總價元' to 'price'.
model_df = model_df.withColumnRenamed('總價元', 'price')

In [None]:
# Data ready for machine learning: show the first 10 rows without truncation.
model_df.show(10,False)

In [None]:
# (row count, column count) of the modelling DataFrame.
model_df.count(), len(model_df.columns)

### Split Data - Train & Test sets

In [48]:
# Randomly split the modelling data into 70% training and 30% test rows.
# A fixed seed makes the split repeatable for the same input data; without
# it, every run trains and evaluates on different rows.
train_df, test_df = model_df.randomSplit([0.70, 0.30], seed=42)

### Build Linear Regression with lasso

In [None]:
import sys

from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import StreamingLinearRegressionWithSGD

def parse(lp):
    """Parse a text line into a LabeledPoint.

    The label is the text between the first '(' and the first ','; the
    features are the comma-separated values between the first '[' and the
    first ']'.
    """
    label_text = lp[lp.find('(') + 1: lp.find(',')]
    features_text = lp[lp.find('[') + 1: lp.find(']')]
    return LabeledPoint(float(label_text), Vectors.dense(features_text.split(',')))

# The previous lines called model_df.textFileStream(sys.argv[...]). A Spark
# DataFrame has no textFileStream method, and sys.argv holds the kernel's own
# arguments inside a notebook, so they could not run.
# Lasso is fitted here on the existing train/test split instead: it is linear
# regression with a pure L1 penalty, i.e. elasticNetParam=1.0.
lasso_reg = LinearRegression(labelCol='price', regParam=0.05, elasticNetParam=1.0)
lasso_model = lasso_reg.fit(train_df)

# Report RMSE and R^2 on the training and test sets.
lasso_train_summary = lasso_model.evaluate(train_df)
lasso_test_summary = lasso_model.evaluate(test_df)
print(lasso_train_summary.rootMeanSquaredError, lasso_train_summary.r2)
print(lasso_test_summary.rootMeanSquaredError, lasso_test_summary.r2)

### Build SVM model

In [None]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.regression import LabeledPoint

# Load and parse the data
def parsePoint(line):
    """Turn a space-separated line of numbers into a LabeledPoint.

    The first number is the label; the remaining numbers are the features.
    """
    label, *features = map(float, line.split(' '))
    return LabeledPoint(label, features)

# This notebook only creates `spark`; take the SparkContext from it so that
# `sc` is defined (it was previously used without ever being assigned).
sc = spark.sparkContext

# Relative path: presumably the sample file from the Spark distribution.
# TODO confirm it exists relative to the working directory.
data = sc.textFile("data/mllib/sample_svm_data.txt")
parsedData = data.map(parsePoint)

# Build the model
model = SVMWithSGD.train(parsedData, iterations=100)

# Evaluating the model on training data: fraction of points whose predicted
# label differs from the true label.
labelsAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
trainErr = labelsAndPreds.filter(lambda lp: lp[0] != lp[1]).count() / float(parsedData.count())
print("Training Error = " + str(trainErr))

# Save and load model
model.save(sc, "target/tmp/pythonSVMWithSGDModel")
sameModel = SVMModel.load(sc, "target/tmp/pythonSVMWithSGDModel")

### Build Linear Regression Model 

In [49]:
# Regularization strength handed to the estimator as regParam.
reg = 0.05

# Configure the estimator with 'price' as the label and fit it on the
# training split.
lin_Reg = LinearRegression(labelCol='price', regParam=reg)
lr_model = lin_Reg.fit(train_df)

# Evaluate on the same training data and print MSE, RMSE and R^2 in turn.
training_predictions = lr_model.evaluate(train_df)
for metric_name in ('meanSquaredError', 'rootMeanSquaredError', 'r2'):
    print(getattr(training_predictions, metric_name))



2937361.8000198975
1713.8733325482071
0.7734641744594577


                                                                                

In [None]:
# Regularization strength used as regParam below (same value as in the
# combined cell above; this and the following cells repeat it step by step).
reg = 0.05

In [None]:
# Configure the linear regression estimator: label column 'price',
# regularization strength `reg`.
lin_Reg=LinearRegression(labelCol='price', regParam=reg)

In [None]:
# Fit the linear regression model on the training split.
lr_model=lin_Reg.fit(train_df)

In [None]:
# Intercept of the fitted model.
lr_model.intercept

In [None]:
# Coefficients of the fitted model.
lr_model.coefficients

In [None]:
# Evaluate the fitted model on the training split; the result exposes the
# metrics read in the next cells.
training_predictions=lr_model.evaluate(train_df)

In [None]:
# Mean squared error on the training split.
training_predictions.meanSquaredError

In [None]:
# Root mean squared error on the training split.
training_predictions.rootMeanSquaredError

In [None]:
# R^2 (coefficient of determination) on the training split.
training_predictions.r2

### Evaluate Model

In [None]:
# Evaluate the fitted model on the held-out test split; the result exposes
# the residuals and metrics inspected in the cells below.
test_results = lr_model.evaluate(test_df)

In [None]:
# Show the first 10 residuals on the test split, without truncation.
test_results.residuals.show(10,False)

In [None]:
# R^2 (coefficient of determination) on the test split.
test_results.r2

In [None]:
# Root mean squared error (RMSE) on the test split.
test_results.rootMeanSquaredError

In [None]:
# Mean squared error (MSE) on the test split.
test_results.meanSquaredError