## Diamond's Pricing model- Pyspark:

In this notebook, using the Apache Spark ML pipeline(RandomForest, DecisionTree), I will build a model to predict the price of a diamond based on the available features.

In [1]:
import pandas as pd
%matplotlib inline
import numpy as np

#import stats libraries 
from pyspark.mllib.stat import Statistics
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler

from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('diamonds').getOrCreate()

In [3]:
df = spark.read.csv('diamonds.csv',header=True,inferSchema=True)

In [4]:
df.limit(5).toPandas()

Unnamed: 0,_c0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
#drop the id column
df_no_id = df.drop('_c0')
df_no_na = df_no_id.dropna()

In [6]:
df = df_no_na[['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z','price']]

In [7]:
df = df_no_na.select('price', 'carat', 'cut', 'color', 'clarity', 
  'depth', 'table', 'x', 'y', 'z')
df.show()

+-----+-----+---------+-----+-------+-----+-----+----+----+----+
|price|carat|      cut|color|clarity|depth|table|   x|   y|   z|
+-----+-----+---------+-----+-------+-----+-----+----+----+----+
|  326| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|3.95|3.98|2.43|
|  326| 0.21|  Premium|    E|    SI1| 59.8| 61.0|3.89|3.84|2.31|
|  327| 0.23|     Good|    E|    VS1| 56.9| 65.0|4.05|4.07|2.31|
|  334| 0.29|  Premium|    I|    VS2| 62.4| 58.0| 4.2|4.23|2.63|
|  335| 0.31|     Good|    J|    SI2| 63.3| 58.0|4.34|4.35|2.75|
|  336| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|3.94|3.96|2.48|
|  336| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|3.95|3.98|2.47|
|  337| 0.26|Very Good|    H|    SI1| 61.9| 55.0|4.07|4.11|2.53|
|  337| 0.22|     Fair|    E|    VS2| 65.1| 61.0|3.87|3.78|2.49|
|  338| 0.23|Very Good|    H|    VS1| 59.4| 61.0| 4.0|4.05|2.39|
|  339|  0.3|     Good|    J|    SI1| 64.0| 55.0|4.25|4.28|2.73|
|  340| 0.23|    Ideal|    J|    VS1| 62.8| 56.0|3.93| 3.9|2.46|
|  342| 0.22|  Premium|  

In [8]:
df = df.withColumnRenamed('price', 'label')

In [9]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
cutIndexer = StringIndexer(inputCol='cut', outputCol='cutIndex')
colorIndexer = StringIndexer(inputCol='color', outputCol='colorIndex')
clarityIndexer = StringIndexer(inputCol='clarity', outputCol='clarityIndex')

df = cutIndexer.fit(df).transform(df)
df = colorIndexer.fit(df).transform(df)
df = clarityIndexer.fit(df).transform(df)

df.show()

+-----+-----+---------+-----+-------+-----+-----+----+----+----+--------+----------+------------+
|label|carat|      cut|color|clarity|depth|table|   x|   y|   z|cutIndex|colorIndex|clarityIndex|
+-----+-----+---------+-----+-------+-----+-----+----+----+----+--------+----------+------------+
|  326| 0.23|    Ideal|    E|    SI2| 61.5| 55.0|3.95|3.98|2.43|     0.0|       1.0|         2.0|
|  326| 0.21|  Premium|    E|    SI1| 59.8| 61.0|3.89|3.84|2.31|     1.0|       1.0|         0.0|
|  327| 0.23|     Good|    E|    VS1| 56.9| 65.0|4.05|4.07|2.31|     3.0|       1.0|         3.0|
|  334| 0.29|  Premium|    I|    VS2| 62.4| 58.0| 4.2|4.23|2.63|     1.0|       5.0|         1.0|
|  335| 0.31|     Good|    J|    SI2| 63.3| 58.0|4.34|4.35|2.75|     3.0|       6.0|         2.0|
|  336| 0.24|Very Good|    J|   VVS2| 62.8| 57.0|3.94|3.96|2.48|     2.0|       6.0|         4.0|
|  336| 0.24|Very Good|    I|   VVS1| 62.3| 57.0|3.95|3.98|2.47|     2.0|       5.0|         5.0|
|  337| 0.26|Very Go

In [10]:
from pyspark.ml.feature import OneHotEncoder
OHE = OneHotEncoder(inputCols=['cutIndex', 'colorIndex', 'clarityIndex'],outputCols=['cut_OHE', 'color_OHE', 'clarity_OHE'])

df = OHE.fit(df).transform(df)

In [11]:
assembler = VectorAssembler(
  inputCols= ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_OHE', 'color_OHE', 'clarity_OHE'], outputCol=('features_assem'))

df = df.dropna()

In [12]:
from pyspark.ml.feature import MinMaxScaler

scaler = MinMaxScaler(inputCol="features_assem", outputCol="scaledFeatures")
pipeline = Pipeline(stages=[assembler, scaler])
scalerModel = pipeline.fit(df)
scaled_df = scalerModel.transform(df)
display(scaled_df)

DataFrame[label: int, carat: double, cut: string, color: string, clarity: string, depth: double, table: double, x: double, y: double, z: double, cutIndex: double, colorIndex: double, clarityIndex: double, cut_OHE: vector, color_OHE: vector, clarity_OHE: vector, features_assem: vector, scaledFeatures: vector]

In [14]:
training, test = scaled_df.randomSplit([0.7, 0.3])
training.cache()
test.cache()

DataFrame[label: int, carat: double, cut: string, color: string, clarity: string, depth: double, table: double, x: double, y: double, z: double, cutIndex: double, colorIndex: double, clarityIndex: double, cut_OHE: vector, color_OHE: vector, clarity_OHE: vector, features_assem: vector, scaledFeatures: vector]

In [15]:
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

dt = DecisionTreeRegressor(featuresCol = "scaledFeatures")

pipeline = Pipeline(stages= [dt])

paramGrid = ParamGridBuilder() \
    .addGrid(dt.maxDepth, [5, 10, 15, 20, 30]) \
    .addGrid(dt.maxBins, [10, 20, 30, 50]) \
    .build()

In [16]:
cv = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=3)
cvModel = cv.fit(training)
predictions = cvModel.transform(test)

In [17]:
evaluator = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "rmse")

rmse = evaluator.evaluate(predictions)

evaluator_r2 = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "r2")

r2 = evaluator_r2.evaluate(predictions)

evaluator_mae = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "mae")

mae = evaluator_mae.evaluate(predictions)

evaluator_mse = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "mse")

mse = evaluator_mse.evaluate(predictions)


print("RMSE on test data = ", rmse)
print("R_squared on test data = ", r2)
print("Mean Absolute Error (MAE) on test data = ", mae)
print("Mean Squared Error (MSE) on test data = ", mse)

predictions.select("label", "prediction").show()

RMSE on test data =  821.8354260237905
R_squared on test data =  0.9581772494419021
Mean Absolute Error (MAE) on test data =  388.1707928297219
Mean Squared Error (MSE) on test data =  675413.4674677053
+-----+------------------+
|label|        prediction|
+-----+------------------+
|  334| 464.9107142857143|
|  337|449.55555555555554|
|  338|             398.8|
|  342|             430.0|
|  345|             326.0|
|  351|431.46153846153845|
|  353|             449.0|
|  353| 468.7769230769231|
|  355|             455.2|
|  357| 394.6666666666667|
|  357|             449.0|
|  361|             496.5|
|  361| 475.6363636363636|
|  362|             601.0|
|  363|             395.0|
|  363|431.46153846153845|
|  364|             364.0|
|  364|             364.0|
|  365| 519.5087719298245|
|  367|             357.0|
+-----+------------------+
only showing top 20 rows



In [18]:
from pyspark.ml.regression import RandomForestRegressor

rf = (RandomForestRegressor()
          .setLabelCol('label')
          .setFeaturesCol('scaledFeatures'))
#stages = [indexers , encoders, assembler_1 , assembler, scaler, rf]
pipeline = Pipeline(stages=[rf])

paramGrid = (ParamGridBuilder()
            .addGrid(rf.maxDepth, [5, 10])
            .addGrid(rf.numTrees, [10, 20])
            .addGrid(rf.maxBins, [10, 20, 30, 50])
            .build())

In [19]:
cv = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(),
                          numFolds=3)
cvModel = cv.fit(training)
predictions = cvModel.transform(test)

In [20]:
evaluator = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "rmse")

rmse = evaluator.evaluate(predictions)

evaluator_r2 = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "r2")

r2 = evaluator_r2.evaluate(predictions)

evaluator_mae = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "mae")

mae = evaluator_mae.evaluate(predictions)

evaluator_mse = RegressionEvaluator(labelCol = "label", predictionCol= "prediction", metricName = "mse")

mse = evaluator_mse.evaluate(predictions)


print("RMSE on test data = ", rmse)
print("R_squared on test data = ", r2)
print("Mean Absolute Error (MAE) on test data = ", mae)
print("Mean Squared Error (MSE) on test data = ", mse)

predictions.select("label", "prediction").show()

RMSE on test data =  887.3150584126815
R_squared on test data =  0.9512473104603819
Mean Absolute Error (MAE) on test data =  460.62632556681945
Mean Squared Error (MSE) on test data =  787328.0128859004
+-----+------------------+
|label|        prediction|
+-----+------------------+
|  334| 544.5657332143484|
|  337|509.88249363273354|
|  338|468.81156365070683|
|  342| 555.1007833832513|
|  345| 581.4184954197772|
|  351| 549.5727379013779|
|  353| 471.1148235527985|
|  353| 593.1545673979541|
|  355| 518.7067738330514|
|  357| 451.2515002784291|
|  357| 475.1445854575604|
|  361|496.34043473862266|
|  361| 542.3912706450188|
|  362| 458.8773323504947|
|  363| 507.8475227717671|
|  363| 567.7956188970599|
|  364| 517.5475116146041|
|  364| 517.5475116146041|
|  365| 596.8092744426718|
|  367|497.91464479845973|
+-----+------------------+
only showing top 20 rows

