In [1]:
%%bash
# Need to install pyspark
# if pyspark is already installed, will print a message indicating pyspark already isntalled
pip install pyspark
pip install seaborn

Collecting pyspark
  Downloading https://files.pythonhosted.org/packages/f0/26/198fc8c0b98580f617cb03cb298c6056587b8f0447e20fa40c5b634ced77/pyspark-3.0.1.tar.gz (204.2MB)
Collecting py4j==0.10.9
  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=f7336ec5213855df9ccd12e17f9856805f2cfe845a563c9a127f76b1de123a54
  Stored in directory: /root/.cache/pip/wheels/5e/bd/07/031766ca628adec8435bb40f0bd83bb676ce65ff4007f8e73f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [2]:
%matplotlib inline
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml import feature
import re
from pyspark.sql import functions as fn
from pyspark.sql.functions import col, regexp_replace, split
from pyspark.sql.utils import AnalysisException
from pyspark.sql import Row
from pyspark.sql.types import IntegerType, DateType, FloatType, BooleanType
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

spark = SparkSession \
  .builder \
  .master("local[*]")\
  .config("spark.memory.fraction", 0.8) \
  .config("spark.executor.memory", "12g") \
  .config("spark.driver.memory", "12g")\
  .config("spark.memory.offHeap.enabled",'true')\
  .config("spark.memory.offHeap.size","12g")\
  .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

from google.colab import drive
drive.mount('/content/drive')

from pyspark.sql.functions import col, udf
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.functions import create_map, lit
from itertools import chain
from pyspark.mllib import numpy as np
from pyspark.ml.feature import PCA
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.tuning import ParamGridBuilder

Mounted at /content/drive


In [3]:
new_df = spark.read.format("csv").option("header", "true").option('inferSchema','true').load("drive/Shared drives/IST 718 Group1/therightstuff.csv")

In [4]:
new_df.show(5)

+------------+----+---------------+-----+-------------+-----------------+------------+----------+----------------+-------------------+-----------+-----+-------------+----------------+--------------+-------------+----------------+---------+-------------+------+--------------------+----------+--------------+-----+------+------+---------+--------+------+-----------+-------------+---------+--------------------+----------+---------------+-------+-----------------+-----------+-------+-------+--------------+-------------+------+--------------------+-----------+------------+--------------------+------------+---------+-----+----+-----------+----------+------------+-----------+-------------+-----------+--------------+--------------+------+-----------+------------+----------------+---------+----------------+------------+---------------+------+----+----+-----+---+------+---+---+---+----+----+----+----+----+-----+----+----+----+----+-------+-----+--------+----+--------+----+----+------+---------+--

In [5]:
new_df.describe().show()

+-------+------------------+-----+-----------+-----------+-----------+------------------+------------------+------------------+------------------+-------------------+-----------+--------------+-----------------+------------------+---------+-----------------+--------------------+------------------+--------------------+------------------+------------------+-----------+-------------+------------------+--------------------+----------+------------------+-----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+--------------------+------------+--------------------+------------+------------------+-----------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-----------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+-----------

In [6]:
new_df.printSchema()

root
 |-- back_legroom: double (nullable = true)
 |-- bed: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- cabin: string (nullable = true)
 |-- city: string (nullable = true)
 |-- city_fuel_economy: double (nullable = true)
 |-- daysonmarket: integer (nullable = true)
 |-- dealer_zip: string (nullable = true)
 |-- engine_cylinders: integer (nullable = true)
 |-- engine_displacement: integer (nullable = true)
 |-- engine_type: string (nullable = true)
 |-- fleet: boolean (nullable = true)
 |-- frame_damaged: boolean (nullable = true)
 |-- franchise_dealer: boolean (nullable = true)
 |-- franchise_make: string (nullable = true)
 |-- front_legroom: double (nullable = true)
 |-- fuel_tank_volume: double (nullable = true)
 |-- fuel_type: string (nullable = true)
 |-- has_accidents: boolean (nullable = true)
 |-- height: double (nullable = true)
 |-- highway_fuel_economy: double (nullable = true)
 |-- horsepower: integer (nullable = true)
 |-- interior_color: string (n



---
# Ralph's Modeling in Support of Research Quesitons 2A and 2B (Price Setting and PRice Expectations).


In [7]:
# Create a new copy of the DF to prepare for regression
reg_df = new_df

In [8]:
# Drop select columns
reg_df = reg_df.drop(*['latitude', 'listed_date', 'listing_color', 'longitude','city', 'dealer_zip','interior_color','sp_name','sp_id'])

In [9]:
# Modify booleans to 1's and 0's
reg_df = reg_df.withColumn('HasAccidents',fn.when(col('has_accidents') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('has_accidents')
 
reg_df = reg_df.withColumn('frameDamaged',fn.when(col('frame_damaged') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('frame_damaged')
 
reg_df = reg_df.withColumn('franchiseDealer',fn.when(col('franchise_dealer') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('franchise_dealer')
 
reg_df = reg_df.withColumn('isCabNew',fn.when(col('isCab') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('isCab')
 
reg_df = reg_df.withColumn('isCpo',fn.when(col('is_cpo') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('is_cpo')
 
reg_df = reg_df.withColumn('isNew',fn.when(col('is_new') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('is_new')
 
reg_df = reg_df.withColumn('isOemcpo',fn.when(col('is_oemcpo') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('is_oemcpo')
 
reg_df = reg_df.withColumn('Salvage',fn.when(col('salvage') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('salvage')
 
reg_df = reg_df.withColumn('theftTitle',fn.when(col('theft_title') == 'True', 1).otherwise(0))
reg_df = reg_df.drop('theft_title')

In [10]:
# Create catagories for length of vehicle
reg_df = reg_df.withColumn('length_category',fn.when(col('length') < 176.4, "compact").when(col("length") > 187.2, "fullsize").otherwise('midsize'))

In [11]:
# create catagories for width of vehicle
reg_df = reg_df.withColumn('width_category',fn.when(col('width') < 70.9, "narrow").when(col("width") > 74.8, "wide").otherwise("regular"))

In [12]:
# Create a column that captures teh age of the vehicle which is calculated by teh year, and subtracted from 2020
reg_df = reg_df.withColumn('age',2020 - (col('year')))

In [13]:
reg_df.count()

595365

In [14]:
# Develop logic to scope out new cars 
new_car_logic = (reg_df["isNew"] == 0)
reg_df = reg_df[new_car_logic]

In [15]:
reg_df.count()

303798

In [16]:
from pyspark.ml import feature, Pipeline #import modules/packages for the entire process
# create ordinal pipline for select features
ord_cat_feat_eng_pipe = Pipeline(stages=[ 
                                         feature.StringIndexerModel.from_labels(['compact','midsize','fullsize'],handleInvalid = "keep", inputCol="length_category", outputCol="length_category_idx"),
                                         feature.StringIndexerModel.from_labels(['narrow','regular','wide'],handleInvalid = "keep",inputCol="width_category", outputCol="width_category_idx")])

In [17]:
#Fit and Transform 
reg_df = ord_cat_feat_eng_pipe.fit(reg_df).transform(reg_df)

In [18]:
# Inspect the work
reg_df.show(2)

+------------+----+---------------+-----+-----------------+------------+----------------+-------------------+-----------+-----+--------------+-------------+----------------+---------+------+--------------------+----------+------+--------------------+---------+---------------+-------+-----------------+-----------+-------+--------------+-------------+------------+--------------------+------------+---------+-----+----+-----------+----------+------------+-----------+-------------+-----------+--------------+--------------+------+-----------+------------+----------------+---------+----------------+------------+---------------+------+----+----+-----+---+------+---+---+---+----+----+----+----+----+-----+----+----+----+----+-------+-----+--------+----+--------+----+----+------+---------+----+----+-----+----+------+-----+------+-------+----+----+----+----+----+------+----+------+----+-------+----+-----+----+------------+------------+---------------+--------+-----+-----+--------+----------+------

In [30]:
# Create a Pipeline to transform feature that will be modeled
tot_feat_eng_pipe = Pipeline(stages=[
    feature.VectorAssembler(inputCols=[ "length_category_idx","width_category_idx","n_body_type","n_engine_type","n_fuel_type","n_make_name","n_franchise_make","n_model_name","n_transmission","n_wheel_system",
                                       "age", "city_fuel_economy", "highway_fuel_economy","daysonmarket","engine_cylinders", "horsepower","mileage",
                                        "owner_count", "maximum_seating", "engine_displacement", "torque_ftlb"], outputCol = 'features'),
                                     feature.StandardScaler(withMean = False,inputCol="features", outputCol="scaledFeatures")])

In [31]:
# Fit and transform
reg_df_prepped = tot_feat_eng_pipe.fit(reg_df).transform(reg_df)

In [32]:
# Inspect the work
reg_df_prepped.show(2)

+------------+----+---------------+-----+-----------------+------------+----------------+-------------------+-----------+-----+--------------+-------------+----------------+---------+------+--------------------+----------+------+--------------------+---------+---------------+-------+-----------------+-----------+-------+--------------+-------------+------------+--------------------+------------+---------+-----+----+-----------+----------+------------+-----------+-------------+-----------+--------------+--------------+------+-----------+------------+----------------+---------+----------------+------------+---------------+------+----+----+-----+---+------+---+---+---+----+----+----+----+----+-----+----+----+----+----+-------+-----+--------+----+--------+----+----+------+---------+----+----+-----+----+------+-----+------+-------+----+----+----+----+----+------+----+------+----+-------+----+-----+----+------------+------------+---------------+--------+-----+-----+--------+----------+------

In [33]:
# Set enable grid
enable_grid_search = True

In [34]:
from pyspark.ml import regression, evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

if enable_grid_search:
  
  #Create a Regressor estimator
  lr = regression.LinearRegression(featuresCol='scaledFeatures', labelCol='price')

  #Establish the grid
  paramGrid = ParamGridBuilder()\
    .addGrid(lr.regParam, [0.0, 0.01, 0.02, 0.03, 0.4, 0.7])\
    .addGrid(lr.elasticNetParam, [0.0, 0.2, 0.4, 0.5, 0.7])\
    .build()
    
  # Establish the evaluator
  evaluator = RegressionEvaluator()\
                .setMetricName("mse")\
                .setLabelCol("price")\
                .setPredictionCol("prediction")                

  #Establish the Cross Validation
  cv = CrossValidator()\
    .setEstimator(lr)\
    .setEstimatorParamMaps(paramGrid)\
    .setEvaluator(evaluator)\
    .setNumFolds(3)

  cv_final_model_fitted = cv.fit(reg_df_prepped)

 
  print("The resulting scores from 3-folds cross validation on the data: ", cv_final_model_fitted.avgMetrics)
  print("The lowest score from 3-folds cross validation on the data: ", min(cv_final_model_fitted.avgMetrics))

  pass

The resulting scores from 3-folds cross validation on the data:  [45850607.24333177, 45850607.24333177, 45850607.24333177, 45850607.24333177, 45850607.24333177, 45850607.247499146, 45850536.9501101, 45851138.6483395, 45850646.32272164, 45850463.86284231, 45850607.25222736, 45851138.424835876, 45850523.65511502, 45850662.800147206, 45850177.38018238, 45850607.25751623, 45850290.66692879, 45850017.14383565, 45850196.676829964, 45850520.09308778, 45850607.84716314, 45850460.0427683, 45852422.20554151, 45850543.87468446, 45850592.02999973, 45850608.887844816, 45850691.617569536, 45850602.42648987, 45851181.35898497, 45853927.52949476]
The lowest score from 3-folds cross validation on the data:  45850017.14383565


In [35]:
# Let's extract the param map
cv.extractParamMap()


 Param(parent='CrossValidator_89c864ea8f49', name='estimator', doc='estimator to be cross-validated'): LinearRegression_89b454909de6,
 Param(parent='CrossValidator_89c864ea8f49', name='estimatorParamMaps', doc='estimator param maps'): [{Param(parent='LinearRegression_89b454909de6', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
   Param(parent='LinearRegression_89b454909de6', name='regParam', doc='regularization parameter (>= 0).'): 0.0},
  {Param(parent='LinearRegression_89b454909de6', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.2,
   Param(parent='LinearRegression_89b454909de6', name='regParam', doc='regularization parameter (>= 0).'): 0.0},
  {Param(parent='LinearRegression_89b454909de6', name='elasticNetParam', doc='the ElasticNet mixing parame

## It turns out the best model had the following hyperparameter values:

elasticNetParam = 0.4

regParam = 0.03

The lowest score from 3-folds cross validation on the data:  63937702.61042934

In [36]:
# Now we'll partition the data
training_reg_df_prepped , testing_reg_df_prepped = reg_df_prepped.randomSplit([0.7, 0.3])

In [37]:
# Inspect the split
[training_reg_df_prepped.count(), testing_reg_df_prepped.count()]

[212271, 91527]

In [38]:
from pyspark.ml import regression, evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

#Create a Regressor estimator
lr_tuned = regression.LinearRegression(featuresCol='scaledFeatures', labelCol='price', elasticNetParam=0.4, regParam=0.03)

# Create the model and preditions
trainModel = lr_tuned.fit(training_reg_df_prepped)

predicitons = trainModel.transform(testing_reg_df_prepped)

# Establish the evaluator
evaluator = RegressionEvaluator()\
  .setMetricName("mse")\
  .setLabelCol("price")\
  .setPredictionCol("prediction") 

TestMSE = evaluator.evaluate(trainModel.transform(testing_reg_df_prepped))

print("Test data MSE :", TestMSE) 

Test data MSE : 45872695.899535485


In [39]:
# Based on what we learned above, lets review some of the important statistics
trainingSummary = trainModel.summary
print("Fit model values")
print("********************")
print("MSE: %f" % round(trainingSummary.meanSquaredError))
print("r2: %f" % trainingSummary.r2)
print("Explained Variance: %f" % round(trainingSummary.explainedVariance))

print("********************")
print("Test data MSE :", round(TestMSE)) 

Fit model values
********************
MSE: 45812726.000000
r2: 0.703556
Explained Variance: 108686859.000000
********************
Test data MSE : 45872696


In [40]:
# Inspect the coeficients
Coefficients = trainModel.coefficients
Coefficients

DenseVector([118.6614, -48.8494, 1217.8965, -1572.1302, 49.1582, 842.823, -515.0671, 666.1964, -370.0792, 1151.5957, -1122.9249, 902.7017, -762.102, -35.4921, 1926.3923, 6580.0107, -5195.0119, -812.0935, -23.8504, -1953.6029, 1481.1041])

Now I'll create a new model and set elasticNetParam = 0 and solver to 0 so I can extract P-values

In [41]:
#Create a Regressor estimator
lr_tuned2 = regression.LinearRegression(featuresCol='scaledFeatures', labelCol='price', elasticNetParam=0, regParam=0.03, solver='normal')

# Create the model and preditions
trainModel2 = lr_tuned2.fit(training_reg_df_prepped)

predicitons2 = trainModel2.transform(testing_reg_df_prepped)

# Establish the evaluator
evaluator2 = RegressionEvaluator()\
  .setMetricName("mse")\
  .setLabelCol("price")\
  .setPredictionCol("prediction") 

TestMSE2 = evaluator2.evaluate(trainModel2.transform(testing_reg_df_prepped))

print("Test data MSE :", TestMSE2) 

Test data MSE : 45872590.7874914


In [42]:
# Based on what we learned above, lets review some of the important statistics
trainingSummary2 = trainModel2.summary
print("Fit model values")
print("********************")
print("MSE: %f" % round(trainingSummary2.meanSquaredError))
print("r2: %f" % trainingSummary2.r2)
print("Explained Variance: %f" % round(trainingSummary2.explainedVariance))

print("********************")
print("Test data MSE :", round(TestMSE2)) 

Fit model values
********************
MSE: 45812245.000000
r2: 0.703559
Explained Variance: 108727983.000000
********************
Test data MSE : 45872591


In [43]:
#Run this custom funciton to provide an organized output
def modelsummary(model):
    import numpy as np
    print ("Note: the last rows are the information for Intercept")
    print ("##","-------------------------------------------------")
    print ("##","  Estimate   |   Std.Error | t Values  |  P-value")
    coef = np.append(list(model.coefficients),model.intercept)
    Summary=model.summary

    for i in range(len(Summary.pValues)):
        print ("##",'{:10.6f}'.format(coef[i]),\
        '{:10.6f}'.format(Summary.coefficientStandardErrors[i]),\
        '{:8.3f}'.format(Summary.tValues[i]),\
        '{:10.6f}'.format(Summary.pValues[i]))

    print ("##",'---')
    print ("##","Mean squared error: % .6f" \
           % Summary.meanSquaredError, ", RMSE: % .6f" \
           % Summary.rootMeanSquaredError )
    print ("##","Multiple R-squared: %f" % Summary.r2, ", \
            Total iterations: %i"% Summary.totalIterations)

In [44]:
# Run the function
modelsummary(trainModel2)

Note: the last rows are the information for Intercept
## -------------------------------------------------
##   Estimate   |   Std.Error | t Values  |  P-value
## 114.064021  20.084593    5.679   0.000000
## -54.205265  21.043883   -2.576   0.010001
## 1216.293878  20.749958   58.617   0.000000
## -1537.987342  32.002283  -48.059   0.000000
##  48.511781  19.486430    2.490   0.012792
## 842.597684  18.485219   45.582   0.000000
## -514.805046  15.961651  -32.253   0.000000
## 664.783735  19.923123   33.367   0.000000
## -367.442060  15.745158  -23.337   0.000000
## 1146.797172  17.298725   66.294   0.000000
## -1120.012182  21.471974  -52.162   0.000000
## 924.757748  53.716351   17.216   0.000000
## -787.293626  55.725991  -14.128   0.000000
## -35.553499  14.895314   -2.387   0.016992
## 1865.647889  45.776519   40.756   0.000000
## 6588.010484  39.429667  167.083   0.000000
## -5198.090631  20.821892 -249.645   0.000000
## -810.217510  18.054482  -44.876   0.000000
## -25.145760  1

In [45]:
# Lets save these to variables
coeffs = trainModel2.coefficients.toArray()
coeffStandError = trainModel2.summary.coefficientStandardErrors
tValues = trainModel2.summary.tValues
pValues = trainModel2.summary.pValues
absCoeffs = abs(trainModel2.coefficients.toArray())

In [46]:
# Create a summary df
summary_df = pd.DataFrame(list(zip([ "length_category_idx","width_category_idx","n_body_type","n_engine_type","n_fuel_type","n_make_name","n_franchise_make","n_model_name","n_transmission","n_wheel_system",
                                       "age", "city_fuel_economy", "highway_fuel_economy","daysonmarket","engine_cylinders", "horsepower","mileage",
                                        "owner_count", "maximum_seating", "engine_displacement", "torque_ftlb"],
                                   coeffs,coeffStandError,tValues,pValues,absCoeffs)),
                          columns = ["Feature", 'Coefficients', 'Coefficient_SE', 'T_Values', "P_Values", "AbsValue_Coeffs" ]).sort_values("AbsValue_Coeffs", ascending = False)

In [47]:
# Review the DF
summary_df

Unnamed: 0,Feature,Coefficients,Coefficient_SE,T_Values,P_Values,AbsValue_Coeffs
15,horsepower,6588.010484,39.429667,167.082578,0.0,6588.010484
16,mileage,-5198.090631,20.821892,-249.64545,0.0,5198.090631
19,engine_displacement,-1924.781024,38.813502,-49.590502,0.0,1924.781024
14,engine_cylinders,1865.647889,45.776519,40.755565,0.0,1865.647889
3,n_engine_type,-1537.987342,32.002283,-48.058676,0.0,1537.987342
20,torque_ftlb,1484.606941,27.802421,53.398478,0.0,1484.606941
2,n_body_type,1216.293878,20.749958,58.616691,0.0,1216.293878
9,n_wheel_system,1146.797172,17.298725,66.293741,0.0,1146.797172
10,age,-1120.012182,21.471974,-52.161585,0.0,1120.012182
11,city_fuel_economy,924.757748,53.716351,17.215573,0.0,924.757748




---
# Now I'll fit a model using PCA


In [48]:
# Create a Pipeline to transform features
pca_feat_eng_pipe = Pipeline(stages=[
    feature.VectorAssembler(inputCols=[ "pc1","pc2","pc3","pc4","pc5","pc6","pc7","pc8","pc9", "pc10","pc11", "pc12",
                                       "pc13", "pc14", "pc15","pc16","pc17","pc18","pc19","pc20","pc21","pc22","pc23", "pc24","pc25",
                                       "pc26","pc27","pc28","pc29","pc25","pc30","pc31","pc32","pc33","pc34","pc35"], outputCol = 'features'),
                                     feature.StandardScaler(withMean = False,inputCol="features", outputCol="scaledFeatures")])

In [49]:
# Fit and transform for PCA
reg_df_prepped_pca = pca_feat_eng_pipe.fit(reg_df).transform(reg_df)

In [58]:
reg_df_prepped_pca.show(2, False)

+------------+----+---------------+-----+-----------------+------------+----------------+-------------------+-----------+-----+--------------+-------------+----------------+---------+------+--------------------+----------+------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+---------------+-------+-----------------+-----------+-------+--------------+-------------+------------+--------------------+------------+---------+-----+----+-----------+----------+------------+-----------+-------------+-----------+--------------+--------------+------+-----------+------------+----------------+---------+----------------+------------+---------------+------+----+----+-----+---+------+---+---+---+----+----+----+----+----+-----+----+----+----+---

In [51]:
# Set enable grid
enable_grid_search = True

In [52]:
from pyspark.ml import regression, evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

if enable_grid_search:
  
  #Create a Regressor estimator
  lr_pca = regression.LinearRegression(featuresCol='scaledFeatures', labelCol='price')

  #Establish the grid
  paramGrid = ParamGridBuilder()\
    .addGrid(lr_pca.regParam, [0.0, 0.01, 0.02, 0.03, 0.4, 0.7])\
    .addGrid(lr_pca.elasticNetParam, [0.0, 0.2, 0.4, 0.5, 0.7])\
    .build()
    
  # Establish the evaluator
  evaluator = RegressionEvaluator()\
                .setMetricName("mse")\
                .setLabelCol("price")\
                .setPredictionCol("prediction")                

  #Establish the Cross Validation
  cv = CrossValidator()\
    .setEstimator(lr_pca)\
    .setEstimatorParamMaps(paramGrid)\
    .setEvaluator(evaluator)\
    .setNumFolds(3)

  cv_final_model_fitted_pca = cv.fit(reg_df_prepped_pca)

 
  print("The resulting scores from 3-folds cross validation on the data: ", cv_final_model_fitted_pca.avgMetrics)
  print("The lowest score from 3-folds cross validation on the data: ", min(cv_final_model_fitted_pca.avgMetrics))
  print("The best model is", cv_final_model_fitted_pca.bestModel)

  pass

The resulting scores from 3-folds cross validation on the data:  [43612662.6041002, 43612662.6041002, 43612662.6041002, 43612662.6041002, 43612662.6041002, 43612671.08993922, 43613044.41068794, 43613071.73616887, 43613071.9584053, 43612714.175368525, 43612671.07850227, 43613071.50514676, 43612769.00886549, 43612034.54705628, 43611996.39532416, 43612671.06786284, 43612791.76957442, 43612063.40047908, 43611879.55416459, 43612876.49999614, 43612671.248065084, 43616206.95297031, 43616014.94558488, 43614062.009351484, 43613001.324314974, 43612672.21330192, 43613790.44200804, 43612957.8805073, 43613476.5388089, 43614121.24901378]
The lowest score from 3-folds cross validation on the data:  43611879.55416459
The best model is LinearRegressionModel: uid=LinearRegression_0ac5c30c04b1, numFeatures=36


In [53]:
# Let's extract the param map
cv.extractParamMap()

 Param(parent='CrossValidator_c706f244d2ce', name='estimator', doc='estimator to be cross-validated'): LinearRegression_0ac5c30c04b1,
 Param(parent='CrossValidator_c706f244d2ce', name='estimatorParamMaps', doc='estimator param maps'): [{Param(parent='LinearRegression_0ac5c30c04b1', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.0,
   Param(parent='LinearRegression_0ac5c30c04b1', name='regParam', doc='regularization parameter (>= 0).'): 0.0},
  {Param(parent='LinearRegression_0ac5c30c04b1', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.2,
   Param(parent='LinearRegression_0ac5c30c04b1', name='regParam', doc='regularization parameter (>= 0).'): 0.0},
  {Param(parent='LinearRegression_0ac5c30c04b1', name='elasticNetParam', doc='the ElasticNet mixing parame

## It turns out the best model had the following hyperparameter values:
elasticNetParam = 0.5

regParam = 0.03

The lowest score from 3-folds cross validation on the data: 64596023


In [54]:
# Now we'll partition the data
training_reg_df_prepped_pca, testing_reg_df_prepped_pca = reg_df_prepped_pca.randomSplit([0.7, 0.3])

In [55]:
# Inspect the split
[training_reg_df_prepped_pca.count(), testing_reg_df_prepped_pca.count()]

[212671, 91127]

Now I'll create a new model and set elasticNetParam = 0 and solver to 0 so I can extract P-values

In [56]:
#Create a Regressor estimator with PCA
lr_pca_tuned2 = regression.LinearRegression(featuresCol='scaledFeatures', labelCol='price', elasticNetParam=0.5, regParam=0.03)

# Create the model and preditions
trainModel_pca = lr_pca_tuned2.fit(training_reg_df_prepped_pca)

predicitons_pca = trainModel_pca.transform(testing_reg_df_prepped_pca)

# Establish the evaluator
evaluator_pca = RegressionEvaluator()\
  .setMetricName("mse")\
  .setLabelCol("price")\
  .setPredictionCol("prediction") 

TestMSE3 = evaluator_pca.evaluate(trainModel_pca.transform(testing_reg_df_prepped_pca))

print("Test data MSE :", TestMSE3) 

Test data MSE : 43829980.91780948


In [57]:
# Based on what we learned above, lets review some of the important statistics
trainModel_pca_Summary = trainModel_pca.summary
print("Fit model values")
print("********************")
print("MSE: %f" % round(trainModel_pca_Summary.meanSquaredError))
print("r2: %f" % trainModel_pca_Summary.r2)
print("Explained Variance: %f" % round(trainModel_pca_Summary.explainedVariance))

print("********************")
print("Test data MSE :", round(TestMSE3)) 

Fit model values
********************
MSE: 43482553.000000
r2: 0.718172
Explained Variance: 110832635.000000
********************
Test data MSE : 43829981
