# Yelp Dataset Linear Regression Analysis

## Step: Loading packages and connecting to Spark cluster

In [6]:
#importing spark session
import findspark

findspark.init()

In [3]:
#importing library
import pyspark
from pyspark.sql import SparkSession

In [7]:
# Unified entry point to the cluster: reuse the active session or create one
spark = SparkSession.builder.getOrCreate()

In [8]:
# Handles derived from the session: `sqlContext` is simply another name for the
# SparkSession object, and `sc` is the SparkContext it wraps.
sqlContext = spark
sc = spark.sparkContext

In [9]:
# Look up the YARN proxy URI configured for the current application
spark.conf.get(
    'spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES'
)

'http://ist-s-718-01.syr.edu:8088/proxy/application_1490645795897_0598'

In [10]:
#importing functions
from pyspark.sql import functions as fn

In [11]:
#importing models
from pyspark.ml import feature
from pyspark.ml import regression
from pyspark.ml import Pipeline, PipelineModel

## Step: Convert the dataset into a Spark DataFrame and perform data cleaning operations

In [14]:
# Load the user-review data from CSV (first line is the header, UTF-8 encoded).
# `.csv()` selects Spark's CSV data source on its own, so the former
# `.format('com.databricks.spark.csv')` call had no effect and is dropped.
# No schema is supplied or inferred, so every column is read as a string.
df_all = (
    spark.read
    .option('header', 'true')
    .option('charset', 'UTF-8')
    .csv('df_all.csv')
)

In [19]:
# Print the column names and types of the loaded review data
df_all.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- Unnamed: 0: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- name: string (nullable = true)
 |-- business_rating: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- review_rating: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- cool: string (nullable = true)
 |-- compliment_list: string (nullable = true)
 |-- compliment_hot: string (nullable = true)
 |-- compliment_cute: string (nullable = true)
 |-- compliment_note: string (nullable = true)
 |-- compliment_photos: string (nullable = true)
 |-- compliment_more: string (nullable = true)
 |-- compliment_writer: string (nullable = true)
 |-- compliment_plain: string (nullable = true)
 |-- user_avg_rating: string (nullable = true)
 |-- co

In [23]:
# Preview the first 5 rows of the raw data
df_all.show(5)

+---+----------+--------------------+--------------------+------------------+-------------------+-----------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------------------+------------------+
|_c0|Unnamed: 0|         business_id|          attributes|      review_count|               name|  business_rating|             funny|             user_id|           review_id|                text|       review_rating|              useful|                cool|     compliment_list|      compliment_hot|     compliment_cute|     compliment_note|   compliment_photos|     compliment_more|   compliment_writer|    compliment_plain|     user_avg_rating|  compliment_profile

In [46]:
# Columns carried over unchanged into the modelling frame.
base_cols = ['review_id', 'business_id', 'user_id', 'review_rating', 'user_avg_rating',
             'business_rating', 'useful', 'cool', 'funny']

# Compliment counters that are collapsed into one averaged feature.
# The order is the order in which they are added together below.
compliment_cols = ['compliment_profile', 'compliment_cool', 'compliment_funny',
                   'compliment_plain', 'compliment_writer', 'compliment_more',
                   'compliment_photos', 'compliment_note', 'compliment_cute',
                   'compliment_list', 'compliment_hot']

# Remove every row that has a NULL in any of the columns used below.
df_all = df_all.na.drop(subset=base_cols + compliment_cols)

# Add the compliment columns one by one. They are string columns in the CSV;
# the arithmetic makes Spark produce a double-typed result column.
compliment_sum = fn.col(compliment_cols[0])
for compliment_name in compliment_cols[1:]:
    compliment_sum = compliment_sum + fn.col(compliment_name)

# Divide by the number of compliment columns instead of a hand-written 11 so the
# average stays correct if the list above is ever edited.
compliment_average = (compliment_sum / len(compliment_cols)).alias('compliment_average')

df_ratings = df_all.select(base_cols + [compliment_average])

In [47]:
# Show the schema of the selected columns, then discard any row that has a NULL
# in one of the columns listed below.
df_ratings.printSchema()
required_cols = ['review_id', 'business_id', 'user_id', 'review_rating', 'user_avg_rating',
                 'business_rating', 'useful', 'cool', 'funny', 'compliment_average']
df_ratings = df_ratings.dropna(subset=required_cols)


root
 |-- review_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- review_rating: string (nullable = true)
 |-- user_avg_rating: string (nullable = true)
 |-- business_rating: string (nullable = true)
 |-- useful: string (nullable = true)
 |-- cool: string (nullable = true)
 |-- funny: string (nullable = true)
 |-- compliment_average: double (nullable = true)



In [48]:
# Convert the rating and vote columns to double, one column at a time and in
# the same order as before.
for numeric_col in ('review_rating', 'user_avg_rating', 'business_rating',
                    'useful', 'cool', 'funny'):
    df_ratings = df_ratings.withColumn(numeric_col, fn.col(numeric_col).cast('double'))

In [56]:
# Discard rows with a NULL in any of the listed columns, then print the
# resulting schema.
required_cols = ['review_id', 'business_id', 'user_id', 'review_rating', 'user_avg_rating',
                 'business_rating', 'useful', 'cool', 'funny', 'compliment_average']
df_ratings = df_ratings.dropna(subset=required_cols)
df_ratings.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- review_rating: double (nullable = true)
 |-- user_avg_rating: double (nullable = true)
 |-- business_rating: double (nullable = true)
 |-- useful: double (nullable = true)
 |-- cool: double (nullable = true)
 |-- funny: double (nullable = true)
 |-- compliment_average: double (nullable = true)



In [57]:
# Preview the first 100 rows of the cleaned, numeric ratings frame
df_ratings.show(100)

+--------------------+--------------------+--------------------+-------------+---------------+---------------+------+----+-----+-------------------+
|           review_id|         business_id|             user_id|review_rating|user_avg_rating|business_rating|useful|cool|funny| compliment_average|
+--------------------+--------------------+--------------------+-------------+---------------+---------------+------+----+-----+-------------------+
|xqFpx6FkEpjow6JAh...|XqDeiaPSG0-fBbOXD...|efaUGV60LFI4v6bWP...|          5.0|           4.08|            3.5|   1.0| 1.0|  1.0|                0.0|
|fpq7iwxvRdG9vssv0...|zC7ldegnDoXg-Wln5...|efaUGV60LFI4v6bWP...|          5.0|           4.08|            5.0|   0.0| 0.0|  0.0|                0.0|
|jDOVgU7ICogRtpLrX...|O7RMINvCcGVNTMlD7...|efaUGV60LFI4v6bWP...|          5.0|           4.08|            4.5|   0.0| 0.0|  0.0|                0.0|
|NEP1fCrzA7i50c875...|oYwLxROH5RihyFxrd...|hPHsKqUwO_RKJNxkB...|          4.0|           3.33|            

## Step: Split the dataframe into train, validate and test. Apply Linear Regression model

In [58]:
# Split into training / validation / testing sets with weights 0.6 / 0.3 / 0.1.
# The seed is fixed so the split is repeatable.
split_weights = [0.6, 0.3, 0.1]
data_training, data_validation, data_testing = df_ratings.randomSplit(split_weights, seed=0)

In [59]:
# One VectorAssembler per candidate feature set; each writes its vector to the
# 'features' column that the regression reads.

# Model 1: no input columns at all
va_1 = feature.VectorAssembler(inputCols=[], outputCol='features')

# Model 2: the two rating averages
va_2 = feature.VectorAssembler(
    inputCols=['user_avg_rating', 'business_rating'],
    outputCol='features')

# Model 3: review votes plus the averaged compliments
va_3 = feature.VectorAssembler(
    inputCols=['useful', 'funny', 'cool', 'compliment_average'],
    outputCol='features')

# Model 4: user average rating plus votes and compliments
va_4 = feature.VectorAssembler(
    inputCols=['user_avg_rating', 'cool', 'funny', 'useful', 'compliment_average'],
    outputCol='features')

# Model 5: business rating plus votes and compliments
va_5 = feature.VectorAssembler(
    inputCols=['business_rating', 'cool', 'funny', 'useful', 'compliment_average'],
    outputCol='features')

# Models 6-8: a single predictor each
va_6 = feature.VectorAssembler(inputCols=['user_avg_rating'], outputCol='features')
va_7 = feature.VectorAssembler(inputCols=['business_rating'], outputCol='features')
va_8 = feature.VectorAssembler(inputCols=['compliment_average'], outputCol='features')

# Model 9: every predictor
va_9 = feature.VectorAssembler(
    inputCols=['user_avg_rating', 'business_rating', 'cool', 'funny', 'useful',
               'compliment_average'],
    outputCol='features')

In [60]:
# Linear regression estimator: predicts 'review_rating' from the assembled
# 'features' vector. It is shared by all the pipelines.
lr = regression.LinearRegression(
    labelCol='review_rating',
    featuresCol='features',
)

In [62]:
# Fit one (assembler -> linear regression) pipeline per feature set on the
# training data; p_k is the fitted model built from va_k.
p_1, p_2, p_3, p_4, p_5, p_6, p_7, p_8, p_9 = [
    Pipeline(stages=[assembler, lr]).fit(data_training)
    for assembler in (va_1, va_2, va_3, va_4, va_5, va_6, va_7, va_8, va_9)
]

In [63]:
# Mean squared error as a reusable column expression: the average of
# (prediction - review_rating) squared over whatever frame it is selected from.
squared_error = (fn.col('prediction') - fn.col('review_rating')) ** 2
MSE = fn.avg(squared_error)

In [64]:
#checking MSE for each model on the training dataset (disabled)
#NOTE: the training split is named data_training, not data_train
#p_1.transform(data_training).select(MSE.alias('p1_train_MSE')).show()
#p_2.transform(data_training).select(MSE.alias('p2_train_MSE')).show()
#p_3.transform(data_training).select(MSE.alias('p3_train_MSE')).show()
#p_4.transform(data_training).select(MSE.alias('p4_train_MSE')).show()
#p_5.transform(data_training).select(MSE.alias('p5_train_MSE')).show()
#p_6.transform(data_training).select(MSE.alias('p6_train_MSE')).show()

In [65]:
# Validation MSE for each fitted model, shown in order as p1_val_MSE .. p9_val_MSE
fitted_models = [p_1, p_2, p_3, p_4, p_5, p_6, p_7, p_8, p_9]
for model_no, fitted in enumerate(fitted_models, start=1):
    val_alias = 'p{}_val_MSE'.format(model_no)
    fitted.transform(data_validation).select(MSE.alias(val_alias)).show()

#print('p1_val_MSE = ' + str(p1_val_MSE))
#print('p2_val_MSE = ' + str(p2_val_MSE))
#print('p3_val_MSE = ' + str(p3_val_MSE))
#print('p4_val_MSE = ' + str(p4_val_MSE))
#print('p5_val_MSE = ' + str(p5_val_MSE))
#print('p6_val_MSE = ' + str(p6_val_MSE))

+------------------+
|        p1_val_MSE|
+------------------+
|2.4798126440505888|
+------------------+

+------------------+
|        p2_val_MSE|
+------------------+
|1.1929917803273309|
+------------------+

+------------------+
|        p3_val_MSE|
+------------------+
|2.3059850572461795|
+------------------+

+------------------+
|        p4_val_MSE|
+------------------+
|1.5551060127323897|
+------------------+

+-----------------+
|       p5_val_MSE|
+-----------------+
|1.479598526857732|
+-----------------+

+-----------------+
|       p6_val_MSE|
+-----------------+
|1.650384943139921|
+-----------------+

+------------------+
|        p7_val_MSE|
+------------------+
|1.5474097536441018|
+------------------+

+-----------------+
|       p8_val_MSE|
+-----------------+
|2.479770187407096|
+-----------------+

+-----------------+
|       p9_val_MSE|
+-----------------+
|1.143876636061509|
+-----------------+



In [23]:
#Model 9 has the lowest MSE on the validation dataset
#Therefore we will use model 9 on the testing dataset to compute the RMSE

In [66]:
# Test-set RMSE of model 1 (the pipeline whose assembler has no input columns)
p1_test_row = p_1.transform(data_testing).select(fn.sqrt(MSE).alias('RMSE')).first()
RMSE = p1_test_row['RMSE']
print('RMSE = {}'.format(RMSE))

RMSE = 1.5690181398723912


In [67]:
# Test-set RMSE of model 9 (the pipeline that uses every predictor)
p9_test_row = p_9.transform(data_testing).select(fn.sqrt(MSE).alias('RMSE')).first()
RMSE = p9_test_row['RMSE']
print('RMSE = {}'.format(RMSE))

RMSE = 1.0641402017437442


In [None]:
#Now, we use model 9 to predict the review rating for every row of the whole dataset

In [117]:
# Score the whole cleaned dataset with model 9, keeping the identifiers, the
# actual rating and the predicted rating.
# `.show()` only prints and returns None, so it must not be part of the
# assignment; otherwise `predicted` would be None instead of the DataFrame.
predicted = p_9.transform(df_ratings).select(
    'review_id', 'business_id', 'user_id', 'review_rating', 'prediction')
predicted.show(200)

+--------------------+--------------------+--------------------+-------------+------------------+
|           review_id|         business_id|             user_id|review_rating|        prediction|
+--------------------+--------------------+--------------------+-------------+------------------+
|xqFpx6FkEpjow6JAh...|XqDeiaPSG0-fBbOXD...|efaUGV60LFI4v6bWP...|          5.0|3.9390636747457615|
|fpq7iwxvRdG9vssv0...|zC7ldegnDoXg-Wln5...|efaUGV60LFI4v6bWP...|          5.0| 4.977184823738148|
|jDOVgU7ICogRtpLrX...|O7RMINvCcGVNTMlD7...|efaUGV60LFI4v6bWP...|          5.0| 4.633710670291957|
|NEP1fCrzA7i50c875...|oYwLxROH5RihyFxrd...|hPHsKqUwO_RKJNxkB...|          4.0| 4.115774135614004|
|2bk0aJ_6Tq87K8WaZ...|MCX_cBs_LUqzuUzd-...|YEiJOtOBBRKEpRuvZ...|          4.0| 5.315453188076173|
|SNSNuk3Wj7YDSXLQi...|zOqJqDlWT0CF986D0...|YEiJOtOBBRKEpRuvZ...|          5.0| 4.291093736251701|
|u8o4tuv7o7nnrwWyi...|po-05-AGCVxEme-Sb...|YEiJOtOBBRKEpRuvZ...|          5.0|4.8279471786462995|
|mry4Y8tz87Guv-M1Y..