In [22]:
# Make the locally installed Spark importable as `pyspark`; this must run
# before any `pyspark` import further down (it does not install Spark itself).
import findspark
findspark.init()

In [23]:
# Import pandas and lift the column-width limit so long cell values are
# displayed in full rather than truncated.
import pandas as pd
pd.options.display.max_colwidth = None

In [25]:
# Start the Spark session for this notebook, or reuse one that already exists.
from pyspark.sql.session import SparkSession

spark = (
    SparkSession.builder
    .appName("Yelp Analysis")
    .getOrCreate()
)

# Report which Spark version the session is running on.
print(f"This cluster relies on Spark '{spark.version}'")

This cluster relies on Spark '3.2.1'


In [26]:
from pyspark.sql.functions import input_file_name

# Load every raw Yelp JSON file under the data-lake folder into one DataFrame.
# NOTE(review): the folder sits under "tfl/bikepoint" - confirm this is the
# intended location for the Yelp extracts.
Juneyelp = (
    spark.read
    .format("json")
    .load("hdfs://localhost:9000/datalake/raw/tfl/bikepoint/yelp/*")
)

# Print the schema Spark inferred from the files (nested structs and arrays,
# i.e. semi-structured data).
# NOTE(review): the recorded schema contains `_corrupt_record`, which suggests
# some input records could not be parsed - check the source files.
Juneyelp.printSchema()

                                                                                

root
 |-- _corrupt_record: string (nullable = true)
 |-- alias: string (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- alias: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |-- center: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- coordinates: struct (nullable = true)
 |    |-- latitude: double (nullable = true)
 |    |-- longitude: double (nullable = true)
 |-- display_phone: string (nullable = true)
 |-- distance: double (nullable = true)
 |-- id: string (nullable = true)
 |-- image_url: string (nullable = true)
 |-- is_closed: boolean (nullable = true)
 |-- location: struct (nullable = true)
 |    |-- address1: string (nullable = true)
 |    |-- address2: string (nullable = true)
 |    |-- address3: string (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- country: string (nullable = true)
 | 

In [27]:
# Keep only the columns the business questions rely on.
Yelpdataframe = Juneyelp.select(
    "categories",
    "name",
    "rating",
    "review_count",
    "transactions",
    "price",
)

In [28]:
# Preview the selection: first 20 rows with long values truncated
# (the same settings show() uses when called without arguments).
Yelpdataframe.show(n=20, truncate=True)

+--------------------+--------------------+------+------------+------------+-----+
|          categories|                name|rating|review_count|transactions|price|
+--------------------+--------------------+------+------------+------------+-----+
|[{tapas, Tapas Ba...|              El Sur|   4.5|         721|          []|   €€|
|[{chocolate, Choc...|Chocolatería San ...|   4.0|        1233|          []|    €|
|[{museums, Museums}]|Museo Nacional de...|   4.5|         413|          []| null|
|[{parks, Parks}, ...|Estanque Grande d...|   4.5|         286|          []| null|
|[{mexican, Mexican}]|     Takos al Pastor|   4.5|         394|          []|    €|
|[{spanish, Spanis...|            Alhambra|   4.0|         443|          []|   €€|
|[{spanish, Spanis...|               Botín|   4.0|         574|          []|  €€€|
|[{tapasmallplates...|       Juana la Loca|   4.5|         208|          []|  €€€|
|[{coffee, Coffee ...|           Toma Café|   4.5|         210|          []|    €|
|[{m

In [29]:
# Inspect the Spark data type of each selected column.
Yelpdataframe.dtypes

[('categories', 'array<struct<alias:string,title:string>>'),
 ('name', 'string'),
 ('rating', 'double'),
 ('review_count', 'bigint'),
 ('transactions', 'array<string>'),
 ('price', 'string')]

In [30]:
from  pyspark.sql.functions import col


In [31]:
# Business question 1: how many entries carry each rating value
# (e.g. 4.0, 4.5 and 5.0)?
(
    Yelpdataframe
    .groupBy("rating")
    .count()
    .show()
)





+------+-----+
|rating|count|
+------+-----+
|   4.5| 1424|
|  null|  178|
|   4.0|  267|
|   5.0|   89|
+------+-----+



                                                                                

In [32]:
# Business question 2: how are entries distributed across the price tiers,
# i.e. how expensive they are?
(
    Yelpdataframe
    .groupBy("price")
    .count()
    .show()
)




+-----+-----+
|price|count|
+-----+-----+
| null|  534|
|   €€|  623|
|  €€€|  267|
|    €|  534|
+-----+-----+



                                                                                

In [33]:
# Recode the euro-sign price tiers as the digit strings "1" to "3".
# The column is still a string at this point; any value that matches none of
# the tiers (including null) is passed through unchanged.
from pyspark.sql.functions import when

df3 = Yelpdataframe.withColumn(
    "price",
    when(Yelpdataframe["price"] == "€", "1")
    .when(Yelpdataframe["price"] == "€€", "2")
    .when(Yelpdataframe["price"] == "€€€", "3")
    .otherwise(Yelpdataframe["price"]),
)
df3.show()

+--------------------+--------------------+------+------------+------------+-----+
|          categories|                name|rating|review_count|transactions|price|
+--------------------+--------------------+------+------------+------------+-----+
|[{tapas, Tapas Ba...|              El Sur|   4.5|         721|          []|    2|
|[{chocolate, Choc...|Chocolatería San ...|   4.0|        1233|          []|    1|
|[{museums, Museums}]|Museo Nacional de...|   4.5|         413|          []| null|
|[{parks, Parks}, ...|Estanque Grande d...|   4.5|         286|          []| null|
|[{mexican, Mexican}]|     Takos al Pastor|   4.5|         394|          []|    1|
|[{spanish, Spanis...|            Alhambra|   4.0|         443|          []|    2|
|[{spanish, Spanis...|               Botín|   4.0|         574|          []|    3|
|[{tapasmallplates...|       Juana la Loca|   4.5|         208|          []|    3|
|[{coffee, Coffee ...|           Toma Café|   4.5|         210|          []|    1|
|[{m

In [34]:
# Change the data type of the price column from string to integer.
from pyspark.sql.types import IntegerType

df3 = df3.withColumn(
    "price",
    df3.price.cast(IntegerType()),
)
df3.show()


+--------------------+--------------------+------+------------+------------+-----+
|          categories|                name|rating|review_count|transactions|price|
+--------------------+--------------------+------+------------+------------+-----+
|[{tapas, Tapas Ba...|              El Sur|   4.5|         721|          []|    2|
|[{chocolate, Choc...|Chocolatería San ...|   4.0|        1233|          []|    1|
|[{museums, Museums}]|Museo Nacional de...|   4.5|         413|          []| null|
|[{parks, Parks}, ...|Estanque Grande d...|   4.5|         286|          []| null|
|[{mexican, Mexican}]|     Takos al Pastor|   4.5|         394|          []|    1|
|[{spanish, Spanis...|            Alhambra|   4.0|         443|          []|    2|
|[{spanish, Spanis...|               Botín|   4.0|         574|          []|    3|
|[{tapasmallplates...|       Juana la Loca|   4.5|         208|          []|    3|
|[{coffee, Coffee ...|           Toma Café|   4.5|         210|          []|    1|
|[{m

In [35]:
# Confirm that the price column is now reported as an integer type.
df3.dtypes

[('categories', 'array<struct<alias:string,title:string>>'),
 ('name', 'string'),
 ('rating', 'double'),
 ('review_count', 'bigint'),
 ('transactions', 'array<string>'),
 ('price', 'int')]

In [36]:
# Pearson correlation between price and rating.
# Both columns contain nulls (see the group-by counts earlier in the notebook),
# so rows missing either value are dropped first. DataFrame.corr does not
# document how it treats nulls - some Spark versions appear to read them as
# 0.0 (verify) - and leaving them in would distort the coefficient.
df3.dropna(subset=["price", "rating"]).corr("price", "rating")

0.38545259281249483

In [37]:
# Pearson correlation between review count and rating.
# The rating column contains nulls (see the group-by counts earlier in the
# notebook), so rows missing either value are dropped first. DataFrame.corr
# does not document how it treats nulls - some Spark versions appear to read
# them as 0.0 (verify) - and leaving them in would distort the coefficient.
df3.dropna(subset=["review_count", "rating"]).corr("review_count", "rating")

                                                                                

0.23341675055870034