### Spark installation

In [2]:
import findspark
findspark.init()

### importing pandas

In [3]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

### create spark session

In [4]:
from pyspark.sql.session import SparkSession

spark = SparkSession.builder\
                    .appName("Yelp Kaggle")\
                    .getOrCreate()

print(f"This cluster relies on Spark '{spark.version}'")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


This cluster relies on Spark '3.2.1'


In [5]:
from  pyspark.sql.functions import input_file_name

# DataFrame creation
kaggleyelp = spark.read.json("hdfs://localhost:9000/datalake/raw/yelpkaggle")

# The inferred schema can be visualized using the printSchema() method - definitely semi-structured data.
kaggleyelp.printSchema()

                                                                                

root
 |-- address: string (nullable = true)
 |-- attributes: struct (nullable = true)
 |    |-- AcceptsInsurance: string (nullable = true)
 |    |-- AgesAllowed: string (nullable = true)
 |    |-- Alcohol: string (nullable = true)
 |    |-- Ambience: string (nullable = true)
 |    |-- BYOB: string (nullable = true)
 |    |-- BYOBCorkage: string (nullable = true)
 |    |-- BestNights: string (nullable = true)
 |    |-- BikeParking: string (nullable = true)
 |    |-- BusinessAcceptsBitcoin: string (nullable = true)
 |    |-- BusinessAcceptsCreditCards: string (nullable = true)
 |    |-- BusinessParking: string (nullable = true)
 |    |-- ByAppointmentOnly: string (nullable = true)
 |    |-- Caters: string (nullable = true)
 |    |-- CoatCheck: string (nullable = true)
 |    |-- Corkage: string (nullable = true)
 |    |-- DietaryRestrictions: string (nullable = true)
 |    |-- DogsAllowed: string (nullable = true)
 |    |-- DriveThru: string (nullable = true)
 |    |-- GoodForDancing: str

In [6]:
# check all the columns
kaggleyelp.columns

['address',
 'attributes',
 'business_id',
 'categories',
 'city',
 'hours',
 'is_open',
 'latitude',
 'longitude',
 'name',
 'postal_code',
 'review_count',
 'stars',
 'state']

In [7]:
# check the type of the dataset
type(kaggleyelp)

pyspark.sql.dataframe.DataFrame

In [8]:
# select the columns that we need
df= kaggleyelp.select("business_id","city","state","categories","name","stars","review_count")

In [9]:
# a preview of the dataset with the selected columns
df.show()

+--------------------+--------------+-----+--------------------+--------------------+-----+------------+
|         business_id|          city|state|          categories|                name|stars|review_count|
+--------------------+--------------+-----+--------------------+--------------------+-----+------------+
|Pns2l4eNsfO8kk83d...| Santa Barbara|   CA|Doctors, Traditio...|Abby Rappoport, L...|  5.0|           7|
|mpf3x-BjTdTEA3yCZ...|        Affton|   MO|Shipping Centers,...|       The UPS Store|  3.0|          15|
|tUFrWirKiKi_TAnsV...|        Tucson|   AZ|Department Stores...|              Target|  3.5|          22|
|MTSW4McQd7CbVtyjq...|  Philadelphia|   PA|Restaurants, Food...|  St Honore Pastries|  4.0|          80|
|mWMc6_wTdE0EUBKIG...|    Green Lane|   PA|Brewpubs, Breweri...|Perkiomen Valley ...|  4.5|          13|
|CF33F8-E6oudUQ46H...|  Ashland City|   TN|Burgers, Fast Foo...|      Sonic Drive-In|  2.0|           6|
|n_0UpQx1hsNbnPUSl...|     Brentwood|   MO|Sporting Goo

In [10]:
# Business question that we need to answer. 1. Top 10 states with the higest average rating but with review_counts more than 1000
# 2. Top 10 states with the higesht number of businesses listed on yelp
from  pyspark.sql.functions import explode, min, max, avg, sum, count
AVGperstate = df.groupBy("state").agg(avg("stars"),sum("review_count"))



In [11]:
# getting the average rating and count of reviews by state
AVGperstate.show()

[Stage 2:>                                                          (0 + 2) / 2]

+-----+------------------+-----------------+
|state|        avg(stars)|sum(review_count)|
+-----+------------------+-----------------+
|   AZ|3.5920096852300243|           412639|
|   LA| 3.679161628375655|           743176|
|   NJ|3.4591143392689783|           249837|
|   MI|               2.5|                9|
|   NV|3.7368762151652626|           409950|
|   ID|3.7076337586747257|           152086|
|   CA|3.9967326542379396|           339637|
|   MT|               5.0|                6|
|   NC|               2.0|               29|
|   DE|3.3549668874172185|            67370|
|   MO| 3.546091817098873|           483897|
|   IL|3.3696969696969696|            49676|
|   WA|               3.5|               19|
|   IN|3.5882457544234017|           472565|
|   TN| 3.571499668214997|           598195|
|   PA|3.5730191838773173|          1540790|
|   AB| 3.447514803516957|           105477|
|   TX|             2.875|               33|
|   FL|3.6109570831750855|          1119926|
|   CO|   

                                                                                

In [12]:
# selecting the states with more than 1000 ratings
highestaveragestate=AVGperstate.select('state','avg(stars)','sum(review_count)')\
.where(AVGperstate['sum(review_count)'] > 1000)\
.orderBy(AVGperstate['avg(stars)'].desc())
                    

In [13]:
#answer to our first business question
highestaveragestate.show(5)

[Stage 5:>                                                          (0 + 2) / 2]

+-----+------------------+-----------------+
|state|        avg(stars)|sum(review_count)|
+-----+------------------+-----------------+
|   CA|3.9967326542379396|           339637|
|   NV|3.7368762151652626|           409950|
|   ID|3.7076337586747257|           152086|
|   LA| 3.679161628375655|           743176|
|   FL|3.6109570831750855|          1119926|
+-----+------------------+-----------------+
only showing top 5 rows



                                                                                

In [14]:
# now moving to our second business question- state with the higest number of restaurants
higeshstate = df.groupBy("state").agg(count("business_id"))

In [15]:
higeshstate=higeshstate.select('state','count(business_id)')\
.orderBy(higeshstate['count(business_id)'].desc())

In [16]:
# answer to our second business question
higeshstate.show(5)

[Stage 8:>                                                          (0 + 2) / 2]

+-----+------------------+
|state|count(business_id)|
+-----+------------------+
|   PA|             34039|
|   FL|             26330|
|   TN|             12056|
|   IN|             11247|
|   MO|             10913|
+-----+------------------+
only showing top 5 rows



                                                                                

In [19]:
df.write.format("parquet").saveAsTable("yelpdata2")

                                                                                

In [20]:
spark.sql("select * from yelpdata2").show()

+--------------------+--------------+-----+--------------------+--------------------+-----+------------+
|         business_id|          city|state|          categories|                name|stars|review_count|
+--------------------+--------------+-----+--------------------+--------------------+-----+------------+
|Pns2l4eNsfO8kk83d...| Santa Barbara|   CA|Doctors, Traditio...|Abby Rappoport, L...|  5.0|           7|
|mpf3x-BjTdTEA3yCZ...|        Affton|   MO|Shipping Centers,...|       The UPS Store|  3.0|          15|
|tUFrWirKiKi_TAnsV...|        Tucson|   AZ|Department Stores...|              Target|  3.5|          22|
|MTSW4McQd7CbVtyjq...|  Philadelphia|   PA|Restaurants, Food...|  St Honore Pastries|  4.0|          80|
|mWMc6_wTdE0EUBKIG...|    Green Lane|   PA|Brewpubs, Breweri...|Perkiomen Valley ...|  4.5|          13|
|CF33F8-E6oudUQ46H...|  Ashland City|   TN|Burgers, Fast Foo...|      Sonic Drive-In|  2.0|           6|
|n_0UpQx1hsNbnPUSl...|     Brentwood|   MO|Sporting Goo

In [21]:
# using spark SQL we will now get the top rated restaurant in New Orleans with more than 1000 review count
spark.sql("select name,stars from yelpdata2 where city='New Orleans' and review_count>1000 order by stars desc limit 1").toPandas()

Unnamed: 0,name,stars
0,District Donuts Sliders Brew,4.5
