In [1]:
# Importing libraries

from pyspark.context import SparkContext
import time
from pyspark.sql.session import SparkSession
from pyspark.ml.feature import StopWordsRemover,Tokenizer, CountVectorizer, Word2Vec, IDF,NGram
from pyspark.ml import Pipeline
import pyspark.sql.functions as f
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Instantiating the spark cluster

spark = SparkSession.builder.appName("yelp").getOrCreate()

In [4]:
# Reading the dataset
path = "gs://bdl2021_final_project/yelp_train.json"
yelpDF = spark.read.json(path).select('review_id','text','stars')
yelpDF.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- stars: double (nullable = true)



In [5]:
# Counting number of words

yelpDF.count()

7863924

In [6]:
# Cleaning the text

yelpDF = yelpDF.withColumn("text_raw", f.regexp_replace("text", "[^a-zA-Z0-9\s+\']", ""))
yelpDF = yelpDF.withColumn("text_split", f.split(f.trim("text_raw"),"\s+"))
yelpDF = yelpDF.withColumn("text_clean", f.array_join("text_split", " "))

In [7]:
#Removing stopwords

remover = StopWordsRemover(inputCol="text_split", outputCol="text_new")
yelpDF = remover.transform(yelpDF)

In [8]:
# Exploding words to rows

yelpDF = yelpDF.withColumn("words",f.explode("text_new"))

In [9]:
# Creating new column - positivity of review

yelpDF = yelpDF.withColumn("pos",f.when(f.col("stars")<4,-1).otherwise(1))

In [10]:
# Grouping by positivity

yelpDF_counts = yelpDF.groupBy("words","pos").count()

In [14]:
# Top Word counts for positive reviews

yelpDF_counts.filter(f.col("pos")==1).orderBy("count",ascending=[0]).show(100)

+----------+---+-------+
|     words|pos|  count|
+----------+---+-------+
|     place|  1|2400894|
|      good|  1|2232807|
|     great|  1|2197960|
|      food|  1|2124657|
|      time|  1|1556942|
|   service|  1|1436538|
|      like|  1|1397743|
|       get|  1|1313742|
|      back|  1|1287812|
|       one|  1|1192169|
|    really|  1|1132196|
|        go|  1|1099093|
|      also|  1| 968418|
|    always|  1| 909719|
|      best|  1| 904205|
|      nice|  1| 879759|
|  friendly|  1| 876579|
|      well|  1| 813898|
|     staff|  1| 795331|
|        us|  1| 794262|
| delicious|  1| 788626|
|   amazing|  1| 788065|
|       got|  1| 782416|
|      love|  1| 776702|
|definitely|  1| 738532|
|       try|  1| 697798|
| recommend|  1| 693124|
|    little|  1| 685126|
|      even|  1| 643336|
|restaurant|  1| 641189|
|      come|  1| 626379|
|experience|  1| 615112|
|      made|  1| 614641|
|      menu|  1| 603345|
|   ordered|  1| 587064|
|     Great|  1| 576775|
|     first|  1| 572451|


In [15]:
# Top Word counts for negative reviews

yelpDF_counts.filter(f.col("pos")==-1).orderBy("count",ascending=[0]).show(100)

+----------+---+-------+
|     words|pos|  count|
+----------+---+-------+
|      food| -1|1450760|
|     place| -1|1279943|
|      good| -1|1217210|
|      like| -1|1199459|
|       get| -1|1199301|
|      time| -1|1132565|
|      back| -1|1057516|
|       one| -1|1045730|
|   service| -1|1045225|
|        us| -1| 824406|
|        go| -1| 821691|
|      even| -1| 705435|
|       got| -1| 697469|
|      said| -1| 694942|
|     order| -1| 646182|
|    really| -1| 645767|
|      told| -1| 609534|
|      came| -1| 576162|
|   ordered| -1| 575204|
|     never| -1| 554784|
|     asked| -1| 514411|
|    people| -1| 509772|
|     great| -1| 495156|
|   minutes| -1| 494160|
|      went| -1| 488309|
|      come| -1| 451243|
|      know| -1| 447584|
|      much| -1| 445384|
|     going| -1| 439954|
|restaurant| -1| 437747|
|         2| -1| 434264|
|    better| -1| 432242|
|      also| -1| 421676|
|     first| -1| 410305|
|      nice| -1| 405993|
|experience| -1| 401940|
|       bad| -1| 398495|
