## Importing necessary libs

In [None]:
from pyspark.context import SparkContext
import time
from pyspark.sql.session import SparkSession

In [None]:
# Start (or reuse) the Spark session for this notebook under the app name "yelp".
spark = (
    SparkSession.builder
    .appName("yelp")
    .getOrCreate()
)

In [None]:
from pyspark.ml.feature import StopWordsRemover,Tokenizer, CountVectorizer, Word2Vec, IDF,NGram
from pyspark.ml import Pipeline
import pyspark.sql.functions as f
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Reading data

In [None]:
path = "gs://bdl2021_final_project/yelp_train.json"  # Path of the data
# Declare the three columns we need up front: with an explicit schema Spark
# skips the extra schema-inference pass over the JSON input and parses only
# these fields. Column names, order and types match the previous
# inferred-then-selected result.
review_schema = "review_id STRING, text STRING, stars DOUBLE"
yelpDF = spark.read.schema(review_schema).json(path)
yelpDF.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- text: string (nullable = true)
 |-- stars: double (nullable = true)



In [None]:
# Number of rows (reviews) in the loaded DataFrame; shown as the cell output.
yelpDF.count()

7863924

## Cleaning the data

In [None]:
# Strip every character that is not a letter, digit, whitespace, "+" or apostrophe.
# Raw strings keep the regex escapes intact without relying on Python's
# deprecated handling of unknown escape sequences such as "\s".
# NOTE(review): inside a character class "+" is a literal, so plus signs are
# kept in the text — confirm this is intended.
yelpDF = yelpDF.withColumn("text_raw", f.regexp_replace("text", r"[^a-zA-Z0-9\s+']", ""))
# Trim leading/trailing spaces, then split on runs of whitespace to get tokens.
yelpDF = yelpDF.withColumn("text_split", f.split(f.trim("text_raw"), r"\s+"))
# Rejoin the tokens with single spaces to get a whitespace-normalized string.
yelpDF = yelpDF.withColumn("text_clean", f.array_join("text_split", " "))

In [None]:
# Filter stop words out of the token arrays; the result goes to a new column
# ("text_new") and the original "text_split" column is left in place.
remover = StopWordsRemover(
    outputCol="text_new",
    inputCol="text_split",
)
yelpDF = remover.transform(yelpDF)

## Finding bigrams

In [None]:
# Build bigrams (pairs of consecutive tokens) from the stop-word-filtered tokens.
n = NGram(inputCol="text_new", outputCol="ngrams", n=2)
# Expand each review's bigram list into one row per bigram (column "words")
# so that individual bigrams can be grouped and counted.
yelpDF = n.transform(yelpDF).withColumn("words", f.explode(f.col("ngrams")))

## Separating bigrams by class

In [None]:
# Sentiment label per row: reviews with fewer than 4 stars get -1, everything else +1.
is_low_rating = f.col("stars") < 4
yelpDF = yelpDF.withColumn("pos", f.when(is_low_rating, -1).otherwise(1))

In [None]:
# Count occurrences of each bigram separately for each sentiment label.
yelpDF_counts = yelpDF.groupBy(["words", "pos"]).count()

In [None]:
# Sign each count with its label (+1 / -1) so negative-class occurrences
# subtract from a bigram's total when the rows are summed.
signed_count = f.col("pos") * f.col("count")
yelpDF_counts = yelpDF_counts.withColumn("pos_count", signed_count)

In [None]:
# Net score per bigram: positive-class count minus negative-class count.
# The aggregated column is named "sum(pos_count)".
per_bigram = yelpDF_counts.groupBy("words")
yelpDF_counts = per_bigram.sum("pos_count")

## Bigrams for Positive Class

In [None]:
# Show the 50 bigrams with the highest net score (most skewed toward the
# positive class) and print the elapsed time of the query in seconds.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
st_time = time.perf_counter()
yelpDF_counts.select("words", "sum(pos_count)").orderBy("sum(pos_count)", ascending=False).show(50)
print(time.perf_counter() - st_time)

+--------------------+--------------+
|               words|sum(pos_count)|
+--------------------+--------------+
|    highly recommend|        194890|
|         really good|        116380|
|     definitely back|         94015|
|           Las Vegas|         91616|
|       great service|         85334|
|          first time|         83969|
|            one best|         81836|
|           ice cream|         80065|
|      staff friendly|         77195|
|          food great|         77191|
|          love place|         75198|
|         great place|         74851|
|       service great|         74068|
|    Highly recommend|         70988|
|           come back|         70942|
|definitely recommend|         67574|
|          great food|         67257|
|             5 stars|         66628|
|           next time|         66076|
|    great experience|         61424|
|      super friendly|         59551|
|     recommend place|         57524|
|           great job|         57487|
|           

## Bigrams for Negative class

In [None]:
# Show the 50 bigrams with the lowest net score (most skewed toward the
# negative class) and print the elapsed time of the query in seconds.
# perf_counter() is a monotonic clock, so the measured interval cannot be
# distorted by system clock adjustments the way time.time() can.
st_time = time.perf_counter()
yelpDF_counts.select("words", "sum(pos_count)").orderBy("sum(pos_count)", ascending=True).show(50)
print(time.perf_counter() - st_time)

+----------------+--------------+
|           words|sum(pos_count)|
+----------------+--------------+
|     tasted like|        -39397|
|      20 minutes|        -39066|
|      10 minutes|        -38397|
|      15 minutes|        -34945|
|   minutes later|        -32226|
|  somewhere else|        -31669|
|       call back|        -30697|
|      waste time|        -30105|
|         told us|        -29498|
|       came back|        -29418|
|         3 stars|        -28774|
|      30 minutes|        -28575|
|customer service|        -28330|
|         2 stars|        -27162|
|     looked like|        -25559|
| nothing special|        -25164|
|     credit card|        -24546|
|        never go|        -24116|
|      front desk|        -23856|
|          1 star|        -20707|
|        one star|        -19880|
|      45 minutes|        -19119|
|     minutes get|        -18619|
|       last time|        -18605|
|    someone else|        -17644|
|      money back|        -16769|
|      never c

In [None]:
# Persist the per-bigram net scores to cloud storage in CSV format.
word_scores = yelpDF_counts.select("words", "sum(pos_count)")
csv_writer = word_scores.write.format("csv")
csv_writer.save("gs://model-bucket-bdl/word_counts.csv")