In [1]:
from operator import add
from pyspark.ml.feature import RegexTokenizer, CountVectorizer
from pyspark.ml.feature import StopWordsRemover, VectorAssembler
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import IDF
from pyspark.ml import Pipeline, PipelineModel

from pyspark.sql.functions import *
from pyspark.sql.types import *

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
#sc = SparkContext(gateway = jg.launch_gateway())

import folium
import html

import pandas as pd
import numpy as np

In [2]:
data_path = '/Volumes/Transcend/dataset/'
model_path = '/Volumes/Transcend/MDS_Yelp/model/'
outout_path = '/Volumes/Transcend/MDS_Yelp/output/'

In [3]:
sc = SparkContext()
#sc = SparkContext('local')
spark = SparkSession(sc)

In [None]:
business_df = spark.read.parquet(data_path + 'business-small.parquet')
business_df.printSchema()

In [5]:
user_df = spark.read.parquet(data_path + 'user-small.parquet')
user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- user_name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [6]:
review_df = spark.read.parquet(data_path + 'review-small.parquet')
review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [7]:
# create SQL view for later queries
review_df.createOrReplaceTempView("reviews")

# create review text dataframe
reviews_text = spark.sql("SELECT user_id, review_text FROM reviews")
reviews_text.show(3)

+--------------------+--------------------+
|             user_id|         review_text|
+--------------------+--------------------+
|u0LXt3Uea_GidxRW1...|Who would have gu...|
|u0LXt3Uea_GidxRW1...|Not bad!! Love th...|
|u0LXt3Uea_GidxRW1...|This is currently...|
+--------------------+--------------------+
only showing top 3 rows



In [8]:
# concatenate all reviews per restuarant

reviews_text_rdd = reviews_text.rdd
reviews_by_user_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_user_df = spark.createDataFrame(reviews_by_user_rdd)
reviews_by_user_df = reviews_by_user_df \
                            .withColumnRenamed('_1', 'user_id') \
                            .withColumnRenamed('_2', 'text')
reviews_by_user_df.count()

73041

In [9]:
## Example of using Word2vec
from pyspark.ml.feature import Word2Vec

# Input data: Each row is a bag of words from a sentence or document.
documentDF = spark.createDataFrame([
    ("Hi I heard about Spark".split(" "), ),
    ("I wish Java could use case classes".split(" "), ),
    ("Logistic regression models are neat".split(" "), )
], ["text"])

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="result")
model = word2Vec.fit(documentDF)

result = model.transform(documentDF)
for row in result.collect():
    text, vector = row
    print("Text: [%s] => \nVector: %s\n" % (", ".join(text), str(vector)))

Text: [Hi, I, heard, about, Spark] => 
Vector: [0.02326762331649661,0.008931299671530724,-0.06394885405898094]

Text: [I, wish, Java, could, use, case, classes] => 
Vector: [-0.009175857529044151,-0.024911361613443917,0.012272004171141555]

Text: [Logistic, regression, models, are, neat] => 
Vector: [-0.005160079896450043,-0.0005152661353349686,0.014656295813620091]



In [10]:
## https://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.feature.Word2Vec

## The minimum number of times a token must appear to be included in the word2vec model's vocabulary"

In [None]:
%%time

# create text processing pipeline -- this a lengthy resource-intensive process
# Build the pipeline 
regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'token')
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')
countVectorizer = CountVectorizer(inputCol="nostopwrd", outputCol="rawFeature")
iDF = IDF(inputCol="rawFeature", outputCol="idf_vec")
word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec', seed=123)
vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover, countVectorizer, iDF, word2Vec, vectorAssembler])

# fit the model
pipeline_mdl = pipeline.fit(reviews_by_user_df)

#save the pipeline model
pipeline_mdl.write().overwrite().save(model_path + 'pipe_txt')

In [9]:
# load the text transformation pipeline trained model
pipeline_mdl = PipelineModel.load(model_path + 'pipe_txt')

In [10]:
# transform the review data
reviews_by_user_trf_df = pipeline_mdl.transform(reviews_by_user_df)

In [11]:
# show the transformed review data
reviews_by_user_trf_df.select( 'text', 'nostopwrd', 'idf_vec', 'word_vec', 'comb_vec').show(10)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|                text|           nostopwrd|             idf_vec|            word_vec|            comb_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|A born and bred T...|[born, bred, toro...|(128365,[0,1,2,3,...|[-0.0298230579082...|(128465,[0,1,2,3,...|
|I have never done...|[never, done, one...|(128365,[2,6,11,1...|[-0.0256846508636...|(128465,[2,6,11,1...|
|What a find!  I'm...|[find, m, almost,...|(128365,[2,3,6,11...|[-0.0783052189961...|(128465,[2,3,6,11...|
|Not impressed wit...|[impressed, place...|(128365,[0,1,2,3,...|[-0.0273270801487...|(128465,[0,1,2,3,...|
|Very tight space....|[tight, space, fo...|(128365,[2,39,116...|[-0.0261619807634...|(128465,[2,39,116...|
|While in Toronto ...|[toronto, last, w...|(128365,[0,1,2,3,...|[-0.0836421314076...|(128465,[0,1,2,3,...|
|I loved the decor...|[loved, decor, 

In [12]:
def CosineSim(vec1, vec2): 
    return np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1)) / np.sqrt(np.dot(vec2, vec2))

In [13]:
all_user_vecs = reviews_by_user_trf_df.select('user_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

In [15]:
type(all_user_vecs)

list

In [16]:
all_user_vecs[0]
# user_id = 'nOTl4aPC4tKHK35T3bNauQ'
# DenseVector() - vector respresentation of all the reviews of User

('BytRWk8X1OelSgwwfXd8Aw',
 DenseVector([-0.0298, -0.0741, -0.0649, 0.0832, 0.0279, -0.0149, -0.0436, -0.0123, -0.0076, -0.0379, -0.0085, -0.0369, -0.0037, -0.0116, 0.0142, -0.0317, -0.0555, -0.0272, -0.0729, 0.0104, -0.0666, -0.0442, 0.0158, 0.0029, 0.0563, -0.0458, -0.0137, -0.0207, -0.0063, 0.0337, 0.0583, 0.0148, 0.0212, -0.0019, -0.031, -0.0436, 0.0617, 0.0352, -0.0151, -0.0433, -0.0319, -0.0212, -0.0246, 0.0944, 0.0214, -0.0364, -0.0363, 0.0447, 0.0029, 0.0008, 0.0107, -0.0229, -0.0219, -0.045, -0.0714, 0.0296, 0.0166, 0.0181, 0.0324, -0.0024, -0.0046, 0.0065, 0.0301, -0.049, -0.0385, 0.0413, 0.0566, 0.0437, -0.0322, 0.0902, -0.0098, -0.0056, 0.0242, 0.0346, 0.0133, -0.0412, -0.0373, 0.0083, -0.0207, 0.0461, -0.0175, -0.0269, 0.0018, -0.0071, -0.0001, 0.0023, 0.0449, 0.0108, 0.0087, 0.0074, 0.013, -0.0001, -0.0442, 0.0266, -0.0158, -0.0175, -0.0496, 0.0423, -0.0899, 0.034]))

In [17]:
def getSimilarUsers(b_ids, sim_user_limit=10):
    
    schema = StructType([   
                            StructField("user_id", StringType(), True)
                            ,StructField("score", IntegerType(), True)
                            ,StructField("input_user_id", StringType(), True)
                        ])
    
    similar_user_df = spark.createDataFrame([], schema)
    similar_user_df_all = spark.createDataFrame([], schema)
    
    for b_id in b_ids:
        
        input_vec = [(r[1]) for r in all_user_vecs if r[0] == b_id][0]
        
        similar_user_rdd = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_user_vecs)
        
        similar_user_df = spark.createDataFrame(similar_user_rdd) \
            .withColumnRenamed('_1', 'user_id') \
            .withColumnRenamed('_2', 'score') \
            .orderBy("score", ascending = False)
            
        similar_user_df = similar_user_df.dropna()    
        similar_user_df = similar_user_df.filter(col("user_id") != b_id).limit(sim_user_limit)
        similar_user_df = similar_user_df.withColumn('input_user_id', lit(b_id))
        
        similar_user_df = similar_user_df \
                                    .union(similar_user_df)
        
        similar_user_df_all = similar_user_df_all.union(similar_user_df)
    similar_user_df_all = similar_user_df_all.dropDuplicates()    
    return similar_user_df_all

In [18]:
def getUserDetails(in_bus):
    
    a = in_bus.alias("a")
    b = user_df.alias("b")
    
    return a.join(b, col("a.user_id") == col("b.user_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.user_id'), col('b.user_name'),col('b.review_count')])

In [21]:
# test with two restaurants

uids = ['nOTl4aPC4tKHK35T3bNauQ', 'QBac9-Ii6jR-yLsQ5MVTHg']

print('\ninput user details:')
user_df.select('user_id','user_name', 'review_count') \
    .filter(user_df.user_id.isin(uids) == True).show(truncate=False)
    
# get top 10 similar business
sim_users = getUserDetails(getSimilarUsers(uids))

print('Top 10 similar Users for each input restaurant are:"')
sim_users.select('input_user_id', 'a.user_id', 'user_name', 'score','review_count').toPandas()


input user details:
+----------------------+---------+------------+
|user_id               |user_name|review_count|
+----------------------+---------+------------+
|nOTl4aPC4tKHK35T3bNauQ|Katherine|148         |
|QBac9-Ii6jR-yLsQ5MVTHg|Alex     |13          |
+----------------------+---------+------------+



  


Top 10 similar Users for each input restaurant are:"


Unnamed: 0,input_user_id,user_id,user_name,score,review_count
0,QBac9-Ii6jR-yLsQ5MVTHg,J5Eb7LhJaOa20k0ppcOCOg,Alek,0.907664,34
1,nOTl4aPC4tKHK35T3bNauQ,ZBllYKrFzaI0I7v6Wl26Wg,Cecilia,0.963999,135
2,QBac9-Ii6jR-yLsQ5MVTHg,cNhHuEQMIpLH_qc9qGz67A,Jay,0.910774,57
3,QBac9-Ii6jR-yLsQ5MVTHg,_IR48ok0ZkPMWJ2PlRCk0A,Michael,0.907098,82
4,nOTl4aPC4tKHK35T3bNauQ,myrcQ3h2G04Gv-ANG_oqrg,Linda,0.971381,112
5,QBac9-Ii6jR-yLsQ5MVTHg,MpN81tQOL86GaFse-_tTRQ,Amy,0.913263,46
6,QBac9-Ii6jR-yLsQ5MVTHg,kw-YtOKPXrRB2a9wRZlmzQ,Jimmy,0.915453,101
7,nOTl4aPC4tKHK35T3bNauQ,uO1w3qNo21c1bVHHFTYW0w,Joanne,0.972255,221
8,QBac9-Ii6jR-yLsQ5MVTHg,g5W7s0n19gvT1Ujy_ITJog,Bria,0.907846,73
9,nOTl4aPC4tKHK35T3bNauQ,PGx4HvY5joEeqXzam6tO7A,Lisa,0.965808,349


In [19]:
pd_review_df = review_df.toPandas()

Py4JJavaError: An error occurred while calling o35.collectToPython.
: org.apache.spark.SparkException: Job 30 cancelled because SparkContext was shut down
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:837)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:835)
	at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
	at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.scala:835)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onStop(DAGScheduler.scala:1841)
	at org.apache.spark.util.EventLoop.stop(EventLoop.scala:83)
	at org.apache.spark.scheduler.DAGScheduler.stop(DAGScheduler.scala:1754)
	at org.apache.spark.SparkContext$$anonfun$stop$8.apply$mcV$sp(SparkContext.scala:1931)
	at org.apache.spark.util.Utils$.tryLogNonFatalError(Utils.scala:1360)
	at org.apache.spark.SparkContext.stop(SparkContext.scala:1930)
	at org.apache.spark.SparkContext$$anonfun$2.apply$mcV$sp(SparkContext.scala:573)
	at org.apache.spark.util.SparkShutdownHook.run(ShutdownHookManager.scala:216)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1$$anonfun$apply$mcV$sp$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1991)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply$mcV$sp(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anonfun$runAll$1.apply(ShutdownHookManager.scala:188)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.util.SparkShutdownHookManager.runAll(ShutdownHookManager.scala:188)
	at org.apache.spark.util.SparkShutdownHookManager$$anon$2.run(ShutdownHookManager.scala:178)
	at org.apache.hadoop.util.ShutdownHookManager$1.run(ShutdownHookManager.java:54)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2034)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2055)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2074)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2099)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:297)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3195)
	at org.apache.spark.sql.Dataset$$anonfun$collectToPython$1.apply(Dataset.scala:3192)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3254)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3253)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:3192)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)


----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 62137)
Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/anaconda3/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/anaconda3/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/anaconda3/lib/python3.6/socketserver.py", line 696, in __init__
    self.handle()
  File "/usr/local/Cellar/apache-spark/2.3.1/libexec/python/pyspark/accumulators.py", line 235, in handle
    num_updates = read_int(self.rfile)
  File "/usr/local/Cellar/apache-spark/2.3.1/libexec/python/pyspark/serializers.py", line 685, in read_int
    raise EOFError
EOFError
----------------------------------------


In [21]:
review = pd.read_csv("/Volumes/Transcend/MDS_Yelp/yelp_review.csv")

In [None]:
review.user_id.isin(uids)
review[['user_id', 'business_id', 'stars', 'date']]

In [28]:
uids = ['nOTl4aPC4tKHK35T3bNauQ', 'QBac9-Ii6jR-yLsQ5MVTHg']
review_uid1 = review.loc[review['user_id'].isin(['nOTl4aPC4tKHK35T3bNauQ'])]
review_uid2 = review.loc[review['user_id'].isin(['PGx4HvY5joEeqXzam6tO7A'])]

In [29]:
merged_uid = pd.merge(review_uid1, review_uid2, how = 'inner', on = 'business_id')

In [39]:
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']] #average the rating if more than 1 review

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,cefRDEK5O3t_iUuwnmL27Q,4,3,We booked a reservation at Smith for Summerlic...,"This place looks small from the outside, but i...",2012-07-12,2016-07-31
1,pOEL97ld-FJMKO8Ki8JmYg,3,5,"Okay, so my rating has gone down from the firs...",I am a big fan of anything Oliver & Bonacini. ...,2013-10-20,2013-01-13
2,EjZYT46Y2qHSoChynd0q-A,4,5,This small gallery is a bit hard to get to at ...,I love this place. I stumbled on it when my b...,2012-11-11,2014-10-03
3,_vZ7bHaGCjllogiZ7RH17w,3,4,My sister told me that Origin was well known f...,4.5 stars \n\nI came here with a girl friend ...,2014-04-02,2015-04-06
4,JPgBO-7imIPdc2XBkvynpQ,4,4,For some reason when I heard the name of this ...,"Huge, huge list of Martinis. Prices aren't ba...",2011-03-21,2015-01-03
5,S5bNE4Pmin8OQUMOFod8bQ,3,2,I actually think this restaurant deserves a 3-...,Came here with a girl friend for dinner on a F...,2011-07-17,2012-10-28
6,uF86ZhygpBEGr3CudNemYA,3,3,When deciding where to go to Valentine's Day d...,I'm terrified of the dark and it took me a lon...,2011-11-20,2012-04-10
7,RkekriZhaIZ7nSJBVept6Q,4,4,I really wanted to be blown away by this place...,I've wanted to try this place for awhile and f...,2013-02-16,2012-09-23


In [27]:
review_uid1

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
97,O4v-1NYH4-t92X-zXBWynw,nOTl4aPC4tKHK35T3bNauQ,5-Gs-cUI_8-3Ykl9KbCLsw,4,2011-05-06,I started going to this nail salon last summer...,15,1,5
98,uRHeiP5iEnvnkN8P3gY-8A,nOTl4aPC4tKHK35T3bNauQ,uN2oZDJGO078ExbbV_DGmA,1,2012-06-03,I give up on this place! I have gotten burrito...,2,1,0
99,BPDhmOxSC4jk-hi9Va47cw,nOTl4aPC4tKHK35T3bNauQ,qbQ1qZx6eTy7pOOXMBfQNQ,2,2011-06-22,I think I may have been let down by this place...,2,0,1
100,QYw2OurOBplkQRCpYExp_w,nOTl4aPC4tKHK35T3bNauQ,XCxxPZ3Lu5mwmIo7IQRf1g,4,2016-01-24,This place is always solid and the customer se...,0,0,0
101,udzzB55YAxWEfVmkcZGYSA,nOTl4aPC4tKHK35T3bNauQ,VTs4f6LnUMHD4ysOezHSUQ,4,2012-07-08,Sometimes it feels SO good to be SO bad...for ...,1,0,0
102,raUmEc0rdycXSRwXSOB5vg,nOTl4aPC4tKHK35T3bNauQ,g-OTq2Jb7FRP7cYDUlFCMg,3,2012-04-15,I have since switched hairdressers and am goin...,3,1,0
103,cd5K0jPd7Gv29x_h1Hhq3Q,nOTl4aPC4tKHK35T3bNauQ,pOvTYClFgMm-wAXPW1CnhQ,2,2016-05-22,"I have eaten at several of these chains, every...",1,0,0
104,V6z_U8AA-35-pp7GMPRDBw,nOTl4aPC4tKHK35T3bNauQ,OwYElCdiJ1IGKVJ4--_mvQ,1,2012-12-04,This whole place basically sucks and it's even...,11,10,3
105,PRj0OFgJjN5ljpeDskP_Tw,nOTl4aPC4tKHK35T3bNauQ,6Q7-wkCPc1KF75jZLOTcMw,2,2012-08-22,"Okay, so from what I am reading, a lot of othe...",3,1,1
106,gWkXKIpYPk0HJGzwZf5bng,nOTl4aPC4tKHK35T3bNauQ,NUHNLaMg7aMesMdAiie73w,3,2013-03-29,OK is sort of the perfect description of this ...,0,0,0


In [40]:
import fastparquet
review_df = pd.read_parquet(data_path + 'review-small.parquet', engine='fastparquet')

In [41]:
review_uid1 = review.loc[review['user_id'].isin(['nOTl4aPC4tKHK35T3bNauQ'])]
review_uid2 = review.loc[review['user_id'].isin(['vJGLEHyhCs9V-5fAe-xx3w'])]
merged_uid = pd.merge(review_uid1, review_uid2, how = 'inner', on = 'business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']] #average the rating if more than 1 review

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,oQylTvXwGIkKFdCjmafKVg,4,4,"We went here after dinner, simply for drinks. ...",We came here for Mother's day as has become tr...,2013-02-10,2013-06-04
1,5N8R7ALESZ30EoAzVJtabw,5,5,Went in on a whim and was not disappointed. I ...,The Dirty Bird invited me back to give them an...,2016-10-17,2015-10-15
2,u2ETlHOcFdRz4BxcdfsK0Q,3,3,We went here for Summerlicious. For $25 we got...,I came here for a ladies lunch with the girls ...,2012-07-08,2013-06-04


In [45]:
review_uid1 = review.loc[review['user_id'].isin(['nOTl4aPC4tKHK35T3bNauQ'])]
review_uid2 = review.loc[review['user_id'].isin(['ZBllYKrFzaI0I7v6Wl26Wg'])]
merged_uid = pd.merge(review_uid1, review_uid2, how = 'inner', on = 'business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']] #average the rating if more than 1 review

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,cefRDEK5O3t_iUuwnmL27Q,4,4,We booked a reservation at Smith for Summerlic...,"I would've given it 5 stars for the food, but ...",2012-07-12,2014-05-05
1,c78Pat78fVUBFPXYeVvbaQ,5,3,I am surprised at the people complaining about...,Overrated. \n\nMistake: coming to Odd Seoul on...,2016-01-24,2016-03-01


In [42]:
review_uid1 = review.loc[review['user_id'].isin(['nOTl4aPC4tKHK35T3bNauQ'])]
review_uid2 = review.loc[review['user_id'].isin(['VVm-TFCpi9M1-k8ED0l1eA'])]
merged_uid = pd.merge(review_uid1, review_uid2, how = 'inner', on = 'business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']] #average the rating if more than 1 review

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,zgQHtqX0gqMw1nlBZl2VnQ,2,4,"While I really enjoyed the noodles, their sign...",Actual rating 3.5\n\nLet's get my main gripe o...,2013-10-19,2014-11-22
1,kGOr_D-LNpgZ2M9N8TT4QQ,5,4,I went here with my husband and another couple...,La Societe looks as good as you'd imagine a re...,2013-01-27,2012-01-05
2,W5d8iNog90R-qw43m5dGwg,5,5,"I have been craving schnitzel for a LONG time,...",Had lunch here today and I'm totally sold. Si...,2012-10-08,2012-08-12
3,OIdOJaNS8M624F58XGV3PQ,4,3,"Really this deserves a 3-1\/2 star rating, but...",Actually around 3.5\n\nHaven't been here in qu...,2012-11-11,2014-12-22


In [44]:
review_uid1 = review.loc[review['user_id'].isin(['QBac9-Ii6jR-yLsQ5MVTHg'])]
review_uid2 = review.loc[review['user_id'].isin(['eV5usRjY2cDqNKVv8wXroA'])]
merged_uid = pd.merge(review_uid1, review_uid2, how = 'inner', on = 'business_id')
merged_uid[['business_id', 'stars_x', 'stars_y', 'text_x', 'text_y', 'date_x', 'date_y']] #average the rating if more than 1 review

Unnamed: 0,business_id,stars_x,stars_y,text_x,text_y,date_x,date_y
0,r_BrIgzYcwo1NAuG9dLbpg,4,3,This review will be shorter than my first two....,Total cheap eat hipster restaurant. The food w...,2016-07-05,2016-05-02
