## Getting Started: Analyzing Yelp Data

In [1]:
%%read s3  --as yelp_biz --cache True
path: ps-samples/yelp/businesses/json
format: json

In [5]:
%%sql
SELECT name,city,stars,review_count
FROM yelp_biz
WHERE stars > 3.5
SORT BY review_count DESC 
LIMIT 5

Unnamed: 0,name,city,stars,review_count
0,Mon Ami Gabi,Las Vegas,4.0,4578
1,Earl of Sandwich,Las Vegas,4.5,3984
2,Bouchon Bistro,Las Vegas,4.0,2593
3,Hash House A Go Go,Las Vegas,4.0,2395
4,XS Nightclub,Las Vegas,4.0,2302


In [3]:
%%sql -a nevada_yelp_biz
SELECT state, city, sum(review_count) as totalreviews
FROM yelp_biz
where attributes.`Good For`.dinner = true 
GROUP By state, city
ORDER By totalreviews desc

In [4]:
%%plot bar --data nevada_yelp_biz  --limit 20 
x : city
y : totalreviews
xTitle : city
yTitle : Total Reviews
title : Yelp Reviews Count By City
layout:
    height : 500

### We can do all of the above code as a single line python code too

In [5]:
read(kind="s3", path="ps-samples/yelp/businesses/json", format="json" ,cache=True)\
.groupBy("city")\
.agg(sum("review_count").alias("totalreviews"))\
.sort(desc("totalreviews"))\
.plot(kind="bar", barmode="stack", limit=20, x="city", y=["totalreviews"], 
      xTitle = "City", yTitle="Total Reviews", title="Yelp Reviews Count By City")

In [6]:
table("yelp_biz").groupBy("stars").agg({"*": "count"}).sort("stars").plot(kind="scatter",x="stars",y="count(1)")

### Working with multiple data sources together

In [7]:
%%read s3 --as yelp_user --cache True
path : ps-samples/yelp/users/json 
format: json

In [8]:
%%read s3 --as yelp_review 
path: ps-samples/yelp/reviews/json
format: json

In [9]:
%%sql 
SELECT yelp_biz.business_id, yelp_biz.city, 
       yelp_review.text, yelp_review.stars, yelp_review.date 
FROM yelp_biz 
JOIN yelp_review ON (yelp_biz.business_id = yelp_review.business_id) 
LIMIT 12

Unnamed: 0,business_id,city,text,stars,date
0,-KF9RQPkmIOHfE0tzUu9bg,Scottsdale,I don't normally write reviews for places like...,5,2007-12-18
1,-KF9RQPkmIOHfE0tzUu9bg,Scottsdale,i'm an east-coaster transplanted in southern...,5,2008-08-24
2,-KF9RQPkmIOHfE0tzUu9bg,Scottsdale,Being a snotty coffee loving snot snot I wasn'...,4,2008-09-02


In [10]:
%%sql 
SELECT yelp_biz.name,yelp_biz.city,yelp_biz.review_count,
       yelp_review.text, yelp_review.stars, yelp_review.date, yelp_user.average_stars, 
       yelp_user.review_count, yelp_user.fans
FROM yelp_biz 
JOIN yelp_review ON (yelp_biz.business_id = yelp_review.business_id) 
JOIN yelp_user ON (yelp_user.user_id = yelp_review.user_id)  
LIMIT 3

Unnamed: 0,name,city,review_count,review_count.1,text,stars,date,average_stars,review_count.2,review_count.3,fans
0,Lazy Jane's,Madison,166,6,They keep it simple but they get the basics ri...,4,2014-10-13,2.33,166,6,0
1,Maharaja Restaurant,Madison,111,6,I would rate them on their food if they could ...,1,2014-12-12,2.33,111,6,0
2,Short Stack Eatery,Madison,92,6,Overpriced subpar food. Paid $19 for scrambler...,2,2014-10-12,2.33,92,6,0


In [11]:
registerFunction("hasFourParts", lambda x: len(x.split(" ")) >= 4,  BooleanType())

In [12]:
%%sql
SELECT name,stars,city,review_count 
FROM yelp_biz 
WHERE hasFourParts(name)
SORT BY review_count 
DESC LIMIT 10

Unnamed: 0,name,stars,city,review_count
0,The Buffet at Bellagio,3.5,Las Vegas,2583
1,ARIA Hotel & Casino,3.5,Las Vegas,2440
2,Hash House A Go Go,4.0,Las Vegas,2395
3,Luxor Hotel And Casino Las Vegas,2.5,Las Vegas,2127
4,The Venetian Resort Hotel Casino,4.0,Las Vegas,2079
5,Planet Hollywood Las Vegas Resort & Casino,3.0,Las Vegas,1714
6,Flamingo Las Vegas Hotel & Casino,3.0,Las Vegas,1537
7,Mandalay Bay Resort & Casino,3.5,Las Vegas,1530
8,Phoenix Sky Harbor International Airport,3.5,Phoenix,1512
9,InterContinental Alliance Resorts THE PALAZZO,4.0,Las Vegas,1507


## Machine learning

In [13]:
bin = Binarizer(inputCol = "stars", outputCol = "label", threshold = 3.5)
tok = Tokenizer(inputCol = "text", outputCol = "words")
hashTF = HashingTF(inputCol = tok.getOutputCol(), numFeatures = 10000, outputCol = "features")
lr = LogisticRegression(maxIter = 10, regParam = 0.0001, elasticNetParam = 1.0)
pipeline = Pipeline(stages = [bin, tok, hashTF, lr])

In [14]:
preppedReviews = table("yelp_review").limit(100).map(lambda row: Row(text=row.text, stars=float(row.stars))).toDF()

In [15]:
model = pipeline.fit(preppedReviews)

In [16]:
model.transform(preppedReviews).select("label","prediction","probability","stars","text").plot(limit=20)


weights is deprecated. Use coefficients instead.



Unnamed: 0,label,prediction,probability,stars,text
0,1,0,"[0.86048739935, 0.13951260065]",5,Service: FANTSTIC! Servers were very informati...
1,1,1,"[0.408941427467, 0.591058572533]",4,This restaurant is located in the heart of the...
2,1,1,"[0.0358968159049, 0.964103184095]",4,What a great spot to hit up for lunch if you a...
3,0,0,"[0.582440660773, 0.417559339227]",3,I've eaten at Canaletto's on a couple of diffe...
4,0,1,"[0.00547102157353, 0.994528978426]",3,I don't know how to review this place.....I ha...
5,0,0,"[0.704432844687, 0.295567155313]",3,"We first ate at Canaletto in 2001, when we fir..."
6,0,0,"[0.888756399835, 0.111243600165]",1,"Never again, despite the nice atmosphere.\n\nM..."
7,0,0,"[0.776591234089, 0.223408765911]",2,"Where do I start, I gues by saying that the lo..."
8,0,0,"[0.997199741561, 0.00280025843861]",2,My boyfriend and I were in a rush to find a pl...
9,1,0,"[0.751834347195, 0.248165652805]",4,Great food and got seated right away on a Frid...


## Custom ETL

In [17]:
words = table("yelp_review").limit(1000).select("text").flatMap(lambda row: row.text.split(" ")).map(lambda w: Row(word=w, cnt=1)).toDF()

In [18]:
words.groupBy("word").sum().orderBy(desc("SUM(cnt)")).plot(limit=10)

Unnamed: 0,word,sum(cnt)
0,the,6900
1,and,4671
2,a,3697
3,to,3591
4,,3554
5,I,3377
6,was,2667
7,of,2386
8,for,1690
9,is,1688


## Other Magics

In [19]:
%%tables

Table Name
yelp_biz
yelp_review
yelp_user
nevada_yelp_biz


In [20]:
%%schema -d yelp_biz

In [21]:
%%read redshift
path

Cluster Name,Address,Port,DbName,UserName
bizwh,bizwh.cbkk1zwfxdjp.us-east-1.redshift.amazonaws.com,5439,segment,parastack


In [22]:
%%s3

Bucket
s3a://aws-logs-447716065425-us-east-1
s3a://config-bucket-447716065425
s3a://corp-website
s3a://elasticbeanstalk-us-east-1-447716065425
s3a://mapredtests
s3a://mjcloudtrail
s3a://para-test
s3a://para-usage
s3a://para-web
s3a://parastack
