# Get Soup

In [100]:
from bs4 import BeautifulSoup
import urllib.request
import re
import json

In [5]:
# my Yelp reviews
# Download the RSS feed for this user. The response object is used as a
# context manager so the underlying HTTP connection is closed as soon as
# the body has been read, instead of lingering until garbage collection.
with urllib.request.urlopen('https://www.yelp.com/syndicate/user/dMPdDtCgj8N_fKOp_YOXhg/rss.xml') as response:
    r = response.read()

# Parse the raw feed bytes into a navigable tree.
soup = BeautifulSoup(r, "html.parser")

In [6]:
# Show only the first 2000 characters of the pretty-printed feed.
print(soup.prettify()[:2000])

<?xml version="1.0" encoding="utf-8"?>
<?xml-stylesheet href="https://s3-media4.fl.yelpcdn.com/assets/2/www/css/149dc283e287/rss.css" type="text/css" media="screen"?>
<rss version="2.0" xmlns:geo="http://www.w3.org/2003/01/geo/wgs84_pos#">
 <channel>
  <copyright>
   Copyright 2016 Yelp, Inc. All rights reserved.
  </copyright>
  <title>
   Michael Z. on Yelp
  </title>
  <link>
   http://www.yelp.com/user_details?userid=dMPdDtCgj8N_fKOp_YOXhg
  </link>
  <description>
   Yelp - Michael's most recent reviews on Yelp.
  </description>
  <item>
   <pubdate>
    Fri, 02 Sep 2016 09:40:11  PST
   </pubdate>
   <title>
    Chicas Tacos (3/5) on Yelp
   </title>
   <link>
    https://www.yelp.com/biz/chicas-tacos-los-angeles?hrid=onffxHXKezg4g_xM2x-EHQ
   </link>
   <description>
    Upon returning to LA, I noticed that there was a new taco place nearby, so I had to check it out. My brother warned me that it wouldn't be worth it, but I was pretty optimistic.

The setup of the…
   </descripti

# Scrape and collect relevant information

### Test

In [59]:
# Collect every <item> element of the feed; each one is a single review.
reviews = soup.find_all(name="item")
# should show 10 at once
review_count = len(reviews)
print(review_count)

10


In [62]:
# extracting name and stars from title (test)
# str() on the tag gives the serialized element, e.g.
# "<title>Chicas Tacos (3/5) on Yelp</title>", which is what the regex expects.
test_title = str(reviews[0].title)

match = re.search(r"<title>(?P<name>[^(]*) \((?P<rating>\d)/\d\) on Yelp</title>", test_title)

# Reset first so a failed match cannot leave stale values from an earlier run.
test_name = test_rating = None

if match:
    test_name = match.group("name")
    test_rating = match.group("rating")
    print("Restaurant: {}, Rating: {}/5".format(test_name, test_rating))
    print(test_name)
    print(test_rating)
else:
    # re.search returns None when the title has an unexpected shape; report
    # it instead of failing with AttributeError on None.group(...).
    print("Title did not match the expected pattern: {}".format(test_title))

Restaurant: Chicas Tacos, Rating: 3/5
Chicas Tacos
3


In [87]:
# extracting other information (review link, coordinates)
first_review = reviews[0]
test_link = first_review.find("link").getText()
test_long = first_review.find("geo:long").getText()
test_lat = first_review.find("geo:lat").getText()
# One value per line: link, longitude, latitude.
print(test_link, test_long, test_lat, sep="\n")

https://www.yelp.com/biz/chicas-tacos-los-angeles?hrid=onffxHXKezg4g_xM2x-EHQ
-118.256342201
34.0456597932


### Execute on RSS feed

In [89]:
# make a restaurant class
class Restaurant:
    """Plain record holding the fields extracted from one review.

    All constructor arguments are stored unchanged as attributes of the
    same name. Attributes are assigned in the order name, stars, link,
    long, lat, which is therefore also the key order of ``__dict__``.
    """

    def __init__(self, name, stars, link, long, lat):
        self.name, self.stars, self.link = name, stars, link
        self.long, self.lat = long, lat

# Matches the text of a review title such as "Chicas Tacos (3/5) on Yelp".
# The name is matched lazily up to the trailing " (N/M) on Yelp" suffix, so
# business names that themselves contain parentheses are still captured whole.
title_pattern = r"^(?P<name>.*?) \((?P<stars>\d)/\d\) on Yelp$"


def _tag_text(item, tag_name):
    """Return the text of the first `tag_name` tag inside `item`, or None if there is no such tag."""
    tag = item.find(tag_name)
    return tag.getText() if tag is not None else None


restaurants = []
# list of restaurants with relevant elements
for review in reviews:
    # Use the tag's text rather than str(tag): serializing the tag escapes
    # characters such as "&" into entities ("&amp;"), which would end up in
    # the stored restaurant name.
    title_text = _tag_text(review, "title")
    name_stars = re.search(title_pattern, title_text.strip()) if title_text else None
    if name_stars is None:
        # Unexpected title format: report and skip instead of crashing with
        # AttributeError on None.group(...).
        print("Skipping review with unrecognized title: {!r}".format(title_text))
        continue
    restaurants.append(
        Restaurant(
            name_stars.group("name"),
            name_stars.group("stars"),
            # Missing link/coordinate tags are stored as None (null in JSON).
            _tag_text(review, "link"),
            _tag_text(review, "geo:long"),
            _tag_text(review, "geo:lat"),
        )
    )

# Export (as JSON)

In [101]:
# json_string = json.dumps([ob.__dict__ for ob in restaurants])
# print(json_string)

In [104]:
# Serialize each Restaurant as a dict of its attributes and write the
# resulting list to restaurants.json as a single JSON array.
records = [vars(restaurant) for restaurant in restaurants]
with open('restaurants.json', 'w') as output:
    json.dump(records, output)