In [67]:
# Imports for data management
import pandas as pd
import json

# Import for crawling and scraping
import requests
from lxml import html
import unidecode
import time
import random

In [109]:

def yelp_biz_urls(key_word, loc, num_results):
    
    
    """
    Getting results from yelp
    The output is a list of business urls
    """
    
    tables = []
    for i in range(0,num_results,10):
        %%time
        time.sleep(random.randint(1, 2) * .931467298)
        url = 'http://www.yelp.com/search?find_desc=%(key_word)s&find_loc=%(loc)s&start=%(num_results)s' % {'key_word':key_word,'loc':loc,'num_results':i}
        page = requests.get(url)
        table = html.fromstring(unidecode.unidecode(page.content))
        for t in table.xpath('//li[@class="regular-search-result"][1]')[0]:
            for page in t.xpath('//li[@class="regular-search-result"]//a[@class="biz-name"]'):
                time.sleep(random.randint(1, 2) * .931467298)
                biz_url = 'http://www.yelp.com%s' % page.attrib['href']
                tables.append(biz_url)
        
    return tables

In [106]:
def yelp_biz_data(biz_urls):
    
    """
    Iterating over yelp urls, scraping data and storing everything to a DataFrame
    """
    

    names = []
    review_count = []
    ratings = []
    prices = []
    review_texts = []
    latitude = []
    longitude = []

    for biz_url in biz_urls:
        time.sleep(random.randint(1, 2) * .931467298)
        biz_page = requests.get(biz_url)
        print biz_url
        table = html.fromstring(unidecode.unidecode(biz_page.content))

        for name in table.xpath('//h1[@itemprop="name"]//text()'):
            names.append(name.strip())

        for review in table.xpath('//div[@class="biz-page-header-left"]//span[@itemprop="reviewCount"]//text()'):
            review_count.append(review.strip())

        for rating in table.xpath('//div[@class="biz-page-header-left"]//meta[@itemprop="ratingValue"]//@content'):
            ratings.append(rating)

        for price in table.xpath('//span[@itemprop="priceRange"]//text()'):
            prices.append(price)
        
        # Each business will get 20 reviews unless you want to do extra crawling
        for review_text in table.xpath('//div[@class="review-list"]'):
            user_review = table.xpath('//p[@itemprop="description"]//text()')      
            review_texts.append(user_review)

        for location in table.xpath('//div[@class="lightbox-map hidden"]'):
            loc_json = location.attrib['data-map-state']
            loc_dict = json.loads(loc_json)
            latitude.append(loc_dict['center']['latitude'])
            longitude.append(loc_dict['center']['longitude'])

        time.sleep(random.randint(1, 2) * .931467298)
     
    # Checking for data length consistency 
    print 'names: %s | review_count: %s | ratings: %s | prices: %s | review_texts: %s | latitude: %s | longitude: %s' % (len(names), len(review_count),len(ratings),len(prices),len(review_texts),len(latitude),len(longitude))

    df = pd.DataFrame({
            'name':names
            ,'review_count':review_count
            ,'rating':ratings
            ,'price':prices
            ,'review_text':review_texts
            ,'latitude':latitude
            ,'longitude':longitude
        })  
    
    return df

In [108]:
data = yelp_data('Restaurants','NYC',30)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.2 µs
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.2 µs
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 6.91 µs




In [111]:
df = yelp_biz_data(data)

http://www.yelp.com/biz/upstate-new-york-2
http://www.yelp.com/biz/bensons-nyc-new-york
http://www.yelp.com/biz/traif-brooklyn
http://www.yelp.com/biz/lukes-lobster-new-york-5
http://www.yelp.com/biz/am%C3%A9lie-new-york
http://www.yelp.com/biz/hudson-eats-at-brookfield-place-new-york
http://www.yelp.com/biz/the-greek-new-york
http://www.yelp.com/biz/the-dead-rabbit-new-york
http://www.yelp.com/biz/burger-and-lobster-new-york
http://www.yelp.com/biz/b%C3%B4-c%C3%A0-ph%C3%A9-new-york-5
http://www.yelp.com/biz/breakroom-new-york
http://www.yelp.com/biz/joes-shanghai-new-york-2
http://www.yelp.com/biz/fishmarket-restaurant-new-york
http://www.yelp.com/biz/jane-new-york-2
http://www.yelp.com/biz/fresh-salt-new-york
http://www.yelp.com/biz/masha-and-the-bear-brooklyn
http://www.yelp.com/biz/la-cigogne-brooklyn
http://www.yelp.com/biz/hometown-bar-b-que-brooklyn
http://www.yelp.com/biz/her-name-is-han-new-york-3
http://www.yelp.com/biz/beehive-oven-biscuit-cafe-brooklyn
http://www.yelp.com/b



In [112]:
df

Unnamed: 0,latitude,longitude,name,price,rating,review_count,review_text
0,40.726314,-73.98649,Upstate,$$,4.5,1050,[I love this place! A Their happy hour is from...
1,40.721968,-73.986661,Bensonas NYC,$$,5.0,72,"[Clearly one of the best joints in NYC!, The s..."
2,40.710658,-73.958872,Traif,$$$,4.5,1486,[I've heard about this place a couple of years...
3,40.704402,-74.010989,Lukeas Lobster,$$,4.5,606,"[Well this is simply delicious. , Planning tri..."
4,40.732629,-73.997668,AmA(c)lie,$$,4.5,1104,[Wow. I went to a wine bar in the middle of th...
5,40.711375,-74.015662,Hudson Eats At Brookfield Place,$$,4.5,172,"[Such a cool spot. Indoor, upscale food court ..."
6,40.723714,-74.009617,The Greek,$$,4.5,229,[My boyfriend's parents came into town to visi...
7,40.7033,-74.011029,The Dead Rabbit,$$,4.0,600,"[The cocktail list is a bound book, if that gi..."
8,40.740156,-73.993366,Burger & Lobster,$$,4.5,1162,"[What a great idea, do something extremely wel..."
9,40.721992,-73.997572,BA' CA PhA(c),$$,4.5,53,[This place is delicious. Great bowl of pho an...
