In [1]:
import requests
import re
import json
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd


In [2]:
def find_cuisine(soup):
    """
    Return the cuisine tags found in a parsed TripAdvisor restaurant page.

    soup : parsed page object; only its ``.text`` attribute is read.

    The page text is cut down in three steps: everything after the first
    "ta.common.dmp.store.setBehaviors" marker, then everything before the next
    ";", then the segment right after the first "CuisineType" key. That segment
    is split on commas and every piece is stripped of non-word characters.

    Returns a list of strings (one per comma-separated piece, in page order;
    a piece with no word characters becomes an empty string).
    Raises IndexError when either marker is missing from the page text.
    """
    # not a very clean way of doing this, but this object seems to be used on
    # all tripadvisor review pages
    behaviours_blob = soup.text.split("ta.common.dmp.store.setBehaviors")[1]
    first_statement = behaviours_blob.split(";")[0]
    raw_tags = first_statement.split("CuisineType")[1].split(",")

    # drop quotes, brackets, newlines and any other formatting left over from
    # the html so only the bare cuisine name remains
    non_word = re.compile(r'[^\w]')
    return [non_word.sub('', raw_tag) for raw_tag in raw_tags]

def find_rating(soup):
    """
    Return the restaurant rating found in a parsed TripAdvisor page as a float.

    soup : parsed page object; only its ``.text`` attribute is read.

    The rating is taken from the same "ta.common.dmp.store.setBehaviors" object
    that holds the cuisine tags: the text after the first "RestaurantRating"
    key, up to the next comma, is expected to contain "[" followed by a line
    holding the value in double quotes.

    Raises IndexError when a marker or the expected layout is missing, and
    ValueError when the quoted value is not a number.
    """
    # not a very clean way of doing this, but this object seems to be used on
    # all tripadvisor review pages
    page_text = soup.text
    behaviours_blob = page_text.split("ta.common.dmp.store.setBehaviors")[1].split(";")[0]
    rating_entry = behaviours_blob.split("RestaurantRating")[1].split(",")[0]

    # peel the value out of its wrapping: after "[", on the second line,
    # between the first pair of double quotes
    after_bracket = rating_entry.split('[')[1]
    quoted_value = after_bracket.split('\n')[1]
    return float(quoted_value.split('"')[1])

def find_price(soup):
    """
    Return the "Average prices" figures found in a parsed TripAdvisor page.

    soup : parsed page object; only its ``.text`` attribute is read.

    Takes the text between the first "Average prices" label and the next
    "Cuisine" label, removes newlines, pound signs, euro signs and spaces, and
    splits what is left on "-".

    Returns a list of strings (a single element when the text has no "-").
    Raises IndexError when "Average prices" does not occur in the page text.
    """
    price_section = soup.text.split('Average prices')[1].split('Cuisine')[0]

    # strip layout and currency characters so only digits and "-" remain
    for unwanted in ('\n', '£', '€', ' '):
        price_section = price_section.replace(unwanted, '')

    return price_section.split('-')




def find_ta_url(restaurant, location, timeout=10):
    """
    Find a restaurant's TripAdvisor page through a Google search and scrape it.

    The restaurant name, the location and the word "tripadvisor" are sent to
    Google as one query. Results are visited in order; the first one whose URL
    contains "tripadvisor" and from which find_cuisine can extract cuisine tags
    is treated as the restaurant's page. Rating and prices are then scraped
    from that same page with find_rating and find_price.

    Parameters
    ----------
    restaurant : str
        Restaurant name used in the search query.
    location : str
        Location used in the search query.
    timeout : float, optional
        Passed as ``timeout`` to every ``requests.get`` call so a stalled
        server cannot hang the lookup indefinitely.

    Returns
    -------
    tuple
        ``(cuisine_list, rating, prices, url)`` where
        cuisine_list is the de-duplicated list of cuisine tags in page order
        (``[]`` when no page was found), rating is a float (``np.nan`` when
        unavailable), prices is the list returned by find_price (``np.nan``
        when unavailable) and url is the TripAdvisor page that was scraped
        (``''`` when no page was found).

    Raises
    ------
    TypeError
        If restaurant or location is not a string.
    requests.RequestException
        If the Google search request itself fails.
    """
    # Space-separated terms: requests URL-encodes the query, so a literal "+"
    # would be sent as "%2B" rather than acting as a word separator.
    search_terms = ' '.join([restaurant, location, 'tripadvisor'])
    google_url = 'http://www.google.com/search'

    # Parameters in payload
    payload = {'q': search_terms, 'start': '0'}

    # Setting User-Agent
    my_headers = {'User-agent': 'Mozilla/11.0'}

    r = requests.get(google_url, params=payload, headers=my_headers, timeout=timeout)
    google_soup = BeautifulSoup(r.content, "lxml")

    cuisine_list = []
    rating = np.nan
    prices = np.nan
    ta_url = ''

    # Result links are looked for inside <h3 class="r"> headings.
    # NOTE(review): presumably this matched Google's result markup when the
    # notebook was written - verify it still does.
    for heading in google_soup.find_all('h3', class_='r'):
        try:
            href = heading.a['href']
        except (AttributeError, KeyError, TypeError):
            # heading without a usable link
            continue

        # the target address sits between "url?q=" and the following "&sa"
        match = re.search(r'url\?q=(.+?)&sa', href)
        if match is None:
            continue

        candidate = match.group(1)
        if "tripadvisor" not in candidate:
            continue

        try:
            response = requests.get(candidate, timeout=timeout)
            soup = BeautifulSoup(response.content, "lxml")
            cuisines = find_cuisine(soup)
        except Exception:
            # unreachable, or not a restaurant page we can parse: best effort,
            # move on to the next search result
            continue

        ta_url = candidate
        for item in cuisines:
            if item not in cuisine_list:
                cuisine_list.append(item)

        # Rating and prices are optional extras: a failure in one must not
        # prevent the other from being scraped, so each keeps its NaN default
        # independently.
        try:
            rating = find_rating(soup)
        except Exception:
            pass

        try:
            prices = find_price(soup)
        except Exception:
            pass

        # first scrapable TripAdvisor page wins
        break

    return cuisine_list, rating, prices, ta_url
#End

In [3]:
# Spot-check the scraper on a single restaurant; the value displayed below the
# cell is the (cuisine_list, rating, prices, url) tuple returned by find_ta_url.
find_ta_url('sooty olive','londonderry')

(['Irish', 'European', 'British'],
 4.5,
 ['17'],
 'https://www.tripadvisor.co.uk/Restaurant_Review-g186482-d2213902-Reviews-The_Sooty_Olive_Restaurant-Derry_County_Londonderry_Northern_Ireland.html')

In [4]:
# Load the restaurant list, supplying the column names explicitly.
# NOTE: the path is absolute and machine-specific - adjust it before running
# this notebook anywhere else.
category = pd.read_csv(
    '/Users/Luke/Downloads/restaurants.csv',
    names=[
        'restaurant_name',
        'address_1',
        'address_2',
        'address_3',
        'address_4',
        'address_5',
    ],
)

# Placeholder columns, filled in per restaurant by the TripAdvisor lookup:
# empty strings for the text fields, NaN for the numeric ones.
category['cuisine'] = ''
category['rating'] = np.nan
category['price_min'] = np.nan
category['price_max'] = np.nan
category['ta_url'] = ''

In [None]:
# Look every restaurant up on TripAdvisor and copy whatever was scraped into
# its row of `category`. A lookup that raises leaves the row's placeholders
# untouched and is recorded in `failed_lookups` as (index, name, error) so
# failures can be inspected afterwards instead of vanishing silently.
failed_lookups = []

# Series.items() / DataFrame.at replace iteritems() / set_value(), which no
# longer exist in current pandas.
for index, value in category['restaurant_name'].items():
    try:
        cuisine, rating, price, url = find_ta_url(str(value), category.at[index, 'address_3'])
    except Exception as exc:
        failed_lookups.append((index, value, repr(exc)))
        continue

    category.at[index, 'ta_url'] = url
    if cuisine:
        # only the first cuisine tag is stored
        category.at[index, 'cuisine'] = cuisine[0]
    category.at[index, 'rating'] = rating

    # find_ta_url returns NaN instead of a list when no price was scraped, and
    # the list holds a single element when the page shows no price range.
    if isinstance(price, list):
        for column, raw_price in zip(('price_min', 'price_max'), price):
            try:
                # the price columns are numeric, so store numbers, not strings
                category.at[index, column] = float(raw_price)
            except ValueError:
                # not a plain number: leave the NaN placeholder in place
                pass