# Part 1: Caching the TOP 1000 restaurant search result HTML

In [1]:
import requests, re, pandas as pd
import time, os, json
from bs4 import BeautifulSoup

# Upper bound of the result offsets requested from the search endpoint; the
# loops below step through range(0, TOTAL_RESTAURANT, 30).
TOTAL_RESTAURANT = 1000
# TripAdvisor restaurant search endpoint.
base_url = 'https://www.tripadvisor.com/RestaurantSearch'
# HTTP headers sent with every request; the User-Agent is a desktop Chrome string.
headers = {
    'User-Agent':  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
}

# Directory holding the cached search-result pages (NY_<offset>.html).
REST_LIST_DIR = 'cache/rest_list'
# Directory holding the cached individual restaurant pages.
REST_DIR = 'cache/restaurants'
# Path of the exported CSV dataset (a file, despite the _DIR suffix).
FULL_DATASET_DIR = './restaurant_dataset.csv'


# Download every page of the popularity-sorted search results once and keep the
# raw HTML on disk, so re-running the notebook does not hit the site again.
os.makedirs(REST_LIST_DIR, exist_ok=True)

# The 'o' offset parameter advances in steps of 30 (presumably the page size - confirm).
for idx in range(0, TOTAL_RESTAURANT, 30):
    page_path = REST_LIST_DIR+'/NY_'+str(idx)+'.html'
    if os.path.exists(page_path):
        continue  # this page is already cached

    parameter_dictionary = {'Action':'PAGE', 'ajax':'1', 'availSearchEnabled':'false', 'sortOrder':'popularity',
                            'geo':'60763', 'itags':'10591', 'geobroaden':'false', 'o':'a'+str(idx)}
    # Request the search endpoint (base_url). The name `url` is not defined in
    # this cell, so using it fails on a fresh kernel or silently fetches
    # whatever value a later cell left behind.
    response = requests.get(base_url, parameter_dictionary, headers=headers, timeout=30)
    if response.ok:
        with open(page_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
    else:
        # Do not cache error pages: a cached file is never re-downloaded.
        print('NY_'+str(idx)+'.html not cached (HTTP '+str(response.status_code)+')')
    time.sleep(3)  # pause between requests to stay polite to the server


# Part 2: Extract URLs from cached search results

In [3]:
# Collect the display name and absolute URL of every restaurant link found in
# the cached search-result pages.
rest_names = []
rest_urls = []
for idx in range(0, TOTAL_RESTAURANT, 30):
    # Read from REST_LIST_DIR (the configured cache directory) rather than a
    # hard-coded copy of its value, and close the file even if reading fails.
    with open(REST_LIST_DIR+'/NY_'+str(idx)+'.html', 'r', encoding='utf-8') as f:
        html = f.read()
    # NOTE(review): no parser is named, so bs4 picks one of the installed
    # parsers; `soup.body` below relies on that parser adding a <body>.
    soup = BeautifulSoup(html)
    for rest in soup.body.find_all('a', class_='Lwqic Cj b'):
        # Take the link target from the attribute itself instead of splitting
        # the serialized tag on '"', which depends on attribute order.
        href = rest.get('href')
        if not href:
            continue  # anchor without a target: nothing to crawl
        rest_names.append(rest.text)
        rest_urls.append("https://www.tripadvisor.com"+href)
rest_urls

['https://www.tripadvisor.com/Restaurant_Review-g60763-d1236281-Reviews-Club_A_Steakhouse-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Restaurant_Review-g60763-d1878682-Reviews-Olio_e_Piu-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Restaurant_Review-g60763-d13504265-Reviews-Boucherie_Union_Square-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Restaurant_Review-g60763-d11918545-Reviews-Boucherie_West_Village-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Restaurant_Review-g60763-d19245342-Reviews-The_Consulate-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Restaurant_Review-g60763-d21410713-Reviews-La_Grande_Boucherie-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Restaurant_Review-g60763-d479337-Reviews-Bleecker_Street_Pizza-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Restaurant_Review-g60763-d10145683-Reviews-Petite_Boucherie-New_York_City_New_York.html',
 'https://www.tripadvisor.com/Re

# Part 3: Cache restaurant HTMLs based on the URLs

In [42]:
# Download each restaurant page once and keep the raw HTML in REST_DIR.
os.makedirs(REST_DIR, exist_ok=True)

for url in rest_urls:
    # The cache file is named after the part of the URL between '-Reviews-'
    # and '-New_York_City'.
    # NOTE(review): the numeric id in the URL is not part of the file name, so
    # two restaurants sharing the same name slug map to one file and only the
    # first is downloaded - confirm this is acceptable.
    file_name = url.split('-Reviews-')[1].split('-New_York_City')[0]+'.html'
    page_path = REST_DIR+ '/'+ file_name
    if os.path.exists(page_path):
        continue  # already cached

    response = requests.get(url, headers=headers, timeout=30)
    if response.ok:
        with open(page_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
    else:
        # Do not cache error pages: a cached file is never re-downloaded.
        print(file_name+' not cached (HTTP '+str(response.status_code)+')')
    time.sleep(3)  # pause between requests to stay polite to the server

# Part 4: Extract useful information from Restaurant HTMLs and write into CSV

In [121]:
# Column order of the exported CSV.
DATASET_COLUMNS = ['Name', 'url', 'Rating', 'Rating_food', 'Rating_service', 'Rating_value',
                   'cuisine', 'Review_num', 'Neighborhood', 'Price_low', 'Price_high',
                   'Strong_tag', 'Comments']


def _price_levels(price_text):
    """Return (low, high) price levels encoded in a price label.

    Spaces are ignored. Without a '-' the whole label is one level and its
    length is used for both bounds; with a '-' the lengths of the parts before
    and after the first '-' are the low and high levels.
    """
    compact = price_text.replace(' ', '')
    if '-' not in compact:
        return len(compact), len(compact)
    parts = compact.split('-')
    return len(parts[0]), len(parts[1])


def _parse_restaurant_page(html):
    """Extract one dataset row from the HTML of a cached restaurant page.

    Returns a dict keyed by DATASET_COLUMNS. Raises (IndexError, KeyError,
    AttributeError or ValueError) when a required element is missing or not in
    the expected form; the caller treats that as "skip this page".
    """
    soup = BeautifulSoup(html)

    # URL of the English version of the page, read from the attribute itself
    # rather than by splitting the serialized tag on '"'.
    url = soup.find_all('link', hreflang="en")[0]['href']

    name = soup.find_all('h1', class_='HjBfq')[0].text

    # Overall rating, taken from the "<x.y> of 5 bubbles" text in the icon markup.
    overall_icon = soup.find_all('svg', class_='UctUV d H0')[0]
    rating = float(re.search(r"(\d+\.\d+) of 5 bubbles", str(overall_icon)).group(1))

    # Food / service / value ratings: the first number in each inner <span>,
    # divided by 10.
    sub_ratings = soup.find_all('span', class_='vzATR')
    rating_food, rating_service, rating_value = (
        float(re.search(r"\d+", str(bubble.span)).group(0))/10 for bubble in sub_ratings[:3]
    )

    review_num = int(soup.find_all('span', class_='AfQtZ')[0].text.replace(',', '').split(' ')[0])

    neighbor = soup.find_all('span', class_='yEWoV OkcwQ')[0].div.text

    # The first 'dlMOJ' link is the price label, the remaining ones are cuisines.
    price_cuisine = soup.find_all('a', class_='dlMOJ')
    price_level_low, price_level_high = _price_levels(price_cuisine[0].text)
    cuisine_sublist = [cuisine.text for cuisine in price_cuisine[1:]]

    # The tag cloud is optional; a page without one yields an empty list.
    # The first entry is skipped and the last character of each label dropped
    # (presumably a catch-all entry and a trailing separator - confirm).
    tag_group = soup.find('div', class_='ui_tagcloud_group')
    if tag_group is None:
        strong_tag_sublist = []
    else:
        strong_tag_sublist = [tag.text[:-1] for tag in tag_group.find_all('span', class_='ui_tagcloud')[1:]]

    # All review titles followed by all review excerpts, concatenated with no separator.
    comments = ''.join(title.text for title in soup.find_all('span', class_='noQuotes'))
    comments = comments + ''.join(entry.text for entry in soup.find_all('p', class_='partial_entry'))

    return {'Name': name, 'url': url, 'Rating': rating, 'Rating_food': rating_food,
            'Rating_service': rating_service, 'Rating_value': rating_value,
            'cuisine': cuisine_sublist, 'Review_num': review_num, 'Neighborhood': neighbor,
            'Price_low': price_level_low, 'Price_high': price_level_high,
            'Strong_tag': strong_tag_sublist, 'Comments': comments}


# Build the CSV only once; delete the file to force a rebuild from the cache.
if not os.path.exists(FULL_DATASET_DIR):

    records = []
    # os.listdir returns names in arbitrary order; sort so the row order (and
    # therefore the CSV index) is the same on every machine and every run.
    for rest_name in sorted(os.listdir(REST_DIR)):
        try:
            with open(REST_DIR + '/' + rest_name, 'r', encoding='utf-8') as f:
                html = f.read()
            records.append(_parse_restaurant_page(html))
        except Exception as exc:
            # Best effort: a page that cannot be parsed is reported and left
            # out. `Exception` (not a bare except) lets Ctrl-C still interrupt.
            print(rest_name+' skipped ('+type(exc).__name__+': '+str(exc)+')')

    # Passing `columns` keeps the header stable even when no page could be parsed.
    df = pd.DataFrame(records, columns=DATASET_COLUMNS)
    df.to_csv(FULL_DATASET_DIR)

else:
    print('Restaurant dataset already exists. No need to crawl.')

Restaurant dataset already exists. No need to crawl.


In [5]:
# Load the exported dataset back, using the first CSV column as the row index.
pd.read_csv(filepath_or_buffer=FULL_DATASET_DIR, index_col=0)

Unnamed: 0,Name,url,Rating,Rating_food,Rating_service,Rating_value,cuisine,Review_num,Neighborhood,Price_low,Price_high,Strong_tag,Comments
0,12 Chairs,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['Mediterranean', 'Middle Eastern', 'Israeli']",255,Downtown Manhattan (Downtown),2,3,"['brunch', 'hummus', 'shakshuka', 'sabich', 'l...",Delicious Israeli Food 😋Fantastic FindGood foo...
1,15 EAST @ Tocqueville,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['French', 'Japanese', 'Sushi']",451,Union Square,4,4,[],Always a Perfect MealFusion French-Jpanese foo...
2,230 Fifth,https://www.tripadvisor.com/Restaurant_Review-...,4.0,3.5,3.5,3.5,"['American', 'Bar', 'Fusion']",4019,Tenderloin,2,3,"['brunch', 'the empire state building', 'rooft...",Completely Miss soldThe most amazing view!230 ...
3,2nd Avenue Deli,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,3.5,"['Deli', 'Israeli', 'Vegetarian Friendly']",1198,Kips Bay,2,3,"['corned beef', 'matzo ball soup', 'coleslaw',...",Very good but service could be betterA must in...
4,2 Bros Pizza,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.0,5.0,"['Italian', 'American', 'Pizza']",247,Midtown,1,1,"['pizza', 'cheese slices', 'cheap lunch', 'new...",TastyGood cheap lunchJust okay.Best pizza!Deli...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
966,Zabar's,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.0,3.5,4.0,"['American', 'Deli', 'Vegetarian Friendly']",522,Upper West Side,1,1,"['bagels', 'cheese selection', 'smoked salmon'...",Yum to the maxVery TastyOverratedMaybe the bes...
967,Zen Ramen & Sushi,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['Japanese', 'Sushi', 'Asian']",164,Tenderloin,2,3,"['ramen', 'chicken', 'visit nyc', 'service was...",Food was not good!Fabulous FoodSo goodAwesome ...
968,Zero Otto Nove,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.0,"['Italian', 'Pizza', 'Southern-Italian']",268,Flatiron District,2,3,"['pasta', 'pie', 'arthur avenue', 'great itali...",What a findGreat Italian foodA gem in Gramercy...
969,Zoob Zib Thai Authentic Noodle Bar,https://www.tripadvisor.com/Restaurant_Review-...,4.5,4.5,4.5,4.5,"['Asian', 'Thai', 'Vegetarian Friendly']",238,Midtown West,2,3,"['pad thai', 'noodle soup', 'grilled pork', 'l...",We’ll worth a visitAuthentic Thai food that wo...
