This notebook scrapes review comments from Tabelog. It finds the top available shops for a given category in a given region, using a given search term (which doesn't seem to actually affect the results; 風 is used as a placeholder). The scraper class also works with other kinds of regions and restaurant types, but here we only use prefectures and ramen shops.

In [1]:
from bs4 import BeautifulSoup as bs
import requests
import numpy as np
import re
import pickle
import concurrent.futures
import threading
import time
import pandas as pd
import numpy as np
import nltk

In [11]:
class scraper():
    """Collect shop URLs and review texts from Tabelog listing pages.

    Creating an instance immediately requests the first listing page for the
    given region/genre/term in order to read the total shop count, so the
    constructor needs network access.

    Attributes:
        region, genre, term: constructor arguments; they are inserted into
            the listing URL and into the pickle file name.
        shop_count: number of shops reported by the first listing page.
        page_limit: number of listing pages available, capped at MAX_PAGES.
        urls: shop URLs gathered by search().
        reviews: maps a shop's index in ``urls`` to the value returned by
            scrape_reviews() for that shop.
        review_file: path of the pickle written by get_reviews().
    """

    # Page size and page cap assumed by this notebook for Tabelog listings
    # (60 pages * 20 shops = the 1200-shop ceiling reported by search()).
    SHOPS_PER_PAGE = 20
    MAX_PAGES = 60

    def __init__(self,region,genre,term):
        self.region=region
        self.term=term
        self.genre=genre
        response = requests.get(self._list_url(1))
        soup = bs(response.text,'lxml')
        # The third "c-page-count__num" span holds the total number of hits.
        self.shop_count = int(soup.find_all('span',class_='c-page-count__num')[2].find('strong').contents[0])
        # Round up so that a partially filled last page is still counted.
        full_pages, leftover = divmod(self.shop_count, self.SHOPS_PER_PAGE)
        self.page_limit = min(self.MAX_PAGES, full_pages + (1 if leftover else 0))
        self.urls=[]
        self.reviews={}
        self.review_file = f'{self.region}_{self.genre}_{self.term}.pickle'

    def _list_url(self, page_number):
        """Return the listing URL for the given 1-based result page."""
        return (f'https://tabelog.com/{self.region}/rstLst/{self.genre}/{page_number}/?SrtT=rt&sk={self.term}&Srt=D&sort_mode=1')

    def search(self,num_pages):
        """Append the shop URLs found on the first ``num_pages`` listing pages to ``self.urls``.

        If more pages are requested than are available, the request is
        clamped to ``self.page_limit`` and a notice is printed.
        """
        if num_pages > self.page_limit:
            num_pages = self.page_limit
            print(f'Number of pages requested is beyond the number available for {self.term} in {self.region}.')
            print(f'{self.page_limit} pages will be returned, totalling {min(self.shop_count,self.MAX_PAGES*self.SHOPS_PER_PAGE)} shops.')
        # Listing pages are 1-indexed (the constructor reads page 1), so walk
        # 1..num_pages; range(num_pages) would request a page 0 and never
        # reach the last page.
        for page_number in range(1, num_pages + 1):
            response = requests.get(self._list_url(page_number))
            soup = bs(response.text,'lxml')
            for h4 in soup.find_all('h4',class_='list-rst__rst-name'):
                self.urls.append(h4.find('a',
                    class_='list-rst__rst-name-target cpy-rst-name js-ranking-num').get('href'))

    def get_reviews(self):
        """Scrape the reviews of every shop in ``self.urls`` and pickle the result.

        Progress and a remaining-time estimate are printed before each shop.
        When all shops are done, the tuple ``(self.reviews, self.urls)`` is
        written to ``self.review_file``.

        Relies on scrape_reviews(), which is defined in a later cell of this
        notebook; that cell must have been run before calling this method.
        """
        start_time = time.time()
        for num,url in enumerate(self.urls):
            elapsed = time.time()-start_time
            print(f'After {elapsed}, {num} shops are done.')
            if num != 0: print(f'{elapsed*(len(self.urls)-num)/num} is estimated to remain.')
            self.reviews[num] = scrape_reviews(url)
        # The with-block closes the file; no explicit close() is needed.
        with open(self.review_file, 'wb') as to_write:
            pickle.dump((self.reviews,self.urls), to_write)

Uncomment prefectures as needed; every prefecture left active in the list is scraped by the loop in the following cell.

In [15]:
# Prefectures to scrape. Commented-out entries are skipped; uncomment them as
# needed. The list is closed exactly once, after the last entry (an extra "]"
# in the middle of the list made this cell a SyntaxError).
# Note: 'shizuoka', 'niigata', 'toyama' and 'ishikawa' also appear in the
# commented-out rows, so uncommenting those rows would list them twice.
prefectures = ['tokyo',
    #'kanagawa','osaka','saitama','chiba','hyogo','fukuoka','shizuoka',
    #'ibaraki','hiroshima','kyoto','niigata','miyagi','tochigi','fukushima',
    #'okayama','kumamoto','kagoshima','yamaguchi','ehime','nagasaki','shiga','nara','aomori',
    #'iwate','oita','ishikawa','yamagata','miyazaki','toyama','akita','wakayama','kagawa','saga',
    'aichi','fukui','yamanashi','hokkaido','nagano','gifu','gunma','mie','okinawa',
    'shizuoka','niigata','toyama','ishikawa',
    #'tokushima','kochi','shimane','tottori'
]

In [None]:
# Scrape every selected prefecture in turn: collect shop URLs from up to 200
# listing pages (search() clamps this to what is available), then fetch and
# pickle the reviews. get_reviews() calls scrape_reviews(), which is defined
# in a later cell, so that cell has to be run before this one.
for prefecture in prefectures:
    prefecture_scraper = scraper(region=prefecture, genre='ramen', term='風')
    prefecture_scraper.search(num_pages=200)
    prefecture_scraper.get_reviews()
    # Drop the finished scraper before building the next one.
    del prefecture_scraper

In [6]:
# Per-thread storage; each thread sees its own ``session`` attribute.
thread_local = threading.local()


def get_session():
    """Return the calling thread's requests.Session, creating it on first use.

    Not called by any code in the visible cells of this notebook.
    """
    try:
        return thread_local.session
    except AttributeError:
        # First call on this thread: no session has been stored yet.
        thread_local.session = requests.Session()
        return thread_local.session

# Everything stripped from a review: HTML tags, any whitespace, literal
# backslash-n sequences, periods and slashes. Python's ``re`` has no POSIX
# classes, so whitespace must be spelled ``\s`` ("[[:space:]]" is parsed as a
# character set followed by a literal "]" and never matches whitespace).
_REVIEW_NOISE = re.compile(r'(<[^>]*>|\s|\\n|\.|\/)')


def scrape_reviews(url):
    """Return the cleaned review texts linked from a shop's first review page.

    Args:
        url: URL of the shop page; the review-list path is appended to it.

    Returns:
        A list of review strings in the order the links appear on the page.
        Reviews whose page could not be fetched or parsed are left out and a
        short notice is printed for each failure.
    """
    reviews_page = url+'/dtlrvwlst/?use_type=0&rvw_part=all&lc=1'
    response = requests.get(reviews_page)
    soup = bs(response.text,'lxml')
    review_links = soup.find_all('a',class_='rvw-item__title-target')
    if not review_links:
        return []

    # Each worker returns its own text instead of appending to a shared
    # module-level list, which keeps the output order deterministic.
    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        futures = [executor.submit(scrape_page, link) for link in review_links]

    collected = []
    for link, future in zip(review_links, futures):
        try:
            review = future.result()
        except Exception as error:
            # Best effort: one broken review page must not abort the shop.
            print(f'Skipping review {link.get("href")}: {error!r}')
            continue
        if review is not None:
            collected.append(review)
    return collected


def scrape_page(review_link):
    """Download one review page and return its cleaned comment text.

    Args:
        review_link: the anchor element of a review; its ``href`` is read.

    Returns:
        The review text with markup, whitespace, periods and slashes removed,
        or None when the page has no review comment paragraph.
    """
    href = review_link.get('href')
    if href.startswith(('http://', 'https://')):
        review_url = href
    else:
        # Avoid a double slash when the href is already root-relative.
        review_url = 'https://www.tabelog.com/' + href.lstrip('/')
    review_response = requests.get(review_url)
    review_soup = bs(review_response.text, 'lxml')
    comment = review_soup.find('div',class_='rvw-item__rvw-comment')
    paragraph = comment.find('p') if comment is not None else None
    if paragraph is None:
        return None
    return ''.join(_REVIEW_NOISE.sub('', str(line)) for line in paragraph.contents)

This part was added after the rest and run separately to get more data. In the future, it could probably be part of the same class.

In [None]:
# Per-thread storage; each thread sees its own ``session`` attribute.
thread_local = threading.local()

# Filled by identity_scraper(): shop URL -> [name, rating, lat, long].
identity = dict()


def get_session():
    """Return the calling thread's requests.Session, creating it on first use."""
    if hasattr(thread_local, "session"):
        return thread_local.session
    # First call on this thread: build the session and remember it.
    thread_local.session = requests.Session()
    return thread_local.session

def identity_scraper(url):
    """Fetch a shop page and record its name, rating and map coordinates.

    The result is stored in the module-level ``identity`` dict as
    ``identity[url] = [name, rating, lat, long]``; nothing is returned.
    ``rating`` is a float, while ``lat`` and ``long`` are kept as the strings
    found in the map image URL.
    """
    page = bs(requests.get(url).text, 'lxml')

    # Shop name with newlines and spaces removed.
    name_span = page.find('h2', class_='display-name').find('span')
    shop_name = name_span.contents[0].replace('\n', '').replace(' ', '')

    score_text = page.find('span', class_='rdheader-rating__score-val-dtl').contents[0]
    score = float(score_text)

    # The coordinates are read from the fifth "&"-separated piece of the map
    # image's data-original URL, as "key=lat,long".
    map_image = page.find_all('img', class_='js-map-lazyload rstinfo-table__map-image')[0]
    coordinate_piece = map_image.get('data-original').split('&')[4]
    lat, long = coordinate_piece.split('=')[1].split(',')

    identity[url] = [shop_name, score, lat, long]
prefecture_list = ['aichi','fukui','yamanashi','nagano','gifu',
                   'shizuoka','niigata','toyama','ishikawa']
for prefecture in prefecture_list:
    print(prefecture)
    # NOTE: pickle.load can run arbitrary code; only load files that were
    # produced by this notebook.
    with open(f'{prefecture}_ramen_風.pickle', 'rb') as to_read:
        payload = pickle.load(to_read)
    # The file holds either a (reviews, urls) tuple or only the urls. Decide
    # by inspecting the payload rather than by catching a failed unpacking,
    # which would misread a two-element URL list as (reviews, urls).
    if isinstance(payload, tuple) and len(payload) == 2:
        reviews_dict, urls = payload
    else:
        urls = payload

    with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
        pending = {executor.submit(identity_scraper, shop_url): shop_url
                   for shop_url in urls}
        for future in concurrent.futures.as_completed(pending):
            error = future.exception()
            if error is not None:
                # Best effort: report the shop that failed instead of
                # dropping the error silently, then carry on.
                print(f'Skipping {pending[future]}: {error!r}')

    # Rewritten after every prefecture so partial progress is kept on disk.
    with open('identity.pickle', 'wb') as to_write:
        pickle.dump(identity, to_write)

    

