In [None]:
import os
import re
import time
from typing import Tuple

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from IPython.display import display
from tqdm import tqdm

### ラーメンデータベースの店舗一覧ページをクローリングする

In [None]:
# The page count for the "二郎系" tag was checked by eye: pages 1 through 71 exist,
# so they are crawled in order.
os.makedirs('htmls', exist_ok=True)  # the output directory may not exist yet on a fresh run
for n in range(1,72):
    url = 'https://ramendb.supleks.jp/search?order=point&type=0&station-id=0&tags=3&page={}'.format(n)
    # A timeout keeps one stalled request from hanging the whole crawl.
    result = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page that the
    # scraping step would later fail to parse.
    result.raise_for_status()
    c = result.content
    # Store the crawl result in the local directory as-is.
    # NOTE(review): the file is written with the platform default encoding,
    # which is also how the scraping cell reads it back.
    with open('htmls/jiro_{}.html'.format(n),'w') as f:
        f.write(c.decode('UTF-8'))
    # Pause between requests, after the file has been closed.
    time.sleep(3)

### スクレイピング1
* 店舗一覧ページの中から、店舗名、都道府県、レビュースコア、レビュー数、営業ステータス（営業中・移転・閉店）、店舗URLを取得する

In [None]:
# Saved listing pages directly under htmls/ (entries whose name ends in "html"), sorted by path.
files = sorted('htmls/' + entry for entry in os.listdir('htmls/') if re.search('html$', entry))

In [None]:
# The listing data also contains shops that have already moved or closed,
# so those need to be told apart from the rest.
def check_business_status(s):
    """
    Classify one shop entry as moved, closed, or open.

    `s` is an element exposing ``find(tag, attrs)``. The entry is looked up
    for a ``span`` with class ``status_plate moved`` first, then
    ``status_plate retire``.

    Returns "移転" (moved), "閉店" (closed), or "営業" (open) when neither
    status plate is present.
    """
    plate_labels = (
        ('status_plate moved', "移転"),
        ('status_plate retire', "閉店"),
    )
    for plate_class, label in plate_labels:
        if s.find('span', {'class': plate_class}):
            return label
    return "営業"

In [None]:
# Scrape every saved listing page into a single pandas DataFrame.
# Rows are collected in a plain list and the frame is built once at the end:
# concatenating inside the loop copies the growing frame on every page and
# leaves repeated index labels (0..k for each page).
shop_columns = ['name', 'pref', 'review_score', 'review_num', 'status', 'url']
records = []

for file in files:
    with open(file,'r') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    summary = soup.find('div', {'class':'wrap'})
    # Each <li class="border-box"> inside the wrapper is one shop entry.
    for s in summary.find_all('li',{'class':'border-box'}):
        records.append({
            'name': s.find('div',{'class':'name'}).find('h4').text,
            'pref': s.find('div',{'class':'area'}).find('a').text,
            'review_score': s.find('div',{'class':'point-val'}).text,
            'review_num': s.find('div',{'class':'val'}).text,
            'status': check_business_status(s),
            # hrefs are site-relative, so prepend the host
            'url': 'https://ramendb.supleks.jp' + s.find('a',{'class':'bglink'}).get('href'),
        })

# Passing columns explicitly keeps the schema stable even when no entries were found.
df = pd.DataFrame(records, columns=shop_columns)

In [None]:
# Show the frame's dimensions followed by its first rows.
display(df.shape, df.head())

In [None]:
# Number of shops per business status.
df.status.value_counts()

In [None]:
# Number of shops per `pref` value, restricted to rows whose status is "営業".
df.loc[df['status'] == "営業", 'pref'].value_counts()

### クローリング2

* 各店舗の住所を取得するために、各店舗のURLをクローリングしてhtmlを取得する

In [None]:
# Download the detail page of every shop whose status is "営業".
os.makedirs('htmls/shop_detail', exist_ok=True)  # the output directory may not exist yet
open_shops = df[df['status']=='営業']
# itertuples() is a generator, so tqdm needs the total passed explicitly.
for i, row in enumerate(tqdm(open_shops.itertuples(), total=len(open_shops))):
    name = row.name
    url = row.url
    # A timeout keeps one stalled request from hanging the whole crawl.
    result = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as a shop page.
    result.raise_for_status()
    c = result.content
    # "/" is stripped from the shop name because it is a path separator. The
    # running number keeps shops that share a name from overwriting each other.
    with open('htmls/shop_detail/{:05d}_{}.html'.format(i, name.replace("/","")),'w') as f:
        f.write(c.decode('UTF-8'))
    # Pause between requests, after the file has been closed.
    time.sleep(1)

### スクレイピング2

* 各店舗のhtmlファイルから住所を取り出す

In [None]:
# The crawling cell saves shop detail pages under htmls/shop_detail/, so they are
# listed from that same directory. Only .html entries are kept so that stray
# files in the directory are not handed to the parser.
files = sorted(['htmls/shop_detail/'+ x for x in os.listdir('htmls/shop_detail/') if x.endswith('.html')])
file = files[0]
file

In [None]:
# Map each shop URL to the address text found on its saved detail page.
result_dic = {}
for file in files:
    with open(file,'r') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    # The link inside <h1> carries the shop's site-relative href.
    shop_href = soup.find('h1').find('a').get('href')
    url = 'https://ramendb.supleks.jp' + shop_href
    address_span = soup.find('div',{'class':'datas'}).find('span',{'itemprop':'address'})
    # Keep only the text that precedes the 'このお店は' marker.
    result_dic[url] = address_span.text.split('このお店は')[0]

In [None]:
# Turn the url -> address mapping into a two-column pandas DataFrame.
address_df = pd.DataFrame.from_dict(result_dic, orient='index')
address_df = address_df.reset_index()  # move the URL keys out of the index into a column
address_df.columns = ['url', 'address']
address_df.head()

In [None]:
# Merge the addresses into the listing frame built earlier (rows with status "営業" only).
# "url" is the one column the two frames share, so it is named as the join key.
active_df = df[df['status'] == '営業'].reset_index(drop=True).merge(address_df, on='url')
active_df.head()

In [None]:
# Persist the open shops with their addresses, without the index column.
active_df.to_csv(path_or_buf="active_shops.csv", index=False)

In [None]:
def coordinate(address: str, dest_url: str='http://www.geocoding.jp/api/') -> Tuple[str, str]:
    """
    Look up the latitude and longitude of an address.

    The address is sent as the ``q`` query parameter to ``dest_url`` and the
    response is searched for ``<lat>`` and ``<lng>`` elements.

    Args:
        address: Address string to geocode.
        dest_url: Endpoint of the geocoding API.

    Returns:
        A ``(latitude, longitude)`` tuple of strings. When the response has no
        usable ``<lat>``/``<lng>`` pair, a message is printed and
        ``('0', '0')`` is returned.

    >>> coordinate('東京都文京区本郷7-3-1')
    ('35.712056', '139.762775')
    """
    payload = {'q': address}
    html = requests.get(dest_url, params=payload)
    soup = BeautifulSoup(html.content, "html.parser")
    lat_tag = soup.find('lat')
    lng_tag = soup.find('lng')
    # Treat a missing or empty element on either axis as a failed lookup,
    # rather than failing with AttributeError on a half-formed response.
    if lat_tag is None or lng_tag is None or lat_tag.string is None or lng_tag.string is None:
        print(f"Invalid address submitted. {address}")
        return ('0', '0')
    # Convert to plain str so the result does not keep the parsed document alive.
    return (str(lat_tag.string), str(lng_tag.string))

In [None]:
# Geocode each distinct address only once. Repeated addresses would cost extra
# API calls and leave duplicate rows in the TSV, which then multiply shop rows
# when the coordinates are merged back on "address".
addresses = active_df.address.drop_duplicates().values

In [None]:
# Geocode every address and append one "address<TAB>lat<TAB>lon" line per lookup.
# The file is written as UTF-8 explicitly: it is read back with pd.read_csv,
# which decodes as UTF-8 by default, while open() would otherwise use the
# platform default encoding.
with open('address_lat_lon.tsv', 'w', encoding='utf-8') as f:
    for address in tqdm(addresses):
        lat, lon = coordinate(address)
        f.write(f"{address}\t{lat}\t{lon}\n")
        # Flush so finished lookups are on disk if this long loop is interrupted.
        f.flush()
        # Wait between API calls.
        time.sleep(10)

In [None]:
# Load the geocoding results; the TSV has no header row, so column names are supplied here.
address_df = pd.read_csv(
    "address_lat_lon.tsv",
    sep='\t',
    header=None,
    names=["address","lat","lon"],
)
print(address_df.shape)
address_df.head()

In [None]:
def clean_address(ad):
    """
    Strip the postal code and the trailing building name from an address.

    A leading "〒NNN-NNNN " postal code (followed by one space) is removed,
    then everything from the first remaining space onward is dropped.

    Args:
        ad: Address string.

    Returns:
        The shortened address string.
    """
    # Raw string: "\-" in a normal string literal is an invalid escape sequence.
    ad = re.sub(r"〒[0-9]{3}\-[0-9]{4} ","",ad)
    # What follows the first space is taken to be the building name.
    ad = ad.split(" ")[0]
    return ad

# Cleanse the addresses of the points whose coordinate lookup failed (lat recorded as 0).
# .copy() makes fail_df an independent frame, so adding a column does not write
# into a slice of address_df (SettingWithCopyWarning).
fail_df = address_df.query("lat == 0").copy()
fail_df["cleansed_address"] = fail_df.address.apply(clean_address)
fail_df.head()

In [None]:
# Send the cleansed addresses to the API once more.
# Different raw addresses can cleanse to the same string. Each one is queried
# only once, so the result file has one row per cleansed address.
address_fail = fail_df.cleansed_address.drop_duplicates()

# Written as UTF-8 explicitly: the file is read back with pd.read_csv, which
# decodes as UTF-8 by default, while open() would otherwise use the platform
# default encoding.
with open('address_lat_lon_fail.tsv', 'w', encoding='utf-8') as f:
    for address in tqdm(address_fail):
        lat, lon = coordinate(address)
        f.write(f"{address}\t{lat}\t{lon}\n")
        # Flush so finished lookups are on disk if this long loop is interrupted.
        f.flush()
        # Wait between API calls.
        time.sleep(10)

In [None]:
# Minor data reshaping: attach the retry results to the original (uncleansed) addresses.
address_df = pd.read_csv("address_lat_lon.tsv", sep='\t',names=["address","lat","lon"])
print(address_df.shape)
display(address_df.head())

# Rows whose first lookup failed (lat recorded as 0). .copy() makes this an
# independent frame, so adding a column does not write into a slice of address_df.
fail_df = address_df[address_df.lat == 0].copy()
fail_df["cleansed_address"] = fail_df.address.apply(clean_address)
print(fail_df.shape)
display(fail_df.head())
fail_result_df = pd.read_csv("address_lat_lon_fail.tsv", sep="\t",names=["cleansed_address","lat","lon"])
# Keep one result row per cleansed address; repeated rows would multiply the
# matches in the merge below.
fail_result_df = fail_result_df.drop_duplicates(subset="cleansed_address")
# Swap the cleansed address back for the original one so the result lines up with address_df.
fail_df = fail_df[["address","cleansed_address"]].merge(fail_result_df,on="cleansed_address")[["address","lat","lon"]]
print(fail_df.shape)
display(fail_df.head())

In [None]:
# Stack the rows with a non-zero lat from the first pass and from the retry pass.
geocoded_first_pass = address_df[address_df.lat != 0]
geocoded_on_retry = fail_df[fail_df.lat != 0]
merged_address_df = pd.concat([geocoded_first_pass, geocoded_on_retry])
print(merged_address_df.shape)
display(merged_address_df.head())

In [None]:
# Reload the shop table saved earlier as active_shops.csv.
active_df = pd.read_csv(filepath_or_buffer="active_shops.csv")
print(active_df.shape)
display(active_df.head())

In [None]:
# Attach coordinates to the shops by address. The inner join drops shops with
# no coordinates in merged_address_df. The coordinate table is reduced to one
# row per address first, otherwise a repeated address would multiply the shop
# rows that match it.
latlon_by_address = merged_address_df.drop_duplicates(subset="address")
active_df = active_df.merge(latlon_by_address, on="address", how="inner")
# NOTE(review): unlike active_shops.csv, this file is written with the index column.
active_df.to_csv("active_shops_with_latlon.csv")