# **시카고 샌드위치 맛집 분석**
- 웹데이터 가져오기  
     - BeautifulSoup 사용
- 웹페이지에서 원하는 데이터 추출하고 정리하기  
     - pandas, numpy 사용
- 맛집 위치를 지도에 표시하기  
     - folium, googlemaps 사용

### 시카고 샌드위치 맛집 소개 사이트 접근하기

In [1]:
from urllib.request import Request, urlopen

# Request Chicago Magazine's "best sandwiches" list page, sending a
# browser-like User-Agent header along with the request.
req = Request(
    'https://www.chicagomag.com/chicago-magazine/november-2012/best-sandwiches-chicago/',
    headers={'User-Agent': 'Mozilla/5.0'},
)
# Read the raw page bytes and close the HTTP response deterministically
# instead of leaving it to the garbage collector.
with urlopen(req) as response:
    html = response.read()

In [2]:
from bs4 import BeautifulSoup

# Parse the downloaded page bytes with the 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')

### 접근한 웹 페이지에서 원하는 데이터 추출하고 정리하기

In [3]:
import re

# Use the first <div class="sammy"> on the page as a sample for working out
# how a single listing is laid out.
tmp_one = soup.find_all('div', 'sammy')[0]

# Plain text of the listing; its fields are separated by line breaks.
tmp_string = tmp_one.find(class_='sammyListing').get_text()

# Split on line breaks a single time and reuse the pieces (previously the
# same split was evaluated three times, once with its result thrown away).
tmp_lines = re.split('\n|\r\n', tmp_string)

print(tmp_lines[0])  # first line: menu item
print(tmp_lines[1])  # second line: shop name

BLT
Old Oak Tap


In [4]:
# The first anchor inside the sample listing holds the (relative) link to
# that sandwich's detail page.
first_anchor = tmp_one.find('a')
first_anchor['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [5]:
from urllib.parse import urljoin
# Site root that the relative detail-page links get joined onto.
url_base = 'https://www.chicagomag.com'

# One accumulator list per output column.
rank, main_menu, cafe_name, url_add = [], [], [], []

# Every <div class="sammy"> element on the page is one listing.
list_soup = soup.find_all('div', class_='sammy')

In [6]:
# Walk over every listing and collect its rank, menu, shop name and URL.
for entry in list_soup:
    rank.append(entry.find(class_='sammyRank').get_text())

    # The listing text carries the menu on its first line and the shop name
    # on its second line.
    tmp_string = entry.find(class_='sammyListing').get_text()
    listing_lines = re.split('\n|\r\n', tmp_string)
    main_menu.append(listing_lines[0])
    cafe_name.append(listing_lines[1])

    # Turn the relative href into an absolute URL.
    detail_href = entry.find('a')['href']
    url_add.append(urljoin(url_base, detail_href))

In [7]:
# Sanity check: the four lists should all be equally long.
tuple(len(column) for column in (rank, main_menu, cafe_name, url_add))

(50, 50, 50, 50)

In [8]:
import pandas as pd

# Bundle the four lists into a table, one column per list.
data = dict(Rank=rank, Menu=main_menu, Cafe=cafe_name, URL=url_add)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Rank,Menu,Cafe,URL
0,1,BLT,Old Oak Tap,https://www.chicagomag.com/Chicago-Magazine/No...
1,2,Fried Bologna,Au Cheval,https://www.chicagomag.com/Chicago-Magazine/No...
2,3,Woodland Mushroom,Xoco,https://www.chicagomag.com/Chicago-Magazine/No...
3,4,Roast Beef,Al’s Deli,https://www.chicagomag.com/Chicago-Magazine/No...
4,5,PB&L,Publican Quality Meats,https://www.chicagomag.com/Chicago-Magazine/No...


In [9]:
# Rebuild the table with the shop name placed ahead of the menu.
column_order = ['Rank', 'Cafe', 'Menu', 'URL']
df = pd.DataFrame(data, columns=column_order)
df.head(n=5)

Unnamed: 0,Rank,Cafe,Menu,URL
0,1,Old Oak Tap,BLT,https://www.chicagomag.com/Chicago-Magazine/No...
1,2,Au Cheval,Fried Bologna,https://www.chicagomag.com/Chicago-Magazine/No...
2,3,Xoco,Woodland Mushroom,https://www.chicagomag.com/Chicago-Magazine/No...
3,4,Al’s Deli,Roast Beef,https://www.chicagomag.com/Chicago-Magazine/No...
4,5,Publican Quality Meats,PB&L,https://www.chicagomag.com/Chicago-Magazine/No...


In [10]:
# Save the intermediate table as comma-separated, UTF-8 encoded text.
df.to_csv(path_or_buf='best_sandwiches_list_chicago.csv', sep=',', encoding='UTF-8')

### 50개 가게에 각각 연결된 웹 페이지에 자동으로 접근해서 원하는 정보 가져오기

In [11]:
# Trial run on a single shop: download the detail page of the first row.
req = Request(df['URL'][0], headers={'User-Agent': 'Mozilla/5.0'})
# Close the HTTP response deterministically once the body has been read.
with urlopen(req) as response:
    html = response.read()

soup_tmp = BeautifulSoup(html, 'html.parser')

In [12]:
# Show the raw <p class="addy"> element of the detail page.
print(soup_tmp.find('p', class_='addy'))

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>


In [13]:
# Drop the markup and keep only the text of the "addy" paragraph.
addy_tag = soup_tmp.find('p', class_='addy')
price_tmp = addy_tag.get_text()
price_tmp

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [14]:
# Break the text into whitespace-separated tokens.
price_tmp.split(sep=None)

['$10.', '2109', 'W.', 'Chicago', 'Ave.,', '773-772-0406,', 'theoldoaktap.com']

### tqdm 모듈을 사용하여 진행 상태 바 적용 및 진행 정도 확인하기

In [15]:
from tqdm import notebook

price = []
address = []

# Visit the detail page of every row; tqdm shows a progress bar meanwhile.
for n in notebook.tqdm(df.index):
    req = Request(df['URL'][n], headers={'User-Agent': 'Mozilla/5.0'})
    # Close each HTTP response right after reading it rather than leaving
    # one open response per page to the garbage collector.
    with urlopen(req) as response:
        html = response.read()

    # NOTE(review): the earlier cells parse with 'html.parser'; 'lxml' needs
    # the separate lxml package to be installed.
    soup_tmp = BeautifulSoup(html, 'lxml')

    # Tokenise the text of <p class="addy"> once and reuse the tokens.
    gettings = soup_tmp.find('p', 'addy').get_text()
    tokens = gettings.split()

    # The first token is the price followed by a period ('$10.' -> '$10').
    price.append(tokens[0][:-1])
    # Everything between the price and the last two tokens is kept as the
    # address; on the sample page those last two tokens are the phone number
    # and the website. The trailing comma of the address is kept as-is.
    address.append(' '.join(tokens[1:-2]))

  0%|          | 0/50 [00:00<?, ?it/s]

### 기존 작업한 DataFrame에 각 가게별 웹페이지에서 가져온 데이터 추가하기

In [16]:
# Attach the scraped values as two new columns.
df['Price'] = price
df['Address'] = address

# Keep the listed columns (the URL column is dropped) and use the rank as
# the row label.
kept_columns = ['Rank', 'Cafe', 'Menu', 'Price', 'Address']
df = df.loc[:, kept_columns].set_index('Rank')
df.head()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10,"2109 W. Chicago Ave.,"
2,Au Cheval,Fried Bologna,$9,"800 W. Randolph St.,"
3,Xoco,Woodland Mushroom,$9.50,"445 N. Clark St.,"
4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston,"
5,Publican Quality Meats,PB&L,$10,"825 W. Fulton Mkt.,"


In [17]:
# Write the enriched table to the same CSV file; the Rank index is written
# as the first column.
df.to_csv(path_or_buf='best_sandwiches_list_chicago.csv', sep=',', encoding='UTF-8')

### 맛집 위치를 지도에 표시하기

In [19]:
import folium
import googlemaps
import numpy as np
import pandas as pd

In [20]:
import os

# The Google Maps API key is a credential and must not live in the notebook
# source; it is read from the GOOGLE_MAPS_API_KEY environment variable.
# NOTE: the key that used to be hard-coded here should be treated as
# compromised and rotated.
gmaps_key = os.environ.get('GOOGLE_MAPS_API_KEY')
if not gmaps_key:
    raise RuntimeError(
        'Set the GOOGLE_MAPS_API_KEY environment variable to a Google Maps API key.'
    )
gmaps = googlemaps.Client(key=gmaps_key)

In [21]:
from tqdm import notebook
# Reload the saved table; the first CSV column becomes the index again.
csv_path = 'best_sandwiches_list_chicago.csv'
df = pd.read_csv(csv_path, index_col=0)

In [22]:
lat = []
lng = []

# Geocode every shop address; rows that cannot be geocoded get NaN
# coordinates so that lat/lng stay aligned with df's rows.
for n in notebook.tqdm(df.index):
    street = df.loc[n, 'Address']
    location_output = None

    # 'Multiple' marks entries without a single address. The isinstance
    # check covers an empty address, which pandas reads back from the CSV
    # as NaN (a float) and which cannot be concatenated with a string.
    if isinstance(street, str) and street != 'Multiple':
        target_name = street + ', ' + 'Chicago'
        gmaps_output = gmaps.geocode(target_name)
        # An empty result means no match; indexing [0] would raise
        # IndexError and abort the whole loop.
        if gmaps_output:
            location_output = gmaps_output[0].get('geometry')

    if location_output is None:
        lat.append(np.nan)
        lng.append(np.nan)
    else:
        lat.append(location_output['location']['lat'])
        lng.append(location_output['location']['lng'])

  0%|          | 0/50 [00:00<?, ?it/s]

In [23]:
# Add the geocoded coordinates as two new columns and display the table.
df = df.assign(lat=lat, lng=lng)

df

Unnamed: 0_level_0,Cafe,Menu,Price,Address,lat,lng
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Old Oak Tap,BLT,$10,"2109 W. Chicago Ave.,",41.895605,-87.679961
2,Au Cheval,Fried Bologna,$9,"800 W. Randolph St.,",41.884658,-87.647667
3,Xoco,Woodland Mushroom,$9.50,"445 N. Clark St.,",41.890523,-87.630783
4,Al’s Deli,Roast Beef,$9.40,"914 Noyes St., Evanston,",42.058322,-87.683748
5,Publican Quality Meats,PB&L,$10,"825 W. Fulton Mkt.,",41.886604,-87.648536
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,$7.25,100 E. Walton,41.900087,-87.625337
7,Acadia,Lobster Roll,$16,"1639 S. Wabash Ave.,",41.859054,-87.625201
8,Birchwood Kitchen,Smoked Salmon Salad,$10,"2211 W. North Ave.,",41.910203,-87.682875
9,Cemitas Puebla,Atomica Cemitas,$9,"3619 W. North Ave.,",41.909756,-87.717673
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,$17,"3267 S. Halsted St.,",41.83453,-87.645649


In [24]:
# Centre the map on the mean coordinate of the shops; Series.mean() skips
# NaN values by default, so rows without coordinates do not affect it.
mapping = folium.Map(location=[df['lat'].mean(), df['lng'].mean()], zoom_start=11)

for n in df.index:
    shop_lat = df.loc[n, 'lat']
    shop_lng = df.loc[n, 'lng']
    # Skip 'Multiple' entries and any other row that has no coordinates:
    # a marker cannot be placed at a NaN location.
    if df.loc[n, 'Address'] != 'Multiple' and pd.notna(shop_lat) and pd.notna(shop_lng):
        folium.Marker([shop_lat, shop_lng], popup=df.loc[n, 'Cafe']).add_to(mapping)

mapping