# 웹 크롤링 1/2
### 웹 페이지에서 필요한 정보 파싱
- The 50 Best Sandwiches in Chicago
- 리스트 가져오기
- 각각에 연결된 가격과 주소정보 가져오기
- 가게들의 주소를 지도에 맵핑

### Beautiful Soup : 웹페이지를 읽어오는 가장 보편적인 패키지

In [91]:
from bs4 import BeautifulSoup 
from urllib.request import Request, urlopen

In [92]:
# Article URL, found by searching Google for '50 Best Sandwiches'.
url = 'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'
# Earlier attempt, kept for reference: fetch the page without custom request headers.
# html = urlopen(url)
# soup = BeautifulSoup(html, "lxml")

In [93]:
# Fetch the article with an explicit browser-like User-Agent header
# (presumably the site rejects urllib's default one -- TODO confirm).
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Use the response as a context manager so the connection is closed
# as soon as the body has been read.
with urlopen(req) as response:
    html = response.read()
# Parse the raw HTML bytes with the lxml parser.
soup = BeautifulSoup(html, "lxml")

In [94]:
# The page's <title> element (the tag itself, not just its text).
html_title = soup.title
html_title

<title>The 50 Best Sandwiches in Chicago – Chicago Magazine</title>

In [95]:
tag_name = soup.title.name
tag_name

'title'

In [96]:
p_tag_name = soup.title.parent.name
p_tag_name

'head'

In [97]:
# Text content of the <title> tag, without the surrounding markup.
tag_text = soup.title.string
tag_text

'The 50 Best Sandwiches in Chicago – Chicago Magazine'

In [98]:
soup.title.getText()

'The 50 Best Sandwiches in Chicago – Chicago Magazine'

In [99]:
soup.title.get_text()

'The 50 Best Sandwiches in Chicago – Chicago Magazine'

In [100]:
print(tag_text)

The 50 Best Sandwiches in Chicago – Chicago Magazine


In [101]:
soup.div

<div class="grid-toggle-wrap"><a class="grid-toggle">.</a></div>

In [102]:
# Collect every <div> element of the parsed page.
div_tags = soup.find_all('div')

In [103]:
len(div_tags)

258

In [104]:
div_tags[0]

<div class="grid-toggle-wrap"><a class="grid-toggle">.</a></div>

In [105]:
type(div_tags)

bs4.element.ResultSet

In [106]:
# print(soup.find_all('div', 'sammy'))

In [107]:
len(soup.find_all('div', 'sammy'))

50

In [108]:
# print(soup.find_all('div', 'sammyListing'))

In [109]:
# print(soup.find_all('div', 'sammyRank'))

In [110]:
# All <div class="sammyRank"> elements; show the text of the first one.
tmp = soup.find_all('div', 'sammyRank')
tmp[0].get_text()

'1'

In [111]:
# One <div class="sammy"> per list entry; each wraps the rank <div> and the
# listing link. Display the first entry to inspect its structure.
listOfsoup = soup.find_all('div', 'sammy')
listOfsoup[0]

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>

In [112]:
# Rank <div> of the last entry. Index listOfsoup directly instead of reading
# `item`, which is a loop variable that no earlier cell has defined yet.
listOfsoup[-1].find('div', 'sammyRank')

<div class="sammyRank">50</div>

In [113]:
# Gather the rank text of every entry, in page order.
rank = []

for item in listOfsoup:
    rank_div = item.find('div', 'sammyRank')
    rank.append(rank_div.get_text())

# Peek at the first ten ranks.
rank[:10]

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10']

In [114]:
# Text of the first <a> inside the second entry.
listOfsoup[1].find("a").get_text()

'Fried Bologna\nAu Cheval\nRead more '

In [115]:
# href attribute of the first entry's link.
listOfsoup[0].find("a")['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

### 정규식 : Regular Expression 모듈

In [34]:
import re

In [67]:
# Link text of the first entry; used below to try out the newline split.
tmpString = str(listOfsoup[0].find("a").get_text())
tmpString

'BLT\nOld Oak Tap\nRead more '

### # \n or \r\n 으로 들어가는 부분을 구분

In [68]:
# Split the link text wherever a '\n' or '\r\n' line break occurs.
re.split(('\n|\r\n'), tmpString)

['BLT', 'Old Oak Tap', 'Read more ']

In [69]:
re.split(('\n|\r\n'), tmpString)[0]

'BLT'

In [70]:
re.split(('\n|\r\n'), tmpString)[1]

'Old Oak Tap'

In [71]:
listOfsoup[10].find("a")["href"]

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Lula-Cafe-Ham-and-Raclette-Panino/'

In [120]:
# Walk the entries once and fill four parallel lists:
# rank text, menu name, cafe name and the link's raw href.
rank, mainMenu, cafeName, urlAdd = [], [], [], []

for item in listOfsoup:
    rank.append(item.find('div', 'sammyRank').get_text())

    anchor = item.find("a")
    # The link text has the form "<menu>\n<cafe>\nRead more ";
    # the first two pieces are the menu and the cafe.
    pieces = re.split('\n|\r\n', str(anchor.get_text()))
    mainMenu.append(pieces[0])
    cafeName.append(pieces[1])

    # href is stored as-is here, so relative and absolute links are mixed.
    urlAdd.append(anchor["href"])

In [73]:
rank[:5]

['1', '2', '3', '4', '5']

In [74]:
mainMenu[:5]

['BLT', 'Fried Bologna', 'Woodland Mushroom', 'Roast Beef', 'PB&L']

In [75]:
cafeName[:5]

['Old Oak Tap', 'Au Cheval', 'Xoco', 'Al’s Deli', 'Publican Quality Meats']

In [76]:
urlAdd[:10]

['/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Au-Cheval-Fried-Bologna/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Xoco-Woodland-Mushroom/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Als-Deli-Roast-Beef/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Publican-Quality-Meats-PB-L/',
 'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Hendrickx-Belgian-Bread-Crafter-Belgian-Chicken-Curry-Salad/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Acadia-Lobster-Roll/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Birchwood-Kitchen-Smoked-Salmon-Salad/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Cemitas-Puebla-Atomica-Cemitas/',
 '/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Nana-Grilled-Laughing-Bird-Shrimp-and-Fried-Oyster-Po-Boy/']

In [45]:
urlAdd[5][:4]

'http'

In [46]:
# Rebuild the four parallel lists, this time storing every link as an
# absolute URL.
rank = []

mainMenu = []
cafeName = []
urlAdd   = []

for item in listOfsoup:
    rank.append(item.find('div', 'sammyRank').get_text())

    # Look the link up once and reuse it for both the text and the href.
    anchor = item.find("a")

    # The link text has the form "<menu>\n<cafe>\nRead more ".
    lines = re.split('\n|\r\n', anchor.get_text())
    mainMenu.append(lines[0])
    cafeName.append(lines[1])

    href = anchor["href"]
    # Some hrefs are site-relative ('/Chicago-Magazine/...'); prefix those
    # with the site root and keep already-absolute links untouched.
    if not href.startswith('http'):
        href = 'http://www.chicagomag.com' + href
    urlAdd.append(href)

### # 가져온 데이터를 데이터프레임으로 정리한 후, 파일로 저장하기

In [47]:
import pandas as pd

# Pair each column label with the list collected for it, then build the frame.
column_labels = ('Rank', 'Menu', 'Cafe', 'URL')
column_values = (rank, mainMenu, cafeName, urlAdd)
data = dict(zip(column_labels, column_values))
df = pd.DataFrame(data)

In [48]:
df.head(10)

Unnamed: 0,Rank,Menu,Cafe,URL
0,1,BLT,Old Oak Tap,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Fried Bologna,Au Cheval,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Woodland Mushroom,Xoco,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Roast Beef,Al’s Deli,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,PB&L,Publican Quality Meats,http://www.chicagomag.com/Chicago-Magazine/Nov...
5,6,Belgian Chicken Curry Salad,Hendrickx Belgian Bread Crafter,https://www.chicagomag.com/Chicago-Magazine/No...
6,7,Lobster Roll,Acadia,http://www.chicagomag.com/Chicago-Magazine/Nov...
7,8,Smoked Salmon Salad,Birchwood Kitchen,http://www.chicagomag.com/Chicago-Magazine/Nov...
8,9,Atomica Cemitas,Cemitas Puebla,http://www.chicagomag.com/Chicago-Magazine/Nov...
9,10,Grilled Laughing Bird Shrimp and Fried Po’ Boy,Nana,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [49]:
# Rebuild the frame with an explicit column order: Rank, Cafe, Menu, URL.
df = pd.DataFrame(data, columns=['Rank','Cafe','Menu','URL'])
df.head(5)

Unnamed: 0,Rank,Cafe,Menu,URL
0,1,Old Oak Tap,BLT,http://www.chicagomag.com/Chicago-Magazine/Nov...
1,2,Au Cheval,Fried Bologna,http://www.chicagomag.com/Chicago-Magazine/Nov...
2,3,Xoco,Woodland Mushroom,http://www.chicagomag.com/Chicago-Magazine/Nov...
3,4,Al’s Deli,Roast Beef,http://www.chicagomag.com/Chicago-Magazine/Nov...
4,5,Publican Quality Meats,PB&L,http://www.chicagomag.com/Chicago-Magazine/Nov...


In [50]:
from pathlib import Path

# to_csv does not create missing directories, so make sure 'data/' exists
# before writing; otherwise a fresh checkout fails with an OSError.
csv_path = Path('data/chicagomag_info.csv')
csv_path.parent.mkdir(parents=True, exist_ok=True)
# The DataFrame index is written too, as the first (unnamed) column.
df.to_csv(csv_path, sep=',', encoding='UTF-8')

In [51]:
%ls data\c*.csv

 C 드라이브의 볼륨에는 이름이 없습니다.
 볼륨 일련 번호: 78D5-6279

 C:\Users\student\Anaconda_src\S11_웹크롤링\data 디렉터리

2022-06-09  오후 12:45             7,715 chicagomag_info.csv
2022-06-08  오후 03:08            10,444 chicagomag_info_v2.csv
               2개 파일              18,159 바이트
               0개 디렉터리  204,804,837,376 바이트 남음
