## Collecting data from a remote server

In [7]:
# BeautifulSoup lookup helpers used in this notebook:
#   parsed_html.find('tag_name')      -> the first matching tag
#   parsed_html.find_all('tag_name')  -> a list of every matching tag
import urllib.request  # sends the request to the remote server
from bs4 import BeautifulSoup  # parses the returned HTML

# URL to request
url = 'https://www.naver.com/index.html'

# Fetch the remote document. The `with` block closes the HTTP response
# (and its underlying socket) even if reading the body fails.
with urllib.request.urlopen(url) as res:
    data = res.read()  # raw response body as bytes, not text

# Decode the bytes into text
src = data.decode('utf-8')

# Parse the HTML text into a navigable tree
html = BeautifulSoup(src, 'html.parser')

# Look up the first <a> tag. find() returns None when the page has no
# such tag, so guard the attribute access instead of crashing.
a = html.find('a')
print('a tag: ', a)
print('a tag info: ', a.string if a is not None else None)

a tag:  <a href="#newsstand"><span>뉴스스탠드 바로가기</span></a>
a tag info:  뉴스스탠드 바로가기


## Collecting data from a local file

In [17]:
# Read the local HTML file. The `with` block closes the file handle as soon
# as its text has been loaded, instead of leaving it open for the session.
with open('/Users/followthesnake/Desktop/data/date.html', mode='r', encoding='utf-8') as file:
    text = file.read()

html = BeautifulSoup(text, 'html.parser')

# Hierarchical access: walk down the tree tag by tag
h1 = html.html.body.h1
print('h1 : ', h1.string)  # .string is the text wrapped by the tag
print(h1)  # printing the tag itself shows the markup as well

# find('tag') returns the first matching tag
h2 = html.find('h2')
print('h2: ', h2.string)

# find_all('tag') returns a list of tags, so `.string` cannot be read from
# the result directly; read it from each element instead.
lis = html.find_all('li')
print('li: ', lis)

for li in lis:
    print(li.string)

h1 :  시멘틱 태
<h1>시멘틱 태</h1>
h2:  주요 시멘틱 태그
li:  [<li> headr: 문서의 머리말(사이트 소개, 제목, 로그)</li>, <li> nav: 네비게이션(메뉴)</li>, <li> section: 웹 문서를 장(chapter)으로 볼 때 절을 구분하는 태그</li>, <li> aside: 문서의 보조 내용(광고, 즐겨찾기, 링크)</li>, <li> footer: 문서의 꼬리말(작성자, 저작권,개인정보보호)</li>]
 headr: 문서의 머리말(사이트 소개, 제목, 로그)
 nav: 네비게이션(메뉴)
 section: 웹 문서를 장(chapter)으로 볼 때 절을 구분하는 태그
 aside: 문서의 보조 내용(광고, 즐겨찾기, 링크)
 footer: 문서의 꼬리말(작성자, 저작권,개인정보보호)


## Collecting tag attributes

In [21]:
import re

from bs4 import BeautifulSoup  # for HTML parsing

# (1) Read the local file. The `with` block closes the handle once the
#     content has been loaded.
with open('/Users/followthesnake/Desktop/data/tagprop.html', mode='r', encoding='utf-8') as file:
    source = file.read()

# (2) Parse the HTML
html = BeautifulSoup(source, 'html.parser')

# (3) Collect every <a> tag; find_all() returns a list
links = html.find_all('a')
print('links size = ', len(links))

# (4) Read the attributes of each <a> tag. `attrs` is a dict, so a tag
#     that lacks the attribute raises KeyError; report it and move on.
for link in links:
    try:
        print(link.attrs['href'])
        print(link.attrs['target'])
    except KeyError as e:
        print('Exception: ', e)


# (5) Select tags whose href attribute matches a compiled regex
print('Finding property with object pattern')
patt = re.compile(r'http://')
links = html.find_all(href=patt)

print(links)

links size =  5
www.naver.com
Exception:  'target'
http://www.naver.com
Exception:  'target'
http://www.naver.com
_blank
www.daum.net
Exception:  'target'
http://www.daum.net
Exception:  'target'
Finding property with object pattern
[<a href="http://www.naver.com">Naver</a>, <a href="http://www.naver.com" target="_blank">Naver in New Browser</a>, <a href="http://www.daum.net">Daum</a>]
