## 연결

In [None]:
from urllib.request import urlopen

# Fetch the page and print its raw bytes. The ``with`` block closes the HTTP
# response once the body has been read instead of leaving the connection open.
with urlopen('http://pythonscraping.com/pages/page1.html') as html:
    print(html.read())

### 요청한 html파일 하나 읽기

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the page, parse it, and print the first <h1> tag found (None if the
# document has no <h1>). The response is closed as soon as it has been read.
with urlopen('https://namu.wiki/RecentChanges') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Same single-page read as the previous cell, against the book's sample page.
# The ``with`` block guarantees the response is closed after parsing.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)


### 오류 처리

In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

# Show the two ways urlopen() can fail. HTTPError must be listed before
# URLError because it is a subclass of URLError.
try:
    html = urlopen("https://namu.wiki/RecentChanges")
except HTTPError:
    print("The server returned an HTTP error")
except URLError:
    print("The server could not be found!")
else:
    # Success path only: print the body, then close the response.
    with html:
        print(html.read())

In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

# Same error-handling pattern against a host name that does not exist, so the
# URLError branch is the one expected to run.
try:
    html = urlopen("https://pythonscrapingthisurldoesnotexist.com")
except HTTPError:
    print("The server returned an HTTP error")
except URLError:
    print("The server could not be found!")
else:
    # Success path only: print the body, then close the response.
    with html:
        print(html.read())

In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def getTitle(url):
    """Return the first ``<h1>`` inside ``<body>`` of the page at *url*.

    Args:
        url: Address of the page to download.

    Returns:
        The ``<h1>`` tag, or ``None`` when the page cannot be fetched (HTTP
        error status, unknown host, refused connection, ...) or when the
        document has no ``<body>`` / no ``<h1>``.
    """
    # This cell only imports HTTPError; import URLError locally so the
    # function does not rely on an earlier cell having been run.
    from urllib.error import URLError

    try:
        # Read inside ``with`` so the response is closed even if read() fails.
        with urlopen(url) as response:
            markup = response.read()
    except (HTTPError, URLError):
        # Any fetch failure maps to the same "no title" result for the caller.
        return None
    try:
        # ``.body`` is None for a document without <body>; ``None.h1`` raises
        # AttributeError, which is reported as "no title" as well.
        return BeautifulSoup(markup, "lxml").body.h1
    except AttributeError:
        return None


# Fetch the sample page's title; getTitle() signals every failure with None,
# so an identity check against None is the correct test.
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)


In [None]:
#BeautifulSoup
#속성을 통해 태그 검색 
#태그목록 다루기
#트리 내비게이션 분석하기 

### find() findAll()

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Parse the "War and Peace" sample page once; the following cells query ``bs``.
# The response is consumed by the parser inside ``with`` and then closed.
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html, "html.parser")


In [None]:
# Find every <span class="green"> (second argument = attribute filter) and
# print its text. find_all() is the current name of the legacy findAll() alias.
nameList = bs.find_all('span', {'class': 'green'})
for name in nameList:
    print(name.get_text())

In [None]:
# A list of tag names matches any of them: collect headings of every level
# and print them as a plain list.
titles = bs.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
print(list(titles))

In [None]:
# Several accepted values for one attribute: spans whose class is green or red.
allText = bs.find_all('span', {'class': {'green', 'red'}})
print(list(allText))

In [None]:
# Count the text nodes whose content is exactly "the prince".
# ``string=`` is the current name of the deprecated ``text=`` argument, and
# find_all() the current name of findAll().
nameList = bs.find_all(string="the prince")
print(len(nameList))


In [None]:
# Keyword arguments filter on attributes: every tag with id="text".
allText = bs.find_all(id="text")
print(allText)

In [None]:
# Equivalent to bs.find_all(id="text"): an empty tag name places no
# restriction on the tag, and the dict filters on the id attribute.
allText = bs.find_all('', {'id': "text"})
print(allText)

In [None]:
# find_all() returns a list; take the first id="text" element and print only
# its text content (markup stripped).
allText = bs.find_all(id="text")
print(allText[0].get_text())


In [None]:
# select_one() takes a CSS selector and returns only the first match: here a
# <span> that is a direct child of the element with id="text".
allText = bs.select_one('#text > span')
print(allText)

In [None]:
# Same selector as the previous cell, but print only the matched tag's text.
allText = bs.select_one('#text > span')
print(allText.text)

In [None]:
# select() returns every match of the CSS selector; print each span's text.
allText = bs.select('#text > span')
for span in allText:
    print(span.text)

### 트리구조

tr 태그는 table 태그의 자식(children)입니다.
descendants는 바로 아래 자식뿐 아니라, 몇 단계 아래에 있든 그 밑의 모든 태그를 포함합니다.

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse the gift-list sample page; the response is closed once parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# .children yields only the nodes directly under the table with id="giftList".
for child in bs.find('table', {'id': 'giftList'}).children:
    print(child)

# i.e. this reads the table's <tr> rows



In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse the gift-list sample page; the response is closed once parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

for child in bs.find('table', {'id': 'giftList'}).descendants:
    print(child)

# How does .descendants differ from .children?
# .children reads only what sits directly under the table;
# .descendants reads everything beneath it, at any depth.

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse the gift-list sample page; the response is closed once parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Start at the table's first <tr> and walk the nodes that follow it.
for sibling in bs.find('table', {'id': 'giftList'}).tr.next_siblings:
    print(sibling)

# .next_siblings reads the nodes on the same level as that <tr>, after it --
# i.e. the remaining rows of the table (the first row itself is not included).

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse the gift-list sample page; the response is closed once parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Locate the image by its src, go up to its parent, step to the node just
# before that parent, and print that node's text.
print(bs.find('img',
              {'src': '../img/gifts/img1.jpg'})
      .parent.previous_sibling.get_text())

# i.e. finding a sibling of the image's parent

### 정규표현식

In [None]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

# Parse the gift-list sample page; the response is closed once parsed.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Raw string: "\." in a normal string literal is an invalid escape sequence
# (a warning on current Python versions). "/" needs no escaping in a regex.
images = bs.find_all('img', {'src': re.compile(r'\.\./img/gifts/img.*\.jpg')})
for image in images:
    print(image['src'])

# src is the image address
# ".*" matches any run of characters
# only tags whose src matches the expression are selected

In [None]:
# String handling and regular expressions
import re

fox = "tHe qUICk bROWn fOx."
print(fox.upper())
print(fox.lower())
print(fox.title())
line = ' this is the content '
print(line.strip())  # remove leading and trailing whitespace
num = "000000000000435"
print(num.strip('0'))  # strip the given character instead of whitespace

text = """100 John    PROF
101 James   STUD
102 Mac   STUD"""

# Patterns are raw strings so that "\s" and "\d" reach the regex engine
# untouched instead of being (invalid) string escape sequences.
print(re.split(r'\s+', text))  # split on runs of whitespace
print(re.findall(r'\d+', text))  # find the numbers
print(re.findall(r'[A-Z][a-z]+', text))  # capitalised words
print(re.findall(r'[A-Z]{2,}', text))  # all-caps words; {2,} = two or more
message = ' 민원실 전화번호는 02-730-5800 입니다. 또는 111-1111-1111로 연락바랍니다'
phone_num = re.compile(r'\d{2,3}-\d{3,4}-\d{4}')
for i in phone_num.findall(message):
    print(i)

### 네이버 크롤링

## 원하는 html 요소가 어디있는지 찾아주는 inspector 기능 사용법 
구글 개발자 도구를 사용
* 크롬 브라우저를 열고 "https://kin.naver.com/search/list.nhn?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC" 에 접속합니다.
* F12 버튼을 클릭
* 찾고자 하는 요소를 선택
* 구글 개발자 도구 상단 가장 왼쪽의 inspector 클릭
* 구글 개발자 도구가 찾고자 하는 요소의 html을 찾아줍니다.
* 해당 요소 html에 오른쪽 클릭을 한 후 Copy -> Copy Selector 를 선택해 줍니다.
* 클립보드에 css 선택자가 복사되었습니다.
* 코드에 적용합니다.
* 텍스트만 뽑아오고 싶다면 get_text() 함수를 이용하면 됩니다.

그리고 f12 버튼을 눌러보세요


In [None]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError
from bs4 import BeautifulSoup

def getText(url):
    """Return the link of the first search hit on the page at *url*.

    Args:
        url: Address of the search-result page to download.

    Returns:
        The first ``<a>`` matched by the result-list CSS selector, or ``None``
        when the page cannot be fetched (HTTP error status, unknown host,
        refused connection, ...) or nothing matches the selector.
    """
    try:
        # Read inside ``with`` so the response is closed even if read() fails.
        with urlopen(url) as response:
            markup = response.read()
    except (HTTPError, URLError):
        return None
    # Browser "Copy selector" produces ``li:nth-child(1)``; ``nth-of-type(1)``
    # is used instead -- presumably for compatibility with older
    # BeautifulSoup releases (TODO confirm). select_one() returns None when
    # nothing matches, so no exception handling is needed here.
    return BeautifulSoup(markup, 'html.parser').select_one(
        '#s_content > div.section > ul > li:nth-of-type(1) > dl > dt > a')


# Look up the first hit for the (URL-encoded) query; getText() signals every
# failure with None, so test identity against None.
alltext = getText("https://kin.naver.com/search/list.nhn?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC")
if alltext is None:
    print("Text could not be found")
else:
    print(alltext.get_text(), alltext.attrs['href'])


In [None]:
import requests
from bs4 import BeautifulSoup

# Same lookup as the urllib version above, using requests.
url = 'https://kin.naver.com/search/list.nhn?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC'

response = requests.get(url)

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    # Browser "Copy selector" produces li:nth-child(1); nth-of-type(1) is used here.
    alltext = soup.select_one('#s_content > div.section > ul > li:nth-of-type(1) > dl > dt > a')
    if alltext is None:
        # select_one() returns None when the selector matches nothing
        # (e.g. the page layout changed); report it instead of crashing.
        print("Text could not be found")
    else:
        print(alltext.get_text(), alltext.attrs['href'])
else:
    print(response.status_code)

## 예시

타이틀

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Download the search page and print its <title> text, prefixed with '**'.
# Any status other than 200 is reported by printing the status code.
url = 'https://kin.naver.com/search/list.nhn?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC'

response = requests.get(url)

if response.status_code != 200:
    print(response.status_code)
else:
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    alltext = soup.find('title')
    print('**', alltext.get_text())

여러건 태그 가져오기

In [None]:
import requests
from bs4 import BeautifulSoup
import re

# Print every question link in the result list, then every description that
# mentions the keyword and contains a <b> (highlight) tag.
url = 'https://kin.naver.com/search/list.nhn?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC'

response = requests.get(url)

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    atitle = soup.find('title')  # page <title>; not used further in this cell
    ul = soup.select_one('ul.basic1')
    if ul is None:
        # select_one() returns None when the list is missing (no hits or a
        # changed page layout); report it instead of failing on ul.select().
        print('Result list (ul.basic1) could not be found')
    else:
        for i in ul.select('li > dl > dt > a'):
            print('--', i.get_text(), i.attrs['href'])
        for i in ul.select('li > dl > dd'):
            if ('파이썬' in i.get_text()) and (i.find('b')):
                print('**', i.get_text())
else:
    print(response.status_code)

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# Build a table of search hits: question title, matching description, URL.
a = []  # question titles
b = []  # description of the same question ('' when none matches the filter)
c = []  # question URLs
url = 'https://kin.naver.com/search/list.nhn?query=%ED%8C%8C%EC%9D%B4%EC%8D%AC'

response = requests.get(url)

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    atitle = soup.find('title')  # page <title>; not used further in this cell
    ul = soup.select_one('ul.basic1')
    # One <dl> per search hit. A missing result list simply yields no rows.
    entries = ul.select('li > dl') if ul is not None else []
    for entry in entries:
        link = entry.select_one(':scope > dt > a')
        if link is None:
            continue
        # Descriptions are taken from the SAME entry as the link, so the three
        # lists stay row-aligned even when a hit has no matching description.
        previews = [
            dd.get_text()
            for dd in entry.select(':scope > dd')
            if ('파이썬' in dd.get_text()) and dd.find('b')
        ]
        a.append(link.get_text())
        b.append(previews[0] if previews else '')
        c.append(link.attrs['href'])
    # Keep the cell's output order: all links first, then the descriptions.
    for title, href in zip(a, c):
        print('--', title, href)
    for preview in b:
        if preview:
            print('**', preview)
else:
    print(response.status_code)

df = pd.DataFrame({'질문': a, '내용': b, '주소': c})
df

검색어 입력 받기

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Search for a user-supplied keyword and save the hits as CSV.
a = []  # question titles
b = []  # description of the same question ('' when none matches the filter)
c = []  # question URLs
url = 'https://kin.naver.com/search/list.nhn'
i_text = input('검색어를 입력하세요:')
# Pass the keyword through ``params`` so requests percent-encodes it; gluing
# raw input onto the URL breaks the query for input containing &, # or +.
response = requests.get(url, params={'query': i_text})

if response.status_code == 200:
    html = response.text  # response body as text
    soup = BeautifulSoup(html, 'html.parser')  # parse it with BeautifulSoup
    atitle = soup.find('title')  # page <title>; not used further in this cell
    ul = soup.select_one('ul.basic1')
    # One <dl> per search hit. A missing result list simply yields no rows.
    entries = ul.select('li > dl') if ul is not None else []
    for entry in entries:
        link = entry.select_one(':scope > dt > a')
        if link is None:
            continue
        # Descriptions are taken from the SAME entry as the link, so the three
        # lists stay row-aligned. The filter uses the typed keyword (i_text)
        # instead of a hard-coded string.
        previews = [
            dd.get_text()
            for dd in entry.select(':scope > dd')
            if (i_text in dd.get_text()) and dd.find('b')
        ]
        a.append(link.get_text())
        b.append(previews[0] if previews else '')
        c.append(link.attrs['href'])
    # Keep the cell's output order: all links first, then the descriptions.
    for title, href in zip(a, c):
        print('--', title, href)
    for preview in b:
        if preview:
            print('**', preview)
else:
    print(response.status_code)

df = pd.DataFrame({'질문': a, '내용': b, '주소': c})

# Writes to Google Drive: the drive.mount(...) cell must have been run first.
df.to_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")
df

In [None]:
# 연습하기

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Practice: search for a user-supplied keyword on a user-supplied result page.
a = []  # question titles
b = []  # description of the same question ('' when none matches the filter)
c = []  # question URLs
url = 'https://kin.naver.com/search/list.nhn'
i_text = input('검색어를 입력하세요:')
p_text = input('페이지 수 입력하세요:')
# Send keyword and page number as query parameters (?query=...&page=...);
# ``params`` percent-encodes them, which raw string concatenation does not.
response = requests.get(url, params={'query': i_text, 'page': p_text})

if response.status_code == 200:
    html = response.text  # response body as text
    soup = BeautifulSoup(html, 'html.parser')  # parse it with BeautifulSoup
    atitle = soup.find('title')  # page <title>; not used further in this cell
    ul = soup.select_one('ul.basic1')
    # One <dl> per search hit. A missing result list simply yields no rows.
    entries = ul.select('li > dl') if ul is not None else []
    for entry in entries:
        link = entry.select_one(':scope > dt > a')
        if link is None:
            continue
        # Descriptions are taken from the SAME entry as the link, so the three
        # lists stay row-aligned. The filter uses the typed keyword (i_text)
        # instead of a hard-coded string.
        previews = [
            dd.get_text()
            for dd in entry.select(':scope > dd')
            if (i_text in dd.get_text()) and dd.find('b')
        ]
        a.append(link.get_text())
        b.append(previews[0] if previews else '')
        c.append(link.attrs['href'])
    # Keep the cell's output order: all links first, then the descriptions.
    for title, href in zip(a, c):
        print('--', title, href)
    for preview in b:
        if preview:
            print('**', preview)
else:
    print(response.status_code)

df = pd.DataFrame({'질문': a, '내용': b, '주소': c})

# Writes to Google Drive: the drive.mount(...) cell must have been run first.
df.to_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")
df

In [None]:
# 연습하기

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Practice: same search, keyword and page number both read from the user.
a = []  # question titles
b = []  # description of the same question ('' when none matches the filter)
c = []  # question URLs
i_text = input('검색어를 입력하세요:')
page = input('페이지를 입력하세요:')

url = 'https://kin.naver.com/search/list.nhn'

# Send keyword and page number as query parameters (?query=...&page=...);
# ``params`` percent-encodes them, which formatting raw input into the URL
# does not.
response = requests.get(url, params={'query': i_text, 'page': page})

if response.status_code == 200:
    html = response.text  # response body as text
    soup = BeautifulSoup(html, 'html.parser')  # parse it with BeautifulSoup
    atitle = soup.find('title')  # page <title>; not used further in this cell
    ul = soup.select_one('ul.basic1')
    # One <dl> per search hit. A missing result list simply yields no rows.
    entries = ul.select('li > dl') if ul is not None else []
    for entry in entries:
        link = entry.select_one(':scope > dt > a')
        if link is None:
            continue
        # Descriptions are taken from the SAME entry as the link, so the three
        # lists stay row-aligned. The filter uses the typed keyword (i_text)
        # instead of a hard-coded string.
        previews = [
            dd.get_text()
            for dd in entry.select(':scope > dd')
            if (i_text in dd.get_text()) and dd.find('b')
        ]
        a.append(link.get_text())
        b.append(previews[0] if previews else '')
        c.append(link.attrs['href'])
    # Keep the cell's output order: all links first, then the descriptions.
    for title, href in zip(a, c):
        print('--', title, href)
    for preview in b:
        if preview:
            print('**', preview)
else:
    print(response.status_code)

df = pd.DataFrame({'질문': a, '내용': b, '주소': c})

# Writes to Google Drive: the drive.mount(...) cell must have been run first.
df.to_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")
df

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab package).
# The df.to_csv(...) calls in the cells above write under this mount point,
# so this cell has to be run before them.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
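# Search box element copied from the browser inspector (HTML, not Python code):
# <input id="input" type="search" autocomplete="off" spellcheck="false" role="combobox" placeholder="Google 검색 또는 URL 입력" aria-live="polite">
# CSS selector for that element: #input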