In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns 
import matplotlib.font_manager as fm 
font_name = fm.FontProperties(fname="C:\\Windows\\Fonts\\malgun.ttf").get_name()
plt.rc("font",family=font_name)
mpl.rcParams["axes.unicode_minus"] = False

# XML

---

In [2]:
import xml.etree.ElementTree as elemTree

In [3]:
"""
XML을 parsing하는 방법 <xml 접근>
______________________
1. xml이 파일로 존재하는 경우 : parse()
2. xml이 문자열 형태로 존재하는 경우 : fromstring()
"""

# 트리 구조로 파악    /  노드 - 텍스트 노드(단말 노드)

'\nXML을 parsing하는 방법 <xml 접근>\n______________________\n1. xml이 파일로 존재하는 경우 : parse()\n2. xml이 문자열 형태로 존재하는 경우 : fromstring()\n'

In [4]:
tree = elemTree.parse("data/users.xml")
tree

<xml.etree.ElementTree.ElementTree at 0x1d90304c308>

In [5]:
# xml이 문자열로 존재하는 경우
xmlstr = """<?xml version="1.0" encoding="utf-8" ?>
<users>
    <user grade="gold">
            <name>Kim Cheol Soo</name>
            <age>25</age>
            <birthday>19940215</birthday>
        </user>
        <user grade="diamond">
            <name>Kim Yoo Mee</name>
            <age>21</age>
            <birthday>19980417</birthday>
        </user>
</users>
"""
tree = elemTree.fromstring(xmlstr)
tree

<Element 'users' at 0x000001D903071098>

In [6]:
# 태그 검색 및 데이터 다루기

user = tree.find("user[1]")
dir(user)   # attrib ,get ,tag
print(user.tag)
print(user.attrib)
print(user.get("grade"))

name = user.find("name")
print(name.text)

age = user.find("age")
print(age.text)

user
{'grade': 'gold'}
gold
Kim Cheol Soo
25


In [7]:
# 여러 개의 태그를 한꺼번에 가져오기 
users = tree.findall("user")
users

for user in users:
    print(user.attrib)
    print(user.find("name").text)

{'grade': 'gold'}
Kim Cheol Soo
{'grade': 'diamond'}
Kim Yoo Mee


In [8]:
tree = elemTree.parse("data/users.xml")
users = tree.getroot()
print(users.tag)
print(users.attrib)

user = users.find("user")
print(user)
print(user.get("grade"))
print(user.keys())
print(user.items())

users
{}
<Element 'user' at 0x000001D903080908>
gold
['grade']
[('grade', 'gold')]


# Json(java Script Object Notation)
---
        - dumps : 데이터를 저장할 때 
        - loads : 데이터를 불러올 때 
        

In [9]:
import json 

In [10]:
j1 ={"name":"홍길동","birth":"0524","age":20}
print(type(j1))
print(j1)

# json형식으로 변환 ( 문자열로 변환)
j2 = json.dumps(j1)                    
print(type(j2))                    
print(j2)

<class 'dict'>
{'name': '홍길동', 'birth': '0524', 'age': 20}
<class 'str'>
{"name": "\ud64d\uae38\ub3d9", "birth": "0524", "age": 20}


In [11]:
json.dumps([1,2,3])

'[1, 2, 3]'

In [12]:
# 파이썬으로 불러오기
j3 = json.loads(j2)
print(type(j3))                    
print(j3)

<class 'dict'>
{'name': '홍길동', 'birth': '0524', 'age': 20}


In [13]:
obj = """
{
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
"""

type(obj)

str

In [14]:
result = json.loads(obj)
type(result)
result["id"]
result["batters"]["batter"][0]["id"]

'1001'

##  웹소스 읽기

In [15]:
from urllib.request import urlopen

In [16]:
# urlopen으로 파일에 저장 

html = urlopen('http://google.com')
print(type(html))
print(html.read())

<class 'http.client.HTTPResponse'>
b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="ko"><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>Google</title><script nonce="IxzPlKwVynEdFD5ANgYo9g==">(function(){window.google={kEI:\'VLLgXoKXK4O-wAOvy6iQBg\',kEXPI:\'0,202123,3,4,32,1151585,5662,730,224,5104,207,3204,10,1226,364,1499,576,241,383,246,5,959,395,196,1496,2,97,1121,330,388,248,4,133,138,145,116,3,22,577,1122750,1197705,444,78,329040,1294,12383,4855,32691,15248,867,19397,9287,9188,8384,4859,1361,9291,3026,4741,11033,1808,4020,978,7931,5297,2054,920,873,1217,1710,1,1264,6430,11306,3221,4517,2778,919,2277,8,85,2711,219,1374,1279,390,1822,530,149,1103,840,518,1521,4258,312,1137,2,2669,1839,184,1777,520,1704,243,245,1984,93,328,1284,16,2927,2247,473,1339,748,1039,3227,2845,7,5599,469,6286,4454,642,2043,1,406,2458,1226,1743,3653,1275,10

In [17]:
# 예외 처리
from urllib.error import HTTPError, URLError


try:
    html = urlopen('http://naver.com/index.jsp')
except HTTPError as err:
    print("Http 에러입니다 : " +str(err))
except URLError as err:
    print("URL 에러입니다. : "+str(err))
else:
    print(html.read())

Http 에러입니다 : HTTP Error 404: Not Found


In [18]:
# 이미지 다운로드  -p.8
from urllib.request import urlretrieve

urlretrieve("https://t1.daumcdn.net/daumtop_chanel/op/20170315064553027.png",
           "data/daum.png")   # urlretrieve(이미지다운받을 주소(이미지주소복사))
print('저장되었습니다')




저장되었습니다


In [19]:
# urlopen으로 이미지다운    (메모리에만 저장 / 파일로 저장할려면 따로 저장)
img = urlopen("https://www.google.com/images/branding/googlelogo/2x/googlelogo_color_92x30dp.png")
img = img.read()

f = open("data/google.png","wb")
f.write(img)
f.close()
print("저장되었습니다")

저장되었습니다


In [20]:
# urlencode
import urllib.parse

api = "https://www.weather.go.kr/weather/forecast/mid-term-rss3.jsp"
value = {"stnId":109}

params = urllib.parse.urlencode(value)   # dict형인 ":" 를 "=" 으로 변환
params

url = api + "?" + params

data = urlopen(url).read()
data
data = data.decode('utf-8')
print(data)

<?xml version="1.0" encoding="utf-8" ?>
<rss version="2.0">
<channel>
<title>기상청 육상 중기예보</title>
<link>http://www.kma.go.kr/weather/forecast/mid-term_02.jsp</link>
<description>기상청 날씨 웹서비스</description>
<language>ko</language>
<generator>기상청</generator>
<pubDate>2020년 06월 10일 (수)요일 18:00</pubDate>
 <item>
<author>기상청</author>
<category>육상중기예보</category>
<title>서울,경기도 육상 중기예보 - 2020년 06월 10일 (수)요일 18:00 발표</title>
<link>http://www.kma.go.kr/weather/forecast/mid-term_02.jsp</link>
<guid>http://www.kma.go.kr/weather/forecast/mid-term_02.jsp</guid>
<description>
	<header>
		<title>서울,경기도 육상중기예보</title>
		<tm>202006101800</tm>
		<wf><![CDATA[○ (강수) 13일(토)은 비가 오겠습니다.<br />○ (기온) 13일(토)~20일(토) 낮 기온은 24~30도로 오늘(10일, 29~35도)보다 낮겠습니다.<br />○ (해상) 서해중부해상 물결은 1.0~2.0m로 일겠습니다.<br />○ (주말전망) 13일(토) 흐리고 비가 오겠고, 14일(일) 오전에는 구름많다가 오후에 맑겠습니다. 아침 기온은 18~20도, 낮 기온은 25~28도의 분포가 되겠습니다.<br /><br />* 13일(토)은 저기압의 영향으로 서울.인천.경기도에는 비가 오겠고, 돌풍과 함께 천둥.번개를 동반한 많은 비가 내릴 가능성도 있겠으니, 앞으로 발표되는 기상정보를 

## BeautifulSoup

In [21]:
from bs4 import BeautifulSoup

In [22]:
page = open('data/test_first.html').read()
print(type(page))

soup = BeautifulSoup(page,"html.parser")
print(soup.prettify())

<class 'str'>
<!DOCTYPE html>
<html>
 <head>
  <title>
   Very Simple HTML Code by PinkWink
  </title>
 </head>
 <body>
  <div>
   <p class="inner-text first-item" id="first">
    Happy PinkWink.
    <a href="http://www.pinkwink.kr" id="pw-link">
     PinkWink
    </a>
   </p>
   <p class="inner-text second-item">
    Happy Data Science.
    <a href="https://www.python.org" id="py-link">
     Python
    </a>
   </p>
  </div>
  <p class="outer-text first-item" id="second">
   <b>
    Data Science is funny.
   </b>
  </p>
  <p class="outer-text">
   <b>
    All I need is Love.
   </b>
  </p>
 </body>
</html>


In [23]:
list(soup.children)
list(soup.children)[0]
list(soup.children)[1]
html = list(soup.children)[2]
html

<html><head>
<title>Very Simple HTML Code by PinkWink</title>
</head>
<body>
<div>
<p class="inner-text first-item" id="first">
                Happy PinkWink.
                <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
</p>
<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>
</div>
<p class="outer-text first-item" id="second">
<b>
                Data Science is funny.
            </b>
</p>
<p class="outer-text">
<b>
                All I need is Love.
            </b>
</p>
</body>
</html>

In [24]:
list(html.children)


[<head>
 <title>Very Simple HTML Code by PinkWink</title>
 </head>,
 '\n',
 <body>
 <div>
 <p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>
 </div>
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>
 </body>,
 '\n']

In [25]:
body = list(html.children)[2]
body
html.body
soup.html.body
list(body.children)


['\n',
 <div>
 <p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>
 </div>,
 '\n',
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 '\n',
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>,
 '\n']

In [26]:
# 태그명으로 접근 
soup.find("p")
soup.find_all("p")  

[<p class="inner-text first-item" id="first">
                 Happy PinkWink.
                 <a href="http://www.pinkwink.kr" id="pw-link">PinkWink</a>
 </p>,
 <p class="inner-text second-item">
                 Happy Data Science.
                 <a href="https://www.python.org" id="py-link">Python</a>
 </p>,
 <p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [27]:
#태그명과 클래스명으로 접근
soup.find_all('p',class_="outer-text")

[<p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>,
 <p class="outer-text">
 <b>
                 All I need is Love.
             </b>
 </p>]

In [28]:
#id로 접근
soup.find_all(id="second")

[<p class="outer-text first-item" id="second">
 <b>
                 Data Science is funny.
             </b>
 </p>]

In [29]:
soup.head.next_sibling.next_sibling              #next_sibling / previous_sibling
body.p.next_sibling.next_sibling

<p class="inner-text second-item">
                Happy Data Science.
                <a href="https://www.python.org" id="py-link">Python</a>
</p>

In [30]:
# 데이터(Text Node) 가져오기 
soup.html.get_text()
soup.head.get_text()
soup.title.get_text()
soup.div.get_text()
soup.p.get_text()

for p in soup.find_all("p"):
    print(p.get_text())


                Happy PinkWink.
                PinkWink


                Happy Data Science.
                Python



                Data Science is funny.
            



                All I need is Love.
            



In [31]:
# 속성값에 접근하기
links = soup.find("a")
links
links['href']

links = soup.find_all("a")
for link in links:
    href= link["href"]
    text= link.string
    print(text, "=>",href)

PinkWink => http://www.pinkwink.kr
Python => https://www.python.org


#  실습

In [32]:
# 네이버 환율 정보 가져오기
url = 'https://finance.naver.com/marketindex/'
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')

span = soup.find_all('span',class_='value')
print('달러환율 : ',span[0].get_text())
print('위안화환율 : ',span[3].get_text())
print('엔화환율 : ',span[4].get_text())

# div로 접근할 때 
div = soup.find_all("div",class_= "head_info")
span1 = div[0].find_all("span")
print(span1[0].get_text())

#css select 사용
span2 = soup.select_one("div.head_info > span.value").string  # class 속성 : . / id속성 : # 으로 접근
print(span2)

달러환율 :  1,191.00
위안화환율 :  168.60
엔화환율 :  107.7100
1,191.00
1,191.00


In [33]:
# 윤동주 시인의 작품을 추출
url = 'https://ko.wikisource.org/wiki/%EC%A0%80%EC%9E%90:%EC%9C%A4%EB%8F%99%EC%A3%BC'
page = urlopen(url)
soup=BeautifulSoup(page, "html.parser")

div= soup.find_all("div",class_="mw-parser-output")
ul= div[0].find_all("ul")
ul1=ul[0].find_all("ul")

for i in ul1[0]:
    if i != "\n":
        print(i.string)

서시
자화상
소년
눈 오는 지도
돌아와 보는 밤
병원
새로운 길
간판 없는 거리
태초의 아침
또 태초의 아침
새벽이 올 때까지
무서운 시간
십자가
바람이 불어
슬픈 족속
눈감고 간다
또 다른 고향
길
별 헤는 밤


In [34]:
# 스크랩핑 연습 : http://www.pythonscraping.com/pages/warandpeace.html

In [35]:
url= 'http://www.pythonscraping.com/pages/warandpeace.html'
page = urlopen(url)
soup=BeautifulSoup(page,"html.parser")

# 녹색단어만 골라오기

# green = soup.find_all('span',class_='green')
# for g in green:
#     print(g.string)
    
# css 사용

# green = soup.select('div#text > span.green')
# for g in green:
#     print(g.string)
        
# green = soup.find_all('span',{"class":"green"})
# for g in green:
#     print(g.string)

# h태그
titles = soup.find_all(["h1","h2"])
# print(titles)
print([title.get_text() for title in titles])

# 녹색과 적색 단어 추출
green_red = soup.find_all("span",{"class":{"green","red"}})
print([text for text in green_red])

['War and Peace', 'Chapter 1']
[<span class="red">Well, Prince, so Genoa and Lucca are now just family estates of the
Buonapartes. But I warn you, if you don't tell me that this means war,
if you still try to defend the infamies and horrors perpetrated by
that Antichrist- I really believe he is Antichrist- I will have
nothing more to do with you and you are no longer my friend, no longer
my 'faithful slave,' as you call yourself! But how do you do? I see
I have frightened you- sit down and tell me all the news.</span>, <span class="green">Anna
Pavlovna Scherer</span>, <span class="green">Empress Marya
Fedorovna</span>, <span class="green">Prince Vasili Kuragin</span>, <span class="green">Anna Pavlovna</span>, <span class="green">St. Petersburg</span>, <span class="red">If you have nothing better to do, Count [or Prince], and if the
prospect of spending an evening with a poor invalid is not too
terrible, I shall be very charmed to see you tonight between 7 and 10-
Annette Scherer.</span

In [36]:
url = 'http://www.pythonscraping.com/pages/page3.html'
page = urlopen(url)
soup = BeautifulSoup(page,"html.parser")

# 제목행을 건너 뛰고 나머지 모든 행 추출   # siblings - 나머지다 가져옴
# for tr in soup .find('table',{"id":"giftList"}).tr.next_siblings:
#     print(tr)
    
# 가격 중에  $15.00 수집

td = soup.find("tr",id="gift1").td.next_sibling.next_sibling
td.get_text()

# 이미지에서 $15.00 접근하기 

td1 = soup.find("img",{"src":"../img/gifts/img1.jpg"}).parent.previous_sibling
td1.get_text()

'\n$15.00\n'

In [37]:
# 영화 리뷰
review_list = []
for i in range(1,6):
    target = urlopen("https://movie.daum.net/moviedb/grade?movieId=94484&type=netizen&page={}".format(i))
    soup = BeautifulSoup(target,"html.parser")
    
    reviews = soup.select('p.desc_review')
    for review in reviews:
        str = review.get_text().strip()               #strip():공백제거
        review_list.append(str)

f = open('data/아쿠아맨.txt','w')
for review in review_list:
    f.write(review +"\n")
    
f.close()