# HTTP 

In [1]:
from urllib import robotparser

In [2]:
robot = robotparser.RobotFileParser()
robot.set_url("https://www.google.com/robots.txt")
robot.read()
robot.can_fetch('*', '/search/')

False

In [3]:
from urllib import request

resp = request.urlopen("https://www.google.com")
resp.geturl() # response header에 있는 url
resp.reason  # 받았으면 ok, 못받으면 message 출력 / not found
resp.getcode() # 200 을 받아야 제대로 응답 / 404
print(resp.info()) # meta 정보들

resp.getheaders() # response의 header 출력

Date: Fri, 12 Jul 2019 01:08:25 GMT
Expires: -1
Cache-Control: private, max-age=0
Content-Type: text/html; charset=ISO-8859-1
P3P: CP="This is not a P3P policy! See g.co/p3phelp for more info."
Server: gws
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN
Set-Cookie: 1P_JAR=2019-07-12-01; expires=Sun, 11-Aug-2019 01:08:25 GMT; path=/; domain=.google.com
Set-Cookie: NID=187=RDAe-Fh6fQrZ4jJSWZTX0P_lqi3M3g6orIHKGWyz_1WwLrYsF8de1NZsEbq4O1UqZ3uXzyCMC8-WADKfYTGkpVmbUt_1v6DvC6dpOwtKk6VxhjuOECkCEzekkh_VmKUSoeq5yv-Wyc5H_QNwXBCxTCKrYTGL-ZpuEaEB012dCDQ; expires=Sat, 11-Jan-2020 01:08:25 GMT; path=/; domain=.google.com; HttpOnly
Alt-Svc: quic=":443"; ma=2592000; v="46,43,39"
Accept-Ranges: none
Vary: Accept-Encoding
Connection: close




[('Date', 'Fri, 12 Jul 2019 01:08:25 GMT'),
 ('Expires', '-1'),
 ('Cache-Control', 'private, max-age=0'),
 ('Content-Type', 'text/html; charset=ISO-8859-1'),
 ('P3P', 'CP="This is not a P3P policy! See g.co/p3phelp for more info."'),
 ('Server', 'gws'),
 ('X-XSS-Protection', '0'),
 ('X-Frame-Options', 'SAMEORIGIN'),
 ('Set-Cookie',
  '1P_JAR=2019-07-12-01; expires=Sun, 11-Aug-2019 01:08:25 GMT; path=/; domain=.google.com'),
 ('Set-Cookie',
  'NID=187=RDAe-Fh6fQrZ4jJSWZTX0P_lqi3M3g6orIHKGWyz_1WwLrYsF8de1NZsEbq4O1UqZ3uXzyCMC8-WADKfYTGkpVmbUt_1v6DvC6dpOwtKk6VxhjuOECkCEzekkh_VmKUSoeq5yv-Wyc5H_QNwXBCxTCKrYTGL-ZpuEaEB012dCDQ; expires=Sat, 11-Jan-2020 01:08:25 GMT; path=/; domain=.google.com; HttpOnly'),
 ('Alt-Svc', 'quic=":443"; ma=2592000; v="46,43,39"'),
 ('Accept-Ranges', 'none'),
 ('Vary', 'Accept-Encoding'),
 ('Connection', 'close')]

In [4]:
print(resp.getcode())
print(resp.reason)

200
OK


In [5]:
resp = request.urlopen('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&')
resp.code, resp.reason
# 헤더를 지정하지 않아서 google에서 frobidden

HTTPError: HTTP Error 403: Forbidden

## error handling

In [6]:
from urllib import error

In [7]:
try:
    resp = request.urlopen('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&')
except error.HTTPError as e:
        print(e.code, e.reason, e.headers)

403 Forbidden Content-Type: text/html; charset=UTF-8
Date: Fri, 12 Jul 2019 01:10:54 GMT
Server: gws
Cache-Control: private
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN
Alt-Svc: quic=":443"; ma=2592000; v="46,43,39"
Accept-Ranges: none
Vary: Accept-Encoding
Connection: close




## User-Agent
bot 이 아니라 클라이언트가 접속한 것 처럼 보이게 해야 한다.

user-agent를 수정한다

In [8]:
header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"}

In [9]:
try:
    req = request.Request('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&', headers=header)
    resp = request.urlopen(req)
except error.HTTPError as e:
    print(e.code, e.reason, e.headers)

In [10]:
resp.code, resp.reason, req.headers

(200,
 'OK',
 {'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'})

In [11]:
tmp = resp.read().decode('utf-8')


In [12]:
from urllib import parse

In [13]:
parse.urlparse('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&')
parse.urljoin('https://www.google.com/search?q=%EB%B0%95%EB%B3%B4%EC%98%81&', '/search/about')
parse.urlencode({'q':'파이썬'})
parse.quote_plus('파이썬')
parse.unquote_plus('%EB%B0%95%EB%B3%B4%EC%98%81')              

'박보영'

In [14]:
import requests

In [15]:
url = 'http://httpbin.org/get'
param = {'key':'value'}
resp = requests.request('GET', url, params=param)

In [16]:
print(resp.text)

{
  "args": {
    "key": "value"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.20.0"
  }, 
  "origin": "163.152.3.135, 163.152.3.135", 
  "url": "https://httpbin.org/get?key=value"
}



In [17]:
url = 'http://httpbin.org/post'
param = {'key':'value'}
resp = requests.request('post', url, data=param)
print(resp.text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "key": "value"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Content-Length": "9", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.20.0"
  }, 
  "json": null, 
  "origin": "163.152.3.135, 163.152.3.135", 
  "url": "https://httpbin.org/post"
}



In [18]:
import time

In [19]:
def download(method, url, header=None, param=None, data=None, timeout=1, maxretries=3):
    try:
        resp = requests.request(method, url, headers=header, params=param, data=data)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= e.response.status_code < 600 and maxretries>0:
            time.sleep(timeout)
            print('시도 {}'.format(maxretries))
            download(method, url, header, param, data, timeout, maxretries-1)
        else:
            print(e.response.status_code)
            print(e.response.reason)
    return resp

In [20]:
download('get', 'https://www.google.com', header)

<Response [200]>

In [21]:
resp = download('get', 'https://www.crawler-test.com/status_codes/status_403', header=header)

403
Forbidden


In [22]:
resp = download('get', 'https://www.crawler-test.com/status_codes/status_500', header=header)

시도 3
시도 2
시도 1
500
Internal Server Error


In [23]:
import json
resp = download('get', 'https://www.httpbin.org/get', param={"key":"value"})
obj = json.loads(resp.text)

In [24]:
obj

{'args': {'key': 'value'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Host': 'www.httpbin.org',
  'User-Agent': 'python-requests/2.20.0'},
 'origin': '163.152.3.135, 163.152.3.135',
 'url': 'https://www.httpbin.org/get?key=value'}

# 예제

In [25]:
url = "http://openapi.airkorea.or.kr/openapi/services/rest/ArpltnInforInqireSvc/getCtprvnRltmMesureDnsty"
# C4W9%252Fd8IlGwY%252Bevhcji%252B7%252FvFdLfOATYrbASRCle%252FAfnz%252BnNhefYyUkc1vibsttgJ1vKr8xBTktP%252BjoR35OGHYA%253D%253D
# 으로 자동 인코딩됨. 중복 인코딩된다
# 따라서 역으로 변환해줘야 함

params ={
    "ServiceKey" : "C4W9%2Fd8IlGwY%2Bevhcji%2B7%2FvFdLfOATYrbASRCle%2FAfnz%2BnNhefYyUkc1vibsttgJ1vKr8xBTktP%2BjoR35OGHYA%3D%3D",    
#     "ServiceKey" : requests.compat.unquote("C4W9%2Fd8IlGwY%2Bevhcji%2B7%2FvFdLfOATYrbASRCle%2FAfnz%2BnNhefYyUkc1vibsttgJ1vKr8xBTktP%2BjoR35OGHYA%3D%3D"),
    "sidoName" : "서울",
    "_returnType" : "JSON",
}

In [26]:
resStr = download('get', url, param=params)

In [27]:
resStr.headers

{'Date': 'Fri, 12 Jul 2019 01:10:13 GMT', 'Set-Cookie': 'WMONID=R06EM0Fxuz6; Expires=Sat, 11-Jul-2020 10:11:6 GMT; Path=/, JSESSIONID=2YReY9lc+y7RkRUjP65vzkIL; Path=/openapi', 'Content-Language': 'ko-KR', 'Content-Length': '190', 'Connection': 'close', 'Content-Type': 'text/xml;charset=utf-8'}

In [28]:
resObj = resStr.json()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
[{items['stationName']:items['pm25Value']} for items in resObj['list']]

-------------

In [None]:
## get
url = "https://search.naver.com/search.naver"
params = {'query':'박보영'}
resStr = download('get', url, param=params)

In [None]:
## post
url = "http://www.kyobobook.co.kr/search/SearchCommonMain.jsp"
params = {'vPstrCategory':'TOT', 'vPstrKeyWord':'박보영'.encode('euc-kr'), 'vPplace':'top'}
resStr = download('post', url, data=params)

In [None]:
resStr.request.body

In [None]:
resStr.text.find('아트와')

In [None]:
url = "http://pythonscraping.com/pages/cookies/login.html"
requests.compat.urljoin(url, "welcome.php")
# requests.compat.urlparse(url)

In [None]:
url = "http://pythonscraping.com/pages/cookies/login.html"
url = requests.compat.urljoin(url, "welcome.php")
data = {
    "username" : "asdadaaaas",
    "password" : "password"
}
html = download('post', url, data=data)
pprint(html.text)

In [None]:
session = requests.Session()

In [None]:
html = session.post(requests.compat.urljoin(url, "welcome.php"), data)
html.text

In [None]:
session.post(requests.compat.urljoin(url, "welcome.php"))
html.text

In [None]:
url = "https://cyber.inu.ac.kr/login.php"
url = requests.compat.urljoin(url, "login/index.php")
data = {
    "username" : "201401438",
    "password" : "cksdl951!!"
}
html = download('post', url, data=data)
pprint(html.text)

In [None]:
url = "https://lms.sunde41.net/auth/login"
url = requests.compat.urljoin(url, "/auth/login")
data = {
    "email" : "skarnd9511@hanmail.net",
    "password" : "cksdl95"
}
html = download('post', url, data=data)
pprint(html.text)

-------------------

In [38]:
from html import unescape
from html import escape
import re

In [49]:
url = "https://www.google.com/"
url = requests.compat.urljoin(url, "/search")
param = {
    "q" : "박보영"
}
html = download('get', url, param=param)
html.text

'<!doctype html><html lang="ko"><head><meta charset="UTF-8"><meta content="/images/branding/googleg/1x/googleg_standard_color_128dp.png" itemprop="image"><title>&#48149;&#48372;&#50689; - Google &#44160;&#49353;</title><script nonce="dQz+hXZXD/oROOM7K6Fmdg==">(function(){var a=window.performance;window.start=(new Date).getTime();a:{var b=window;if(a){var c=a.timing;if(c){var d=c.navigationStart,e=c.responseStart;if(e>d&&e<=window.start){window.start=e;b.wsrt=e-d;break a}}a.now&&(b.wsrt=Math.floor(a.now()))}}window.google=window.google||{};google.aft=function(f){f.setAttribute("data-iml",+new Date)};}).call(this);(function(){var c=[],e=0;window.ping=function(b){-1==b.indexOf("&zx")&&(b+="&zx="+(new Date).getTime());var a=new Image,d=e++;c[d]=a;a.onerror=a.onload=a.onabort=function(){delete c[d]};a.src=b};}).call(this);</script><style>body{margin:0 auto;max-width:736px;padding:0 8px}a{color:#1967D2;text-decoration:none;tap-highlight-color:rgba(0,0,0,.1)}a:visited{color:#4B11A8}a:hover{te

In [37]:
result = re.findall('r\B<div class="r"><a href="(.+?)" .+?>\B<h3 .+?>(.+?)</h3>', unescape(html.text))
result

[]

In [52]:
url = "https://search.naver.com/search.naver"
param = {
    "query" : "박보영"
}
html = download('get', url, param=param)
unescape(html.text)

'<!doctype html> <html lang="ko"> <head> <meta charset="utf-8"> <meta name="referrer" content="always">  <meta name="format-detection" content="telephone=no,address=no,email=no"> <meta name="viewport" content="width=device-width,initial-scale=1.0,maximum-scale=2.0"> <meta property="og:title" content="박보영 : 네이버 통합검색"/> <meta property="og:image" content="https://ssl.pstatic.net/sstatic/search/common/og_v3.png"> <meta property="og:description" content="\'박보영\'의 네이버 통합검색 결과입니다."> <meta name="description" lang="ko" content="\'박보영\'의 네이버 통합검색 결과입니다."> <title>박보영 : 네이버 통합검색</title> <link rel="shortcut icon" href="https://ssl.pstatic.net/sstatic/search/favicon/favicon_140327.ico">  <link rel="search" type="application/opensearchdescription+xml" href="https://ssl.pstatic.net/sstatic/search/opensearch-description.https.xml" title="Naver" /><link rel="stylesheet" type="text/css" href="https://ssl.pstatic.net/sstatic/search/pc/css/search1_190711.css"> <link rel="stylesheet" type="text/css" href="h