# Using urllib & json to get info through an API

In [1]:
import urllib.request as urlrequest
import json

## Getting a single data point

In [2]:
# Fetch the raw API response for a single movie and show it.
url_visit = 'https://api.douban.com/v2/movie/26387939'
# Open the response as a context manager so the underlying connection is
# closed as soon as the body has been read, even if read() raises.
with urlrequest.urlopen(url_visit) as response:
    crawl_content = response.read()  # raw bytes; decoded separately below
# 'unicode-escape' is used for display only: judging by the output, non-ASCII
# text arrives as \uXXXX escapes and this expands them into readable characters.
print(crawl_content.decode('unicode-escape'))

{"rating": {"max": 10, "average": "9.1", "numRaters": 418652, "min": 0}, "author": [{"name": "涅提·蒂瓦里 Nitesh Tiwari"}], "alt_title": "摔跤吧！爸爸 \/ 我和我的冠军女儿(台)", "image": "https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2457983084.jpg", "title": "Dangal", "summary": "马哈维亚（阿米尔·汗 Aamir Khan 饰）曾经是一名前途无量的摔跤运动员，在放弃了职业生涯后，他最大的遗憾就是没有能够替国家赢得金牌。马哈维亚将这份希望寄托在了尚未出生的儿子身上，哪知道妻子接连给他生了两个女儿，取名吉塔（法缇玛·萨那·纱卡 Fatima Sana Shaikh 饰）和巴比塔（桑亚·玛荷塔 Sanya Malhotra 饰）。让马哈维亚没有想到的是，两个姑娘展现出了杰出的摔跤天赋，让他幡然醒悟，就算是女孩，也能够昂首挺胸的站在比赛场上，为了国家和她们自己赢得荣誉。
就这样，在马哈维亚的指导下，吉塔和巴比塔开始了艰苦的训练，两人进步神速，很快就因为在比赛中连连获胜而成为了当地的名人。为了获得更多的机会，吉塔进入了国家体育学院学习，在那里，她将面对更大的诱惑和更多的选择。", "attrs": {"language": ["印地语"], "pubdate": ["2016-12-23(印度)", "2017-05-05(中国大陆)"], "title": ["Dangal"], "country": ["印度"], "writer": ["比于什·古普塔 Piyush Gupta", "施热亚·简 Shreyas Jain"], "director": ["涅提·蒂瓦里 Nitesh Tiwari"], "cast": ["阿米尔·汗 Aamir Khan", "法缇玛·萨那·纱卡 Fatima Sana Shaikh", "桑亚·玛荷塔 Sanya Malhotra", "阿帕尔夏克提·库拉那 Aparshakti Khurana", "沙克希·坦沃 Sakshi Tanwar", "泽伊拉·沃西

In [3]:
# Decode the raw response bytes as UTF-8 and parse the JSON text.
json_content = json.loads(str(crawl_content, encoding='utf8'))
# Last expression of the cell: the movie's average rating.
json_content['rating']['average']

'9.1'

In [4]:
# Save "<movie id> <average rating>" for the movie fetched above.
# Named movie_id rather than id so the builtin id() is not shadowed for the
# rest of the kernel session.
movie_id = '26387939'
rank = json_content['rating']['average']
# Mode 'w' truncates any existing file, so re-running the cell is idempotent.
with open('douban_movie_rank.txt', 'w') as output:
    output.write("{} {}".format(movie_id, rank))

## Getting multiple data points

In [7]:
# Movie ids to look up.
id_list = [26387939, 11803087, 20451290]

# Write one "<movie id> <average rating>" line per movie.
with open('douban_movie_rating.txt', 'w') as output:
    # Loop variable is movie_id rather than id so the builtin id() stays usable.
    for movie_id in id_list:
        url = 'https://api.douban.com/v2/movie/{}'.format(movie_id)
        # Close each HTTP response as soon as its body is read instead of
        # leaking one open connection per iteration.
        with urlrequest.urlopen(url) as response:
            crawl_content = response.read()
        json_content = json.loads(crawl_content.decode('utf8'))

        rank = json_content['rating']['average']
        output.write("{} {}\n".format(movie_id, rank))

# Using BeautifulSoup to get info from HTML

In [8]:
from bs4 import BeautifulSoup

In [11]:
# Sample document used by the BeautifulSoup cells. Adjacent string literals are
# concatenated at compile time, so this is still one single-line string with a
# leading and a trailing space.
html = (
    " <html><head><title>The Dormouse's story</title></head>"
    ' <body>'
    ' <p class="title">'
    "<b>The Dormouse's story</b></p>"
    ' <p class="story">Once upon a time there were three little sisters;'
    ' and their names were '
    '<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, '
    '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and '
    '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;'
    ' and they lived at the bottom of a well.</p>'
    ' <p class="story">...</p> '
)
html

' <html><head><title>The Dormouse\'s story</title></head> <body> <p class="title"><b>The Dormouse\'s story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> '

In [13]:
# Parse the sample markup with the 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')
# Last expression of the cell: display the parsed document.
soup

 <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> </body></html>

In [14]:
# Print the prettified form of the parsed document; in the output below each
# tag and text node sits on its own indented line.
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ; and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [15]:
# Attribute access by tag name; the output below is the document's <title> element.
soup.title

<title>The Dormouse's story</title>

In [16]:
# .text on the <title> element; the output below is the title's text as a plain string.
soup.title.text

"The Dormouse's story"

In [17]:
# .name is the tag's own name ('title' in the output below), not its content.
soup.title.name

'title'

In [18]:
# .string on the <title> element; for this tag the output matches what .text gave.
soup.title.string

"The Dormouse's story"

In [19]:
# find_all('a') collects every <a> element; the output below lists the three links.
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [20]:
# Index into the find_all() result to pick a single element; [0] is the first <a> (Elsie).
soup.find_all('a')[0]

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [21]:
# Look up an element by attribute value instead of tag name; the output below
# is the <a> whose id is "link2".
soup.find(id='link2')

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>