# Using urllib & json to get info through API

In [1]:
import urllib.request as urlrequest
import json

## To get the single data point

In [2]:
url_visit = 'https://api.douban.com/v2/movie/26387939'
crawl_content = urlrequest.urlopen(url_visit).read()
print(crawl_content.decode('unicode-escape'))

{"rating": {"max": 10, "average": "9.1", "numRaters": 418652, "min": 0}, "author": [{"name": "涅提·蒂瓦里 Nitesh Tiwari"}], "alt_title": "摔跤吧！爸爸 \/ 我和我的冠军女儿(台)", "image": "https://img3.doubanio.com\/view\/photo\/s_ratio_poster\/public\/p2457983084.jpg", "title": "Dangal", "summary": "马哈维亚（阿米尔·汗 Aamir Khan 饰）曾经是一名前途无量的摔跤运动员，在放弃了职业生涯后，他最大的遗憾就是没有能够替国家赢得金牌。马哈维亚将这份希望寄托在了尚未出生的儿子身上，哪知道妻子接连给他生了两个女儿，取名吉塔（法缇玛·萨那·纱卡 Fatima Sana Shaikh 饰）和巴比塔（桑亚·玛荷塔 Sanya Malhotra 饰）。让马哈维亚没有想到的是，两个姑娘展现出了杰出的摔跤天赋，让他幡然醒悟，就算是女孩，也能够昂首挺胸的站在比赛场上，为了国家和她们自己赢得荣誉。
就这样，在马哈维亚的指导下，吉塔和巴比塔开始了艰苦的训练，两人进步神速，很快就因为在比赛中连连获胜而成为了当地的名人。为了获得更多的机会，吉塔进入了国家体育学院学习，在那里，她将面对更大的诱惑和更多的选择。", "attrs": {"language": ["印地语"], "pubdate": ["2016-12-23(印度)", "2017-05-05(中国大陆)"], "title": ["Dangal"], "country": ["印度"], "writer": ["比于什·古普塔 Piyush Gupta", "施热亚·简 Shreyas Jain"], "director": ["涅提·蒂瓦里 Nitesh Tiwari"], "cast": ["阿米尔·汗 Aamir Khan", "法缇玛·萨那·纱卡 Fatima Sana Shaikh", "桑亚·玛荷塔 Sanya Malhotra", "阿帕尔夏克提·库拉那 Aparshakti Khurana", "沙克希·坦沃 Sakshi Tanwar", "泽伊拉·沃西

In [3]:
json_content = json.loads(crawl_content.decode('utf8'))
json_content['rating']['average']

'9.1'

In [4]:
id = '26387939'
rank = json_content['rating']['average']
with open('douban_movie_rank.txt', 'w') as output:
    output.write("{} {}".format(id, rank))

## To get multiple data

In [7]:
id_list = [26387939, 11803087, 20451290]

with open('douban_movie_rating.txt', 'w') as output:
    for id in id_list:
        url = 'https://api.douban.com/v2/movie/{}'.format(id)
        crawl_content = urlrequest.urlopen(url).read()
        json_content = json.loads(crawl_content.decode('utf8'))
        
        rank = json_content['rating']['average']
        output.write("{} {}\n".format(id, rank))

# Using BeautifulSoup to get infro from html

## Example

In [8]:
from bs4 import BeautifulSoup

In [11]:
html = """ <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> """
html

' <html><head><title>The Dormouse\'s story</title></head> <body> <p class="title"><b>The Dormouse\'s story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> '

In [13]:
soup = BeautifulSoup(html, 'html.parser')
soup

 <html><head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p> <p class="story">...</p> </body></html>

In [14]:
print(soup.prettify())

<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ; and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>


In [15]:
soup.title

<title>The Dormouse's story</title>

In [16]:
soup.title.text

"The Dormouse's story"

In [17]:
soup.title.name

'title'

In [18]:
soup.title.string

"The Dormouse's story"

In [19]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [20]:
soup.find_all('a')[0]

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [21]:
soup.find(id='link2')

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

## Get weather data from gorecast.weather.gov

In [22]:
import requests

In [23]:
url = 'http://forecast.weather.gov/MapClick.php?textField1=41.9&textField2=-87.62'

In [58]:
def get_html_text(url):

    try: 
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return 'Error'

In [59]:
c = get_html_text(url)

In [63]:
soup = BeautifulSoup(c, 'html.parser')
soup.find(id='seven-day-forecast-container')

<div id="seven-day-forecast-container"><ul class="list-unstyled" id="seven-day-forecast-list"><li class="forecast-tombstone">
<div class="tombstone-container">
<p class="period-name">Today<br/><br/></p>
<p><img alt="Today: Sunny, with a high near 54. Southwest wind 5 to 10 mph, with gusts as high as 15 mph. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 54. Southwest wind 5 to 10 mph, with gusts as high as 15 mph. "/></p><p class="short-desc">Sunny</p><p class="temp temp-high">High: 54 °F</p></div></li><li class="forecast-tombstone">
<div class="tombstone-container">
<p class="period-name">Tonight<br/><br/></p>
<p><img alt="Tonight: Mostly clear, with a low around 36. West southwest wind around 5 mph becoming calm  after midnight. Winds could gust as high as 10 mph. " class="forecast-icon" src="newimages/medium/nfew.png" title="Tonight: Mostly clear, with a low around 36. West southwest wind around 5 mph becoming calm  after midnight. Winds

In [66]:
print(soup.find(id='seven-day-forecast-container').get_text())



Today
SunnyHigh: 54 °F

Tonight
Mostly ClearLow: 36 °F

Sunday
SunnyHigh: 53 °F

SundayNight
Partly Cloudythen ChanceShowers andBreezyLow: 47 °F

Monday
ChanceShowers andBreezyHigh: 60 °F

MondayNight
Breezy.Showers thenChanceShowersLow: 38 °F

Tuesday
Mostly Sunnyand BreezyHigh: 39 °F

TuesdayNight
Partly Cloudyand Breezythen PartlyCloudyLow: 23 °F

Wednesday
Mostly SunnyHigh: 29 °F


In [70]:
print(soup.find(id='seven-day-forecast-container').prettify())

<div id="seven-day-forecast-container">
 <ul class="list-unstyled" id="seven-day-forecast-list">
  <li class="forecast-tombstone">
   <div class="tombstone-container">
    <p class="period-name">
     Today
     <br/>
     <br/>
    </p>
    <p>
     <img alt="Today: Sunny, with a high near 54. Southwest wind 5 to 10 mph, with gusts as high as 15 mph. " class="forecast-icon" src="newimages/medium/few.png" title="Today: Sunny, with a high near 54. Southwest wind 5 to 10 mph, with gusts as high as 15 mph. "/>
    </p>
    <p class="short-desc">
     Sunny
    </p>
    <p class="temp temp-high">
     High: 54 °F
    </p>
   </div>
  </li>
  <li class="forecast-tombstone">
   <div class="tombstone-container">
    <p class="period-name">
     Tonight
     <br/>
     <br/>
    </p>
    <p>
     <img alt="Tonight: Mostly clear, with a low around 36. West southwest wind around 5 mph becoming calm  after midnight. Winds could gust as high as 10 mph. " class="forecast-icon" src="newimages/medium

In [67]:
forecast = soup.find(id='seven-day-forecast-container').get_text()
forecast.split('\n')

['',
 '',
 'Today',
 'SunnyHigh: 54 °F',
 '',
 'Tonight',
 'Mostly ClearLow: 36 °F',
 '',
 'Sunday',
 'SunnyHigh: 53 °F',
 '',
 'SundayNight',
 'Partly Cloudythen ChanceShowers andBreezyLow: 47 °F',
 '',
 'Monday',
 'ChanceShowers andBreezyHigh: 60 °F',
 '',
 'MondayNight',
 'Breezy.Showers thenChanceShowersLow: 38 °F',
 '',
 'Tuesday',
 'Mostly Sunnyand BreezyHigh: 39 °F',
 '',
 'TuesdayNight',
 'Partly Cloudyand Breezythen PartlyCloudyLow: 23 °F',
 '',
 'Wednesday',
 'Mostly SunnyHigh: 29 °F']

In [78]:
forecast = soup.find(id='seven-day-forecast-container')
forecast.find_all(class_='period-name')

[<p class="period-name">Today<br/><br/></p>,
 <p class="period-name">Tonight<br/><br/></p>,
 <p class="period-name">Sunday<br/><br/></p>,
 <p class="period-name">Sunday<br/>Night</p>,
 <p class="period-name">Monday<br/><br/></p>,
 <p class="period-name">Monday<br/>Night</p>,
 <p class="period-name">Tuesday<br/><br/></p>,
 <p class="period-name">Tuesday<br/>Night</p>,
 <p class="period-name">Wednesday<br/><br/></p>]

In [81]:
forecast.find_all(class_='short-desc')

[<p class="short-desc">Sunny</p>,
 <p class="short-desc">Mostly Clear</p>,
 <p class="short-desc">Sunny</p>,
 <p class="short-desc">Partly Cloudy<br/>then Chance<br/>Showers and<br/>Breezy</p>,
 <p class="short-desc">Chance<br/>Showers and<br/>Breezy</p>,
 <p class="short-desc">Breezy.<br/>Showers then<br/>Chance<br/>Showers</p>,
 <p class="short-desc">Mostly Sunny<br/>and Breezy</p>,
 <p class="short-desc">Partly Cloudy<br/>and Breezy<br/>then Partly<br/>Cloudy</p>,
 <p class="short-desc">Mostly Sunny</p>]

In [82]:
forecast.find_all(class_='temp')

[<p class="temp temp-high">High: 54 °F</p>,
 <p class="temp temp-low">Low: 36 °F</p>,
 <p class="temp temp-high">High: 53 °F</p>,
 <p class="temp temp-low">Low: 47 °F</p>,
 <p class="temp temp-high">High: 60 °F</p>,
 <p class="temp temp-low">Low: 38 °F</p>,
 <p class="temp temp-high">High: 39 °F</p>,
 <p class="temp temp-low">Low: 23 °F</p>,
 <p class="temp temp-high">High: 29 °F</p>]

In [84]:
forecast = soup.find(id='seven-day-forecast-container')
date_list = forecast.find_all(class_='period-name')
desc_list = forecast.find_all(class_='short-desc')
temp_list = forecast.find_all(class_='temp')

In [104]:
date_list[0].text

'Today'

In [108]:
for i in range(len(date_list)):
    date = date_list[i].get_text()
    desc = desc_list[i].get_text()
    temp = temp_list[i].get_text()
    print("{} {} {}".format(date, desc, temp))

Today Sunny High: 54 °F
Tonight Mostly Clear Low: 36 °F
Sunday Sunny High: 53 °F
SundayNight Partly Cloudythen ChanceShowers andBreezy Low: 47 °F
Monday ChanceShowers andBreezy High: 60 °F
MondayNight Breezy.Showers thenChanceShowers Low: 38 °F
Tuesday Mostly Sunnyand Breezy High: 39 °F
TuesdayNight Partly Cloudyand Breezythen PartlyCloudy Low: 23 °F
Wednesday Mostly Sunny High: 29 °F
