# 5分钟使用Python爬取天气数据

爬取网站的步骤：
1. 设定爬取目标
   * 目标网站：2345天气预报网 http://tianqi.2345.com
   * 目标数据：北京2017–2019年全年历史天气数据
2. 分析目标网站
   * 待爬取页面：http://tianqi.2345.com/wea_history/54511.htm
   * 待爬取数据：数据在js里面，http://tianqi.2345.com/t/wea_history/js/201906/54511_201906.js
3. 批量下载js文件
   * 使用requests库实现下载，官网：https://2.python-requests.org/zh_CN/latest/user/quickstart.html
4. 实现返回的javascript解析，得到目标数据
   * 对于javascript的json如何解析？
5. 将结果数据存储
   * 将数据结果存储成csv格式，方便后续数据分析

## 1、构造待爬取的月份列表

In [2]:
# Build the YYYYMM strings for every month of 2017, 2018 and 2019.
months = [
    "%d%02d" % (year, month)
    for year in (2017, 2018, 2019)
    for month in range(1, 13)
]

In [3]:
# Display the generated month strings to verify them.
months

['201701',
 '201702',
 '201703',
 '201704',
 '201705',
 '201706',
 '201707',
 '201708',
 '201709',
 '201710',
 '201711',
 '201712',
 '201801',
 '201802',
 '201803',
 '201804',
 '201805',
 '201806',
 '201807',
 '201808',
 '201809',
 '201810',
 '201811',
 '201812',
 '201901',
 '201902',
 '201903',
 '201904',
 '201905',
 '201906',
 '201907',
 '201908',
 '201909',
 '201910',
 '201911',
 '201912']

## 2、构造待爬取的JS的URL列表

In [4]:
# One JS data file per month; the month string appears both in the
# directory name and in the file name of each URL.
todo_urls = []
for month in months:
    todo_urls.append(
        f"http://tianqi.2345.com/t/wea_history/js/{month}/54511_{month}.js"
    )

In [5]:
# Display the URLs that will be downloaded.
todo_urls

['http://tianqi.2345.com/t/wea_history/js/201701/54511_201701.js',
 'http://tianqi.2345.com/t/wea_history/js/201702/54511_201702.js',
 'http://tianqi.2345.com/t/wea_history/js/201703/54511_201703.js',
 'http://tianqi.2345.com/t/wea_history/js/201704/54511_201704.js',
 'http://tianqi.2345.com/t/wea_history/js/201705/54511_201705.js',
 'http://tianqi.2345.com/t/wea_history/js/201706/54511_201706.js',
 'http://tianqi.2345.com/t/wea_history/js/201707/54511_201707.js',
 'http://tianqi.2345.com/t/wea_history/js/201708/54511_201708.js',
 'http://tianqi.2345.com/t/wea_history/js/201709/54511_201709.js',
 'http://tianqi.2345.com/t/wea_history/js/201710/54511_201710.js',
 'http://tianqi.2345.com/t/wea_history/js/201711/54511_201711.js',
 'http://tianqi.2345.com/t/wea_history/js/201712/54511_201712.js',
 'http://tianqi.2345.com/t/wea_history/js/201801/54511_201801.js',
 'http://tianqi.2345.com/t/wea_history/js/201802/54511_201802.js',
 'http://tianqi.2345.com/t/wea_history/js/201803/54511_201803.

## 3、批量下载数据

In [6]:
import requests

# Download every monthly JS file and keep only the object literal that the
# script assigns, i.e. the text between "var weather_str=" and the final ";".
js_prefix = "var weather_str="

datas = []
for url in todo_urls:
    # A timeout keeps one stalled connection from hanging the whole notebook.
    r = requests.get(url, timeout=10)
    if r.status_code != 200:
        raise Exception(f"download failed for {url}: HTTP {r.status_code}")
    data = r.text.strip()
    # str.lstrip(chars) removes any leading characters from a *set*, not a
    # prefix, so it could eat the start of the payload; slice the prefix off.
    if data.startswith(js_prefix):
        data = data[len(js_prefix):]
    data = data.rstrip(";")
    datas.append(data)

In [7]:
# Inspect the first downloaded payload (the text left after stripping the JS wrapper).
datas[0]

"{city:'北京',tqInfo:[{ymd:'2017-01-01',bWendu:'5℃',yWendu:'-3℃',tianqi:'霾~晴',fengxiang:'南风',fengli:'1-2级',aqi:'450',aqiInfo:'严重污染',aqiLevel:'6'},{ymd:'2017-01-02',bWendu:'7℃',yWendu:'-6℃',tianqi:'晴~霾',fengxiang:'南风',fengli:'1-2级',aqi:'246',aqiInfo:'重度污染',aqiLevel:'5'},{ymd:'2017-01-03',bWendu:'5℃',yWendu:'-5℃',tianqi:'霾',fengxiang:'南风',fengli:'1-2级',aqi:'320',aqiInfo:'严重污染',aqiLevel:'6'},{ymd:'2017-01-04',bWendu:'6℃',yWendu:'-5℃',tianqi:'霾',fengxiang:'北风',fengli:'1-2级',aqi:'360',aqiInfo:'严重污染',aqiLevel:'6'},{ymd:'2017-01-05',bWendu:'2℃',yWendu:'-4℃',tianqi:'霾',fengxiang:'北风',fengli:'1-2级',aqi:'280',aqiInfo:'重度污染',aqiLevel:'5'},{ymd:'2017-01-06',bWendu:'4℃',yWendu:'-2℃',tianqi:'霾',fengxiang:'南风',fengli:'1-2级',aqi:'234',aqiInfo:'重度污染',aqiLevel:'5'},{ymd:'2017-01-07',bWendu:'2℃',yWendu:'-3℃',tianqi:'小雪',fengxiang:'无持续风向',fengli:'微风',aqi:'206',aqiInfo:'重度污染',aqiLevel:'5'},{ymd:'2017-01-08',bWendu:'5℃',yWendu:'-4℃',tianqi:'阴~晴',fengxiang:'北风',fengli:'1-2级',aqi:'59',aqiInfo:'良',aqiLevel:'2'},

## 4、解析JavaScript返回的数据

In [8]:
# Note: the payload is a JavaScript object literal (unquoted keys, single-quoted
# strings), not strict JSON, so the standard-library json module rejects it.
import json

# The failure is the point of this cell: catch it and show the message so the
# notebook still runs top to bottom.
try:
    json.loads(datas[0])
except json.JSONDecodeError as err:
    print(f"JSONDecodeError: {err}")

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

***介绍模块：demjson***   
地址：https://pypi.org/project/demjson/2.2.4/  
理由：  
    It is especially useful for   
    error checking   
    or ***for parsing JavaScript data***   
    which may not strictly be valid JSON data.

In [10]:
!pip install demjson

Collecting demjson
[?25l  Downloading https://files.pythonhosted.org/packages/96/67/6db789e2533158963d4af689f961b644ddd9200615b8ce92d6cad695c65a/demjson-2.2.4.tar.gz (131kB)
[K     |████████████████████████████████| 133kB 875kB/s eta 0:00:01
[?25hBuilding wheels for collected packages: demjson
  Building wheel for demjson (setup.py) ... [?25ldone
[?25h  Created wheel for demjson: filename=demjson-2.2.4-cp37-none-any.whl size=73545 sha256=65de7770e49d3e3b8f38437a742798bb34bee5b9978530619cac2cc66ac27394
  Stored in directory: /Users/peishuaishuai/Library/Caches/pip/wheels/c5/d2/ab/a54fb5ea53ac3badba098160e8452fa126a51febda80440ded
Successfully built demjson
Installing collected packages: demjson
Successfully installed demjson-2.2.4


In [11]:
import demjson

In [12]:
# Parse the first month's JavaScript object literal with demjson and display the result.
demjson.decode(datas[0])

{'city': '北京',
 'tqInfo': [{'ymd': '2017-01-01',
   'bWendu': '5℃',
   'yWendu': '-3℃',
   'tianqi': '霾~晴',
   'fengxiang': '南风',
   'fengli': '1-2级',
   'aqi': '450',
   'aqiInfo': '严重污染',
   'aqiLevel': '6'},
  {'ymd': '2017-01-02',
   'bWendu': '7℃',
   'yWendu': '-6℃',
   'tianqi': '晴~霾',
   'fengxiang': '南风',
   'fengli': '1-2级',
   'aqi': '246',
   'aqiInfo': '重度污染',
   'aqiLevel': '5'},
  {'ymd': '2017-01-03',
   'bWendu': '5℃',
   'yWendu': '-5℃',
   'tianqi': '霾',
   'fengxiang': '南风',
   'fengli': '1-2级',
   'aqi': '320',
   'aqiInfo': '严重污染',
   'aqiLevel': '6'},
  {'ymd': '2017-01-04',
   'bWendu': '6℃',
   'yWendu': '-5℃',
   'tianqi': '霾',
   'fengxiang': '北风',
   'fengli': '1-2级',
   'aqi': '360',
   'aqiInfo': '严重污染',
   'aqiLevel': '6'},
  {'ymd': '2017-01-05',
   'bWendu': '2℃',
   'yWendu': '-4℃',
   'tianqi': '霾',
   'fengxiang': '北风',
   'fengli': '1-2级',
   'aqi': '280',
   'aqiInfo': '重度污染',
   'aqiLevel': '5'},
  {'ymd': '2017-01-06',
   'bWendu': '4℃',
   'yWen

In [13]:
# Decode the first month's payload and keep only the value stored under "tqInfo".
tqInfos = demjson.decode(datas[0])["tqInfo"]

In [14]:
# Display the "tqInfo" entries extracted from the first month.
tqInfos

[{'ymd': '2017-01-01',
  'bWendu': '5℃',
  'yWendu': '-3℃',
  'tianqi': '霾~晴',
  'fengxiang': '南风',
  'fengli': '1-2级',
  'aqi': '450',
  'aqiInfo': '严重污染',
  'aqiLevel': '6'},
 {'ymd': '2017-01-02',
  'bWendu': '7℃',
  'yWendu': '-6℃',
  'tianqi': '晴~霾',
  'fengxiang': '南风',
  'fengli': '1-2级',
  'aqi': '246',
  'aqiInfo': '重度污染',
  'aqiLevel': '5'},
 {'ymd': '2017-01-03',
  'bWendu': '5℃',
  'yWendu': '-5℃',
  'tianqi': '霾',
  'fengxiang': '南风',
  'fengli': '1-2级',
  'aqi': '320',
  'aqiInfo': '严重污染',
  'aqiLevel': '6'},
 {'ymd': '2017-01-04',
  'bWendu': '6℃',
  'yWendu': '-5℃',
  'tianqi': '霾',
  'fengxiang': '北风',
  'fengli': '1-2级',
  'aqi': '360',
  'aqiInfo': '严重污染',
  'aqiLevel': '6'},
 {'ymd': '2017-01-05',
  'bWendu': '2℃',
  'yWendu': '-4℃',
  'tianqi': '霾',
  'fengxiang': '北风',
  'fengli': '1-2级',
  'aqi': '280',
  'aqiInfo': '重度污染',
  'aqiLevel': '5'},
 {'ymd': '2017-01-06',
  'bWendu': '4℃',
  'yWendu': '-2℃',
  'tianqi': '霾',
  'fengxiang': '南风',
  'fengli': '1-2级',
  '

In [15]:
# Decode every month's payload and flatten the non-empty "tqInfo" entries
# into a single list, preserving month order.
all_datas = [
    day_info
    for month_payload in datas
    for day_info in demjson.decode(month_payload)["tqInfo"]
    if len(day_info) > 0
]

In [16]:
# Number of entries collected across all months.
len(all_datas)

1095

## 5、将结果写出到csv文件

In [17]:
# Look at one entry to see which fields are available.
all_datas[0]

{'ymd': '2017-01-01',
 'bWendu': '5℃',
 'yWendu': '-3℃',
 'tianqi': '霾~晴',
 'fengxiang': '南风',
 'fengli': '1-2级',
 'aqi': '450',
 'aqiInfo': '严重污染',
 'aqiLevel': '6'}

In [18]:
# Field names of the first entry; the CSV columns written below use the same names.
all_datas[0].keys()

dict_keys(['ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli', 'aqi', 'aqiInfo', 'aqiLevel'])

In [19]:
import csv
with open('./beijing_tianqi_2017-2019.csv', 'w', newline='', encoding='utf-8') as csv_file:
    # Column order of the CSV; the same names are used as keys into each record.
    columns = ['ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli', 'aqi', 'aqiInfo', 'aqiLevel']

    writer = csv.writer(csv_file)
    # Header row first, then one row per record with values in column order.
    writer.writerow(columns)
    writer.writerows(
        [record[column] for column in columns]
        for record in all_datas
    )