# 5分钟使用Python爬取天气数据

爬取网站的步骤：
1. 设定爬取目标
   * 目标网站：2345天气预报网 http://tianqi.2345.com
   * 目标数据：北京2019年全年历史天气数据
2. 分析目标网站
   * 待爬取页面：http://tianqi.2345.com/wea_history/54511.htm
   * 待爬取数据：数据在js里面，http://tianqi.2345.com/t/wea_history/js/201906/54511_201906.js
3. 批量下载js文件
   * 使用requests库实现下载，官网：https://2.python-requests.org//zh_CN/latest/user/quickstart.html
4. 实现返回的javascript解析，得到目标数据
   * 对于javascript的json如何解析？
5. 将结果数据存储
   * 将数据结果存储成csv格式，方便后续数据分析

## 1、构造待爬取的月份列表

In [1]:
# Build the "YYYYMM" labels for every month of 2019 (January through December).
months = [
    "%d%02d" % (year, month_number)
    for year in (2019,)
    for month_number in range(1, 13)
]

In [2]:
months

['201901',
 '201902',
 '201903',
 '201904',
 '201905',
 '201906',
 '201907',
 '201908',
 '201909',
 '201910',
 '201911',
 '201912']

## 2、构造待爬取的JS的URL列表

In [3]:
# Build one JS data-file URL per entry in `months`; the month label appears
# both as the directory name and in the file name.
todo_urls = []
for month in months:
    todo_urls.append(
        f"http://tianqi.2345.com/t/wea_history/js/{month}/54511_{month}.js"
    )

In [4]:
todo_urls

['http://tianqi.2345.com/t/wea_history/js/201901/54511_201901.js',
 'http://tianqi.2345.com/t/wea_history/js/201902/54511_201902.js',
 'http://tianqi.2345.com/t/wea_history/js/201903/54511_201903.js',
 'http://tianqi.2345.com/t/wea_history/js/201904/54511_201904.js',
 'http://tianqi.2345.com/t/wea_history/js/201905/54511_201905.js',
 'http://tianqi.2345.com/t/wea_history/js/201906/54511_201906.js',
 'http://tianqi.2345.com/t/wea_history/js/201907/54511_201907.js',
 'http://tianqi.2345.com/t/wea_history/js/201908/54511_201908.js',
 'http://tianqi.2345.com/t/wea_history/js/201909/54511_201909.js',
 'http://tianqi.2345.com/t/wea_history/js/201910/54511_201910.js',
 'http://tianqi.2345.com/t/wea_history/js/201911/54511_201911.js',
 'http://tianqi.2345.com/t/wea_history/js/201912/54511_201912.js']

## 3、批量下载数据

In [5]:
import requests

# Each downloaded file is a JS statement of the form `var weather_str={...};`.
JS_PREFIX = "var weather_str="

datas = []
for url in todo_urls:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=10)
    if r.status_code != 200:
        # Name the failing URL and status so the error is actionable.
        raise RuntimeError(f"Failed to download {url}: HTTP {r.status_code}")
    # Remove the JS assignment wrapper so only the object literal remains.
    # str.lstrip() treats its argument as a *set of characters*, not a prefix,
    # so it could also eat leading payload characters; slice the exact prefix off.
    text = r.text.strip()
    if text.startswith(JS_PREFIX):
        text = text[len(JS_PREFIX):]
    # rstrip(";") is safe here: it only removes trailing semicolons.
    data = text.rstrip(";").strip()
    datas.append(data)

In [6]:
datas[0]

"{city:'北京',tqInfo:[{ymd:'2019-01-01',bWendu:'1℃',yWendu:'-10℃',tianqi:'晴~多云',fengxiang:'西北风',fengli:'1级',aqi:'56',aqiInfo:'良',aqiLevel:'2'},{ymd:'2019-01-02',bWendu:'1℃',yWendu:'-9℃',tianqi:'多云',fengxiang:'东北风',fengli:'1级',aqi:'60',aqiInfo:'良',aqiLevel:'2'},{ymd:'2019-01-03',bWendu:'2℃',yWendu:'-7℃',tianqi:'霾',fengxiang:'东北风',fengli:'1级',aqi:'165',aqiInfo:'中度污染',aqiLevel:'4'},{ymd:'2019-01-04',bWendu:'2℃',yWendu:'-7℃',tianqi:'晴',fengxiang:'西北风',fengli:'2级',aqi:'50',aqiInfo:'优',aqiLevel:'1'},{ymd:'2019-01-05',bWendu:'0℃',yWendu:'-8℃',tianqi:'多云',fengxiang:'东北风',fengli:'2级',aqi:'29',aqiInfo:'优',aqiLevel:'1'},{ymd:'2019-01-06',bWendu:'3℃',yWendu:'-7℃',tianqi:'多云',fengxiang:'东南风',fengli:'1级',aqi:'84',aqiInfo:'良',aqiLevel:'2'},{ymd:'2019-01-07',bWendu:'2℃',yWendu:'-7℃',tianqi:'多云',fengxiang:'西北风',fengli:'2级',aqi:'61',aqiInfo:'良',aqiLevel:'2'},{ymd:'2019-01-08',bWendu:'1℃',yWendu:'-10℃',tianqi:'晴',fengxiang:'西北风',fengli:'2级',aqi:'28',aqiInfo:'优',aqiLevel:'1'},{ymd:'2019-01-09',bWendu:'3℃',y

## 4、解析JavaScript返回的数据

In [7]:
# Note: the payload is a JavaScript object literal (unquoted keys,
# single-quoted strings), not strict JSON, so the standard-library json
# module cannot parse it. The failure is the point of this cell, so catch the
# expected error and print it; the notebook then still survives "Run All".
import json

try:
    json.loads(datas[0])
except json.JSONDecodeError as err:
    print(f"{type(err).__name__}: {err}")

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

***介绍模块：demjson***   
地址：https://pypi.org/project/demjson/2.2.4/  
理由：  
    It is especially useful for   
    error checking   
    or ***for parsing JavaScript data***   
    which may not strictly be valid JSON data.

In [8]:
!pip install demjson

Collecting demjson
  Using cached demjson-2.2.4.tar.gz (131 kB)
Building wheels for collected packages: demjson
  Building wheel for demjson (setup.py): started
  Building wheel for demjson (setup.py): finished with status 'done'
  Created wheel for demjson: filename=demjson-2.2.4-py3-none-any.whl size=73548 sha256=4f33e9829ebe8ea099891397891f7389028e2131356dc47533472f6ced6d6d90
  Stored in directory: c:\users\administrator\appdata\local\pip\cache\wheels\41\94\3d\466801f4a8db8e6fce765d7a0115dfebcc55ddf6b00cd98f59
Successfully built demjson
Installing collected packages: demjson
Successfully installed demjson-2.2.4


In [9]:
import demjson

In [10]:
demjson.decode(datas[0])

{'city': '北京',
 'tqInfo': [{'ymd': '2019-01-01',
   'bWendu': '1℃',
   'yWendu': '-10℃',
   'tianqi': '晴~多云',
   'fengxiang': '西北风',
   'fengli': '1级',
   'aqi': '56',
   'aqiInfo': '良',
   'aqiLevel': '2'},
  {'ymd': '2019-01-02',
   'bWendu': '1℃',
   'yWendu': '-9℃',
   'tianqi': '多云',
   'fengxiang': '东北风',
   'fengli': '1级',
   'aqi': '60',
   'aqiInfo': '良',
   'aqiLevel': '2'},
  {'ymd': '2019-01-03',
   'bWendu': '2℃',
   'yWendu': '-7℃',
   'tianqi': '霾',
   'fengxiang': '东北风',
   'fengli': '1级',
   'aqi': '165',
   'aqiInfo': '中度污染',
   'aqiLevel': '4'},
  {'ymd': '2019-01-04',
   'bWendu': '2℃',
   'yWendu': '-7℃',
   'tianqi': '晴',
   'fengxiang': '西北风',
   'fengli': '2级',
   'aqi': '50',
   'aqiInfo': '优',
   'aqiLevel': '1'},
  {'ymd': '2019-01-05',
   'bWendu': '0℃',
   'yWendu': '-8℃',
   'tianqi': '多云',
   'fengxiang': '东北风',
   'fengli': '2级',
   'aqi': '29',
   'aqiInfo': '优',
   'aqiLevel': '1'},
  {'ymd': '2019-01-06',
   'bWendu': '3℃',
   'yWendu': '-7℃',
   'tia

In [11]:
# Decode the first downloaded payload and keep only its "tqInfo" list,
# which holds one dict per day.
tqInfos = demjson.decode(datas[0])["tqInfo"]

In [12]:
tqInfos

[{'ymd': '2019-01-01',
  'bWendu': '1℃',
  'yWendu': '-10℃',
  'tianqi': '晴~多云',
  'fengxiang': '西北风',
  'fengli': '1级',
  'aqi': '56',
  'aqiInfo': '良',
  'aqiLevel': '2'},
 {'ymd': '2019-01-02',
  'bWendu': '1℃',
  'yWendu': '-9℃',
  'tianqi': '多云',
  'fengxiang': '东北风',
  'fengli': '1级',
  'aqi': '60',
  'aqiInfo': '良',
  'aqiLevel': '2'},
 {'ymd': '2019-01-03',
  'bWendu': '2℃',
  'yWendu': '-7℃',
  'tianqi': '霾',
  'fengxiang': '东北风',
  'fengli': '1级',
  'aqi': '165',
  'aqiInfo': '中度污染',
  'aqiLevel': '4'},
 {'ymd': '2019-01-04',
  'bWendu': '2℃',
  'yWendu': '-7℃',
  'tianqi': '晴',
  'fengxiang': '西北风',
  'fengli': '2级',
  'aqi': '50',
  'aqiInfo': '优',
  'aqiLevel': '1'},
 {'ymd': '2019-01-05',
  'bWendu': '0℃',
  'yWendu': '-8℃',
  'tianqi': '多云',
  'fengxiang': '东北风',
  'fengli': '2级',
  'aqi': '29',
  'aqiInfo': '优',
  'aqiLevel': '1'},
 {'ymd': '2019-01-06',
  'bWendu': '3℃',
  'yWendu': '-7℃',
  'tianqi': '多云',
  'fengxiang': '东南风',
  'fengli': '1级',
  'aqi': '84',
  'aqiI

In [13]:
# Parse every downloaded month and collect the daily records into one list.
all_datas = []

for data in datas:
    tqInfos = demjson.decode(data)["tqInfo"]
    # Skip empty placeholder entries; keep only records that have fields.
    all_datas += [record for record in tqInfos if len(record) != 0]

In [14]:
len(all_datas)

365

## 5、将结果写出到csv文件

In [15]:
all_datas[0]

{'ymd': '2019-01-01',
 'bWendu': '1℃',
 'yWendu': '-10℃',
 'tianqi': '晴~多云',
 'fengxiang': '西北风',
 'fengli': '1级',
 'aqi': '56',
 'aqiInfo': '良',
 'aqiLevel': '2'}

In [16]:
all_datas[0].keys()

dict_keys(['ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli', 'aqi', 'aqiInfo', 'aqiLevel'])

In [17]:
import csv

# Column order of the output file; these are the keys of each daily record.
columns = ['ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli', 'aqi', 'aqiInfo', 'aqiLevel']

# newline='' leaves line-ending handling to the csv module.
with open('./beijing_tianqi_2019.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Header row first, then one row per day in the fixed column order.
    writer.writerow(columns)
    writer.writerows([data[column] for column in columns] for data in all_datas)