# 5分钟使用Python爬取天气数据

爬取网站的步骤：
1. 设定爬取目标
   * 目标网站：2345天气预报网 http://tianqi.2345.com
   * 目标数据：北京2018年全年天气预报数据
2. 分析目标网站
   * 待爬取页面：http://tianqi.2345.com/wea_history/54511.htm
   * 待爬取数据：数据在js里面，http://tianqi.2345.com/t/wea_history/js/201906/54511_201906.js
3. 批量下载js文件
   * 使用requests库实现下载，官网：https://2.python-requests.org/zh_CN/latest/user/quickstart.html
4. 实现返回的javascript解析，得到目标数据
   * 对于javascript的json如何解析？
5. 将结果数据存储
   * 将数据结果存储成csv格式，方便后续数据分析

## 1、构造待爬取的月份列表

In [1]:
# Build the twelve "YYYYMM" month keys covering all of 2018
year = 2018
months = [f"{year}{month:02d}" for month in range(1, 13)]

In [2]:
# Display the generated month keys to check their format
months

['201801',
 '201802',
 '201803',
 '201804',
 '201805',
 '201806',
 '201807',
 '201808',
 '201809',
 '201810',
 '201811',
 '201812']

## 2、构造待爬取的JS的URL列表

In [3]:
# Build one JS data-file URL per month key; the month appears twice in the path
todo_urls = []
for month in months:
    todo_urls.append(
        f"http://tianqi.2345.com/t/wea_history/js/{month}/54511_{month}.js"
    )

In [4]:
# Display the URL list that will be downloaded
todo_urls

['http://tianqi.2345.com/t/wea_history/js/201801/54511_201801.js',
 'http://tianqi.2345.com/t/wea_history/js/201802/54511_201802.js',
 'http://tianqi.2345.com/t/wea_history/js/201803/54511_201803.js',
 'http://tianqi.2345.com/t/wea_history/js/201804/54511_201804.js',
 'http://tianqi.2345.com/t/wea_history/js/201805/54511_201805.js',
 'http://tianqi.2345.com/t/wea_history/js/201806/54511_201806.js',
 'http://tianqi.2345.com/t/wea_history/js/201807/54511_201807.js',
 'http://tianqi.2345.com/t/wea_history/js/201808/54511_201808.js',
 'http://tianqi.2345.com/t/wea_history/js/201809/54511_201809.js',
 'http://tianqi.2345.com/t/wea_history/js/201810/54511_201810.js',
 'http://tianqi.2345.com/t/wea_history/js/201811/54511_201811.js',
 'http://tianqi.2345.com/t/wea_history/js/201812/54511_201812.js']

## 3、批量下载数据

In [5]:
import re

import requests

# Matches the JavaScript assignment wrapper ("var weather_str=") that precedes
# the object literal, tolerating optional whitespace around the tokens.
JS_PREFIX_RE = re.compile(r"^\s*var\s+weather_str\s*=\s*")

# Download every monthly JS file and keep only the object-literal text.
datas = []
for url in todo_urls:
    # A timeout keeps one stalled request from hanging the whole notebook.
    r = requests.get(url, timeout=10)
    if r.status_code != 200:
        # Include the URL and status so a failed download can be diagnosed.
        raise Exception(f"Failed to download {url}: HTTP {r.status_code}")
    # Remove the leading assignment and the trailing ";" to leave a JS-style
    # object literal. A regex is used instead of str.lstrip(), because lstrip()
    # treats its argument as a set of characters rather than a literal prefix
    # and could also eat the start of the payload.
    data = JS_PREFIX_RE.sub("", r.text).rstrip().rstrip(";")
    datas.append(data)

In [6]:
# Inspect the raw text downloaded for the first month
datas[0]

"{city:'北京',tqInfo:[{ymd:'2018-01-01',bWendu:'3℃',yWendu:'-6℃',tianqi:'晴~多云',fengxiang:'东北风',fengli:'1-2级',aqi:'59',aqiInfo:'良',aqiLevel:'2'},{ymd:'2018-01-02',bWendu:'2℃',yWendu:'-5℃',tianqi:'阴~多云',fengxiang:'东北风',fengli:'1-2级',aqi:'49',aqiInfo:'优',aqiLevel:'1'},{ymd:'2018-01-03',bWendu:'2℃',yWendu:'-5℃',tianqi:'多云',fengxiang:'北风',fengli:'1-2级',aqi:'28',aqiInfo:'优',aqiLevel:'1'},{ymd:'2018-01-04',bWendu:'0℃',yWendu:'-8℃',tianqi:'阴',fengxiang:'东北风',fengli:'1-2级',aqi:'28',aqiInfo:'优',aqiLevel:'1'},{ymd:'2018-01-05',bWendu:'3℃',yWendu:'-6℃',tianqi:'多云~晴',fengxiang:'西北风',fengli:'1-2级',aqi:'50',aqiInfo:'优',aqiLevel:'1'},{ymd:'2018-01-06',bWendu:'2℃',yWendu:'-5℃',tianqi:'多云~阴',fengxiang:'西南风',fengli:'1-2级',aqi:'32',aqiInfo:'优',aqiLevel:'1'},{ymd:'2018-01-07',bWendu:'2℃',yWendu:'-4℃',tianqi:'阴~多云',fengxiang:'西南风',fengli:'1-2级',aqi:'59',aqiInfo:'良',aqiLevel:'2'},{ymd:'2018-01-08',bWendu:'2℃',yWendu:'-6℃',tianqi:'晴',fengxiang:'西北风',fengli:'4-5级',aqi:'50',aqiInfo:'优',aqiLevel:'1'},{ymd:'2018-01

## 4、解析JavaScript返回的数据

In [7]:
# Note: the payload is a JavaScript object literal (unquoted keys,
# single-quoted strings), so the standard-library json module cannot parse it.
import json

try:
    json.loads(datas[0])
except json.JSONDecodeError as err:
    # The failure is the point of this cell; print it instead of letting the
    # exception stop a "Restart & Run All".
    print(f"{type(err).__name__}: {err}")

JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

***介绍模块：demjson***   
地址：https://pypi.org/project/demjson/2.2.4/  
理由：  
    It is especially useful for   
    error checking   
    or ***for parsing JavaScript data***   
    which may not strictly be valid JSON data.

In [8]:
import demjson

In [9]:
# Decode the first month's JavaScript-style object literal with demjson
demjson.decode(datas[0])

{'city': '北京',
 'tqInfo': [{'ymd': '2018-01-01',
   'bWendu': '3℃',
   'yWendu': '-6℃',
   'tianqi': '晴~多云',
   'fengxiang': '东北风',
   'fengli': '1-2级',
   'aqi': '59',
   'aqiInfo': '良',
   'aqiLevel': '2'},
  {'ymd': '2018-01-02',
   'bWendu': '2℃',
   'yWendu': '-5℃',
   'tianqi': '阴~多云',
   'fengxiang': '东北风',
   'fengli': '1-2级',
   'aqi': '49',
   'aqiInfo': '优',
   'aqiLevel': '1'},
  {'ymd': '2018-01-03',
   'bWendu': '2℃',
   'yWendu': '-5℃',
   'tianqi': '多云',
   'fengxiang': '北风',
   'fengli': '1-2级',
   'aqi': '28',
   'aqiInfo': '优',
   'aqiLevel': '1'},
  {'ymd': '2018-01-04',
   'bWendu': '0℃',
   'yWendu': '-8℃',
   'tianqi': '阴',
   'fengxiang': '东北风',
   'fengli': '1-2级',
   'aqi': '28',
   'aqiInfo': '优',
   'aqiLevel': '1'},
  {'ymd': '2018-01-05',
   'bWendu': '3℃',
   'yWendu': '-6℃',
   'tianqi': '多云~晴',
   'fengxiang': '西北风',
   'fengli': '1-2级',
   'aqi': '50',
   'aqiInfo': '优',
   'aqiLevel': '1'},
  {'ymd': '2018-01-06',
   'bWendu': '2℃',
   'yWendu': '-5℃'

In [10]:
# Pull the list of per-day records (the "tqInfo" key) out of the first month
tqInfos = demjson.decode(datas[0])["tqInfo"]

In [11]:
# Display the per-day records extracted for the first month
tqInfos

[{'ymd': '2018-01-01',
  'bWendu': '3℃',
  'yWendu': '-6℃',
  'tianqi': '晴~多云',
  'fengxiang': '东北风',
  'fengli': '1-2级',
  'aqi': '59',
  'aqiInfo': '良',
  'aqiLevel': '2'},
 {'ymd': '2018-01-02',
  'bWendu': '2℃',
  'yWendu': '-5℃',
  'tianqi': '阴~多云',
  'fengxiang': '东北风',
  'fengli': '1-2级',
  'aqi': '49',
  'aqiInfo': '优',
  'aqiLevel': '1'},
 {'ymd': '2018-01-03',
  'bWendu': '2℃',
  'yWendu': '-5℃',
  'tianqi': '多云',
  'fengxiang': '北风',
  'fengli': '1-2级',
  'aqi': '28',
  'aqiInfo': '优',
  'aqiLevel': '1'},
 {'ymd': '2018-01-04',
  'bWendu': '0℃',
  'yWendu': '-8℃',
  'tianqi': '阴',
  'fengxiang': '东北风',
  'fengli': '1-2级',
  'aqi': '28',
  'aqiInfo': '优',
  'aqiLevel': '1'},
 {'ymd': '2018-01-05',
  'bWendu': '3℃',
  'yWendu': '-6℃',
  'tianqi': '多云~晴',
  'fengxiang': '西北风',
  'fengli': '1-2级',
  'aqi': '50',
  'aqiInfo': '优',
  'aqiLevel': '1'},
 {'ymd': '2018-01-06',
  'bWendu': '2℃',
  'yWendu': '-5℃',
  'tianqi': '多云~阴',
  'fengxiang': '西南风',
  'fengli': '1-2级',
  'aqi': 

In [12]:
# Decode every month's payload and gather all daily records into one flat list
all_datas = []

for month_js in datas:
    daily_records = demjson.decode(month_js)["tqInfo"]
    # Keep only the non-empty records
    all_datas += [record for record in daily_records if len(record) > 0]

In [13]:
# Sanity check: count the daily records collected across all months
len(all_datas)

365

## 5、将结果写出到csv文件

In [14]:
# Look at one record to see which fields are available
all_datas[0]

{'ymd': '2018-01-01',
 'bWendu': '3℃',
 'yWendu': '-6℃',
 'tianqi': '晴~多云',
 'fengxiang': '东北风',
 'fengli': '1-2级',
 'aqi': '59',
 'aqiInfo': '良',
 'aqiLevel': '2'}

In [15]:
# List the field names of a record; these are used as the CSV column names
all_datas[0].keys()

dict_keys(['ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli', 'aqi', 'aqiInfo', 'aqiLevel'])

In [16]:
import csv

# Column order of the CSV file; each name is a key of the daily record dicts
columns = ['ymd', 'bWendu', 'yWendu', 'tianqi', 'fengxiang', 'fengli', 'aqi', 'aqiInfo', 'aqiLevel']

with open('./beijing_tianqi_2018.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    # Header row first, then one row per daily record in the same column order
    writer.writerow(columns)
    writer.writerows(
        [record[column] for column in columns] for record in all_datas
    )